diff --git a/thirdparty/aom/xmake.lua b/thirdparty/aom/xmake.lua new file mode 100644 index 0000000..74233d5 --- /dev/null +++ b/thirdparty/aom/xmake.lua @@ -0,0 +1,23 @@ +package("aom") + + set_homepage("https://aomedia.googlesource.com/aom/") + set_description("AV1 Codec Library") + set_license("BSD-3-Clause") + set_urls("https://aomedia.googlesource.com/aom.git") + add_versions("v3.9.0", "6cab58c3925e0f4138e15a4ed510161ea83b6db1") + + add_deps("cmake") + + if is_os("windows") then + add_defines("_CRT_SECURE_NO_WARNINGS") + end + + on_install("windows", "linux", "macosx", function (package) + local configs = {"-DENABLE_EXAMPLES=OFF", "-DENABLE_TESTS=OFF", "-DENABLE_TOOLS=OFF", "-DENABLE_DOCS=OFF"} + table.insert(configs, "-DCMAKE_BUILD_TYPE=" .. (package:debug() and "Debug" or "Release")) + import("package.tools.cmake").install(package, configs) + end) + + on_test(function (package) + assert(package:has_cfuncs("aom_codec_version", {includes = "aom/aom_codec.h"})) + end) \ No newline at end of file diff --git a/thirdparty/libyuv/.clang-format b/thirdparty/libyuv/.clang-format deleted file mode 100644 index 59d4870..0000000 --- a/thirdparty/libyuv/.clang-format +++ /dev/null @@ -1,6 +0,0 @@ -# Defines the Chromium style for automatic reformatting. -# http://clang.llvm.org/docs/ClangFormatStyleOptions.html -BasedOnStyle: Chromium ---- -Language: Java -BasedOnStyle: Google diff --git a/thirdparty/libyuv/.gitignore b/thirdparty/libyuv/.gitignore deleted file mode 100644 index 7095d41..0000000 --- a/thirdparty/libyuv/.gitignore +++ /dev/null @@ -1,36 +0,0 @@ -*.pyc -.landmines -pin-log.txt -/base -/build -/buildtools -/google_apis -/links -/links.db -/ios -/mojo -/native_client -/net -/out -/source/out -/sde-avx-sse-transition-out.txt -/testing -/third_party -/tools - -# Files generated by CMake build -cmake_install.cmake -CMakeCache.txt -CMakeFiles/ -yuvconvert -libgtest.a -libyuv.a -libyuv_unittest - -# Files generated by winarm.mk build -libyuv_arm.lib -source/*.o - -# Files generated by perf -perf.data -perf.data.old diff --git a/thirdparty/libyuv/.gn b/thirdparty/libyuv/.gn deleted file mode 100644 index be8c3b5..0000000 --- a/thirdparty/libyuv/.gn +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2015 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -import("//build/dotfile_settings.gni") - -# The location of the build configuration file. -buildconfig = "//build/config/BUILDCONFIG.gn" - -# The secondary source root is a parallel directory tree where -# GN build files are placed when they can not be placed directly -# in the source tree, e.g. for third party source trees. -secondary_source = "//build/secondary/" - -# These are the targets to check headers for by default. The files in targets -# matching these patterns (see "gn help label_pattern" for format) will have -# their includes checked for proper dependencies when you run either -# "gn check" or "gn gen --check". -check_targets = [ "//libyuv/*" ] - -# These are the list of GN files that run exec_script. This whitelist exists -# to force additional review for new uses of exec_script, which is strongly -# discouraged except for gypi_to_gn calls. 
-exec_script_whitelist = build_dotfile_settings.exec_script_whitelist + - [ "//build_overrides/build.gni" ] - -default_args = { - mac_sdk_min = "10.12" - - # https://bugs.chromium.org/p/libyuv/issues/detail?id=826 - ios_deployment_target = "10.0" -} diff --git a/thirdparty/libyuv/.vpython b/thirdparty/libyuv/.vpython deleted file mode 100644 index e0aaf89..0000000 --- a/thirdparty/libyuv/.vpython +++ /dev/null @@ -1,59 +0,0 @@ -# This is a vpython "spec" file. -# -# It describes patterns for python wheel dependencies of the python scripts in -# the chromium repo, particularly for dependencies that have compiled components -# (since pure-python dependencies can be easily vendored into third_party). -# -# When vpython is invoked, it finds this file and builds a python VirtualEnv, -# containing all of the dependencies described in this file, fetching them from -# CIPD (the "Chrome Infrastructure Package Deployer" service). Unlike `pip`, -# this never requires the end-user machine to have a working python extension -# compilation environment. All of these packages are built using: -# https://chromium.googlesource.com/infra/infra/+/master/infra/tools/dockerbuild/ -# -# All python scripts in the repo share this same spec, to avoid dependency -# fragmentation. -# -# If you have depot_tools installed in your $PATH, you can invoke python scripts -# in this repo by running them as you normally would run them, except -# substituting `vpython` instead of `python` on the command line, e.g.: -# vpython path/to/script.py some --arguments -# -# Read more about `vpython` and how to modify this file here: -# https://chromium.googlesource.com/infra/infra/+/master/doc/users/vpython.md - -python_version: "2.7" - -# Used by: -# third_party/catapult -wheel: < - name: "infra/python/wheels/psutil/${platform}_${py_python}_${py_abi}" - version: "version:5.2.2" -> - -# Used by: -# third_party/catapult -wheel: < - name: "infra/python/wheels/pypiwin32/${vpython_platform}" - version: "version:219" - match_tag: < - platform: "win32" - > - match_tag: < - platform: "win_amd64" - > -> - -# Used by: -# tools/swarming_client -wheel: < - name: "infra/python/wheels/six-py2_py3" - version: "version:1.15.0" -> - -# Used by: -# build/android -wheel: < - name: "infra/python/wheels/requests-py2_py3" - version: "version:2.13.0" -> diff --git a/thirdparty/libyuv/AUTHORS b/thirdparty/libyuv/AUTHORS deleted file mode 100644 index 9686ac1..0000000 --- a/thirdparty/libyuv/AUTHORS +++ /dev/null @@ -1,4 +0,0 @@ -# Names should be added to this file like so: -# Name or Organization - -Google Inc. 
diff --git a/thirdparty/libyuv/Android.bp b/thirdparty/libyuv/Android.bp deleted file mode 100644 index ce1f62e..0000000 --- a/thirdparty/libyuv/Android.bp +++ /dev/null @@ -1,156 +0,0 @@ -cc_library { - name: "libyuv", - vendor_available: true, - vndk: { - enabled: true, - }, - - srcs: [ - "source/compare.cc", - "source/compare_common.cc", - "source/compare_gcc.cc", - "source/compare_mmi.cc", - "source/compare_msa.cc", - "source/compare_neon.cc", - "source/compare_neon64.cc", - "source/convert.cc", - "source/convert_argb.cc", - "source/convert_from.cc", - "source/convert_from_argb.cc", - "source/convert_jpeg.cc", - "source/convert_to_argb.cc", - "source/convert_to_i420.cc", - "source/cpu_id.cc", - "source/mjpeg_decoder.cc", - "source/mjpeg_validate.cc", - "source/planar_functions.cc", - "source/rotate.cc", - "source/rotate_any.cc", - "source/rotate_argb.cc", - "source/rotate_common.cc", - "source/rotate_gcc.cc", - "source/rotate_mmi.cc", - "source/rotate_msa.cc", - "source/rotate_neon.cc", - "source/rotate_neon64.cc", - "source/row_any.cc", - "source/row_common.cc", - "source/row_gcc.cc", - "source/row_mmi.cc", - "source/row_msa.cc", - "source/row_neon.cc", - "source/row_neon64.cc", - "source/scale.cc", - "source/scale_any.cc", - "source/scale_argb.cc", - "source/scale_common.cc", - "source/scale_gcc.cc", - "source/scale_mmi.cc", - "source/scale_msa.cc", - "source/scale_neon.cc", - "source/scale_neon64.cc", - "source/scale_uv.cc", - "source/video_common.cc", - ], - - cflags: [ - "-Wall", - "-Werror", - "-Wno-unused-parameter", - "-fexceptions", - "-DHAVE_JPEG", - ], - - shared_libs: ["libjpeg"], - - export_include_dirs: ["include"], -} - -// compatibilty static library until all uses of libyuv_static are replaced -// with libyuv (b/37646797) -cc_library_static { - name: "libyuv_static", - vendor_available: true, - whole_static_libs: ["libyuv"], -} - -cc_test { - name: "libyuv_unittest", - static_libs: ["libyuv"], - shared_libs: ["libjpeg"], - cflags: ["-Wall", "-Werror"], - srcs: [ - "unit_test/basictypes_test.cc", - "unit_test/color_test.cc", - "unit_test/compare_test.cc", - "unit_test/convert_test.cc", - "unit_test/cpu_test.cc", - "unit_test/cpu_thread_test.cc", - "unit_test/math_test.cc", - "unit_test/planar_test.cc", - "unit_test/rotate_argb_test.cc", - "unit_test/rotate_test.cc", - "unit_test/scale_argb_test.cc", - "unit_test/scale_test.cc", - "unit_test/scale_uv_test.cc", - "unit_test/unit_test.cc", - "unit_test/video_common_test.cc", - ], -} - -cc_test { - name: "compare", - gtest: false, - srcs: [ - "util/compare.cc", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "i444tonv12_eg", - gtest: false, - srcs: [ - "util/i444tonv12_eg.cc", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "cpuid", - gtest: false, - srcs: [ - "util/cpuid.c", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "psnr", - gtest: false, - srcs: [ - "util/psnr_main.cc", - "util/psnr.cc", - "util/ssim.cc", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "yuvconvert", - gtest: false, - srcs: [ - "util/yuvconvert.cc", - ], - static_libs: ["libyuv"], - shared_libs: ["libjpeg"], -} - -cc_test { - name: "yuvconstants", - gtest: false, - srcs: [ - "util/yuvconstants.c", - ], - static_libs: ["libyuv"], -} diff --git a/thirdparty/libyuv/Android.mk b/thirdparty/libyuv/Android.mk deleted file mode 100644 index 2ceb492..0000000 --- a/thirdparty/libyuv/Android.mk +++ /dev/null @@ -1,110 +0,0 @@ -# This is the Android makefile for libyuv for NDK. 
-LOCAL_PATH:= $(call my-dir) - -include $(CLEAR_VARS) - -LOCAL_CPP_EXTENSION := .cc - -LOCAL_SRC_FILES := \ - source/compare.cc \ - source/compare_common.cc \ - source/compare_gcc.cc \ - source/compare_mmi.cc \ - source/compare_msa.cc \ - source/compare_neon.cc \ - source/compare_neon64.cc \ - source/compare_win.cc \ - source/convert.cc \ - source/convert_argb.cc \ - source/convert_from.cc \ - source/convert_from_argb.cc \ - source/convert_to_argb.cc \ - source/convert_to_i420.cc \ - source/cpu_id.cc \ - source/planar_functions.cc \ - source/rotate.cc \ - source/rotate_any.cc \ - source/rotate_argb.cc \ - source/rotate_common.cc \ - source/rotate_gcc.cc \ - source/rotate_mmi.cc \ - source/rotate_msa.cc \ - source/rotate_neon.cc \ - source/rotate_neon64.cc \ - source/rotate_win.cc \ - source/row_any.cc \ - source/row_common.cc \ - source/row_gcc.cc \ - source/row_mmi.cc \ - source/row_msa.cc \ - source/row_neon.cc \ - source/row_neon64.cc \ - source/row_win.cc \ - source/scale.cc \ - source/scale_any.cc \ - source/scale_argb.cc \ - source/scale_common.cc \ - source/scale_gcc.cc \ - source/scale_mmi.cc \ - source/scale_msa.cc \ - source/scale_neon.cc \ - source/scale_neon64.cc \ - source/scale_uv.cc \ - source/scale_win.cc \ - source/video_common.cc - -common_CFLAGS := -Wall -fexceptions -ifneq ($(LIBYUV_DISABLE_JPEG), "yes") -LOCAL_SRC_FILES += \ - source/convert_jpeg.cc \ - source/mjpeg_decoder.cc \ - source/mjpeg_validate.cc -common_CFLAGS += -DHAVE_JPEG -LOCAL_SHARED_LIBRARIES := libjpeg -endif - -LOCAL_CFLAGS += $(common_CFLAGS) -LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include -LOCAL_C_INCLUDES += $(LOCAL_PATH)/include -LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)/include - -LOCAL_MODULE := libyuv_static -LOCAL_MODULE_TAGS := optional - -include $(BUILD_STATIC_LIBRARY) - -include $(CLEAR_VARS) - -LOCAL_WHOLE_STATIC_LIBRARIES := libyuv_static -LOCAL_MODULE := libyuv -ifneq ($(LIBYUV_DISABLE_JPEG), "yes") -LOCAL_SHARED_LIBRARIES := libjpeg -endif - -include $(BUILD_SHARED_LIBRARY) - -include $(CLEAR_VARS) -LOCAL_STATIC_LIBRARIES := libyuv_static -LOCAL_SHARED_LIBRARIES := libjpeg -LOCAL_MODULE_TAGS := tests -LOCAL_CPP_EXTENSION := .cc -LOCAL_C_INCLUDES += $(LOCAL_PATH)/include -LOCAL_SRC_FILES := \ - unit_test/basictypes_test.cc \ - unit_test/color_test.cc \ - unit_test/compare_test.cc \ - unit_test/convert_test.cc \ - unit_test/cpu_test.cc \ - unit_test/cpu_thread_test.cc \ - unit_test/math_test.cc \ - unit_test/planar_test.cc \ - unit_test/rotate_argb_test.cc \ - unit_test/rotate_test.cc \ - unit_test/scale_argb_test.cc \ - unit_test/scale_test.cc \ - unit_test/scale_uv_test.cc \ - unit_test/unit_test.cc \ - unit_test/video_common_test.cc - -LOCAL_MODULE := libyuv_unittest -include $(BUILD_NATIVE_TEST) diff --git a/thirdparty/libyuv/BUILD.gn b/thirdparty/libyuv/BUILD.gn deleted file mode 100644 index e1c7c1d..0000000 --- a/thirdparty/libyuv/BUILD.gn +++ /dev/null @@ -1,404 +0,0 @@ -# Copyright 2014 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -import("//testing/test.gni") -import("libyuv.gni") - -declare_args() { - # Set to false to disable building with absl flags. 
- libyuv_use_absl_flags = true - - # When building a shared library using a target in WebRTC or - # Chromium projects that depends on libyuv, setting this flag - # to true makes libyuv symbols visible inside that library. - libyuv_symbols_visible = false -} - -config("libyuv_config") { - include_dirs = [ "include" ] - if (is_android && current_cpu == "arm64") { - ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ] - } - if (is_android && current_cpu != "arm64") { - ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ] - } -} - -# This target is built when no specific target is specified on the command line. -group("default") { - testonly = true - deps = [ ":libyuv" ] - if (libyuv_include_tests) { - deps += [ - ":compare", - ":cpuid", - ":i444tonv12_eg", - ":libyuv_unittest", - ":psnr", - ":yuvconstants", - ":yuvconvert", - ] - } -} - -group("libyuv") { - all_dependent_configs = [ ":libyuv_config" ] - deps = [] - - if (is_win && target_cpu == "x64") { - # Compile with clang in order to get inline assembly - public_deps = [ ":libyuv_internal(//build/toolchain/win:win_clang_x64)" ] - } else { - public_deps = [ ":libyuv_internal" ] - } - - if (libyuv_use_neon) { - deps += [ ":libyuv_neon" ] - } - - if (libyuv_use_msa) { - deps += [ ":libyuv_msa" ] - } - - if (libyuv_use_mmi) { - deps += [ ":libyuv_mmi" ] - } - - if (!is_ios && !libyuv_disable_jpeg) { - # Make sure that clients of libyuv link with libjpeg. This can't go in - # libyuv_internal because in Windows x64 builds that will generate a clang - # build of libjpeg, and we don't want two copies. - deps += [ "//third_party:jpeg" ] - } -} - -static_library("libyuv_internal") { - visibility = [ ":*" ] - - sources = [ - # Headers - "include/libyuv.h", - "include/libyuv/basic_types.h", - "include/libyuv/compare.h", - "include/libyuv/convert.h", - "include/libyuv/convert_argb.h", - "include/libyuv/convert_from.h", - "include/libyuv/convert_from_argb.h", - "include/libyuv/cpu_id.h", - "include/libyuv/mjpeg_decoder.h", - "include/libyuv/planar_functions.h", - "include/libyuv/rotate.h", - "include/libyuv/rotate_argb.h", - "include/libyuv/rotate_row.h", - "include/libyuv/row.h", - "include/libyuv/scale.h", - "include/libyuv/scale_argb.h", - "include/libyuv/scale_row.h", - "include/libyuv/scale_uv.h", - "include/libyuv/version.h", - "include/libyuv/video_common.h", - - # Source Files - "source/compare.cc", - "source/compare_common.cc", - "source/compare_gcc.cc", - "source/compare_win.cc", - "source/convert.cc", - "source/convert_argb.cc", - "source/convert_from.cc", - "source/convert_from_argb.cc", - "source/convert_jpeg.cc", - "source/convert_to_argb.cc", - "source/convert_to_i420.cc", - "source/cpu_id.cc", - "source/mjpeg_decoder.cc", - "source/mjpeg_validate.cc", - "source/planar_functions.cc", - "source/rotate.cc", - "source/rotate_any.cc", - "source/rotate_argb.cc", - "source/rotate_common.cc", - "source/rotate_gcc.cc", - "source/rotate_win.cc", - "source/row_any.cc", - "source/row_common.cc", - "source/row_gcc.cc", - "source/row_win.cc", - "source/scale.cc", - "source/scale_any.cc", - "source/scale_argb.cc", - "source/scale_common.cc", - "source/scale_gcc.cc", - "source/scale_uv.cc", - "source/scale_win.cc", - "source/video_common.cc", - ] - - configs += [ ":libyuv_config" ] - defines = [] - deps = [] - - if (libyuv_symbols_visible) { - configs -= [ "//build/config/gcc:symbol_visibility_hidden" ] - configs += [ "//build/config/gcc:symbol_visibility_default" ] - } - - if (!is_ios && !libyuv_disable_jpeg) { - defines += [ "HAVE_JPEG" ] - - 
# Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps - # because in Windows x64 build it will get compiled with clang. - deps += [ "//third_party:jpeg_includes" ] - } - - # Always enable optimization for Release and NaCl builds (to workaround - # crbug.com/538243). - if (!is_debug || is_nacl) { - configs -= [ "//build/config/compiler:default_optimization" ] - - # Enable optimize for speed (-O2) over size (-Os). - configs += [ "//build/config/compiler:optimize_max" ] - } - - # To enable AVX2 or other cpu optimization, pass flag here - if (!is_win) { - cflags = [ - # "-mpopcnt", - # "-mavx2", - # "-mfma", - "-ffp-contract=fast", # Enable fma vectorization for NEON. - ] - } - if (!libyuv_use_mmi) { - defines += [ "LIBYUV_DISABLE_MMI" ] - } -} - -if (libyuv_use_neon) { - static_library("libyuv_neon") { - sources = [ - # ARM Source Files - "source/compare_neon.cc", - "source/compare_neon64.cc", - "source/rotate_neon.cc", - "source/rotate_neon64.cc", - "source/row_neon.cc", - "source/row_neon64.cc", - "source/scale_neon.cc", - "source/scale_neon64.cc", - ] - - deps = [ ":libyuv_internal" ] - - public_configs = [ ":libyuv_config" ] - - # Always enable optimization for Release and NaCl builds (to workaround - # crbug.com/538243). - if (!is_debug) { - configs -= [ "//build/config/compiler:default_optimization" ] - - # Enable optimize for speed (-O2) over size (-Os). - # TODO(fbarchard): Consider optimize_speed which is O3. - configs += [ "//build/config/compiler:optimize_max" ] - } - - if (current_cpu != "arm64") { - configs -= [ "//build/config/compiler:compiler_arm_fpu" ] - cflags = [ "-mfpu=neon" ] - } - } -} - -if (libyuv_use_msa) { - static_library("libyuv_msa") { - sources = [ - # MSA Source Files - "source/compare_msa.cc", - "source/rotate_msa.cc", - "source/row_msa.cc", - "source/scale_msa.cc", - ] - - deps = [ ":libyuv_internal" ] - - public_configs = [ ":libyuv_config" ] - } -} - -if (libyuv_use_mmi) { - static_library("libyuv_mmi") { - sources = [ - # MMI Source Files - "source/compare_mmi.cc", - "source/rotate_mmi.cc", - "source/row_mmi.cc", - "source/scale_mmi.cc", - ] - - deps = [ ":libyuv_internal" ] - - public_configs = [ ":libyuv_config" ] - } -} - -if (libyuv_include_tests) { - config("libyuv_unittest_warnings_config") { - if (!is_win) { - cflags = [ - # TODO(fbarchard): Fix sign and unused variable warnings. 
- "-Wno-sign-compare", - "-Wno-unused-variable", - ] - } - if (is_win) { - cflags = [ - "/wd4245", # signed/unsigned mismatch - "/wd4189", # local variable is initialized but not referenced - ] - } - } - config("libyuv_unittest_config") { - defines = [ "GTEST_RELATIVE_PATH" ] - } - - test("libyuv_unittest") { - testonly = true - - sources = [ - "unit_test/basictypes_test.cc", - "unit_test/color_test.cc", - "unit_test/compare_test.cc", - "unit_test/convert_test.cc", - "unit_test/cpu_test.cc", - "unit_test/cpu_thread_test.cc", - "unit_test/math_test.cc", - "unit_test/planar_test.cc", - "unit_test/rotate_argb_test.cc", - "unit_test/rotate_test.cc", - "unit_test/scale_argb_test.cc", - "unit_test/scale_test.cc", - "unit_test/scale_uv_test.cc", - "unit_test/unit_test.cc", - "unit_test/unit_test.h", - "unit_test/video_common_test.cc", - ] - - deps = [ - ":libyuv", - "//testing/gtest", - ] - - defines = [] - if (libyuv_use_absl_flags) { - defines += [ "LIBYUV_USE_ABSL_FLAGS" ] - deps += [ - "//third_party/abseil-cpp/absl/flags:flag", - "//third_party/abseil-cpp/absl/flags:parse", - ] - } - - configs += [ ":libyuv_unittest_warnings_config" ] - - public_deps = [ "//testing/gtest" ] - public_configs = [ ":libyuv_unittest_config" ] - - if (is_linux || is_chromeos) { - cflags = [ "-fexceptions" ] - } - if (is_ios) { - configs -= [ "//build/config/compiler:default_symbols" ] - configs += [ "//build/config/compiler:symbols" ] - cflags = [ "-Wno-sometimes-uninitialized" ] - } - if (!is_ios && !libyuv_disable_jpeg) { - defines += [ "HAVE_JPEG" ] - } - if (is_android) { - deps += [ "//testing/android/native_test:native_test_native_code" ] - } - - # TODO(YangZhang): These lines can be removed when high accuracy - # YUV to RGB to Neon is ported. - if ((target_cpu == "armv7" || target_cpu == "armv7s" || - (target_cpu == "arm" && arm_version >= 7) || target_cpu == "arm64") && - (arm_use_neon || arm_optionally_use_neon)) { - defines += [ "LIBYUV_NEON" ] - } - - defines += [ - # Enable the following 3 macros to turn off assembly for specified CPU. - # "LIBYUV_DISABLE_X86", - # "LIBYUV_DISABLE_NEON", - # Enable the following macro to build libyuv as a shared library (dll). 
- # "LIBYUV_USING_SHARED_LIBRARY" - ] - } - - executable("compare") { - sources = [ - # sources - "util/compare.cc", - ] - deps = [ ":libyuv" ] - if (is_linux || is_chromeos) { - cflags = [ "-fexceptions" ] - } - } - - executable("yuvconvert") { - sources = [ - # sources - "util/yuvconvert.cc", - ] - deps = [ ":libyuv" ] - if (is_linux || is_chromeos) { - cflags = [ "-fexceptions" ] - } - } - - executable("yuvconstants") { - sources = [ - # sources - "util/yuvconstants.c", - ] - deps = [ ":libyuv" ] - if (is_linux || is_chromeos) { - cflags = [ "-fexceptions" ] - } - } - - executable("psnr") { - sources = [ - # sources - "util/psnr.cc", - "util/psnr_main.cc", - "util/ssim.cc", - ] - deps = [ ":libyuv" ] - - if (!is_ios && !libyuv_disable_jpeg) { - defines = [ "HAVE_JPEG" ] - } - } - - executable("i444tonv12_eg") { - sources = [ - # sources - "util/i444tonv12_eg.cc", - ] - deps = [ ":libyuv" ] - } - - executable("cpuid") { - sources = [ - # sources - "util/cpuid.c", - ] - deps = [ ":libyuv" ] - } -} diff --git a/thirdparty/libyuv/CM_linux_packages.cmake b/thirdparty/libyuv/CM_linux_packages.cmake deleted file mode 100644 index 5f676f8..0000000 --- a/thirdparty/libyuv/CM_linux_packages.cmake +++ /dev/null @@ -1,69 +0,0 @@ -# determine the version number from the #define in libyuv/version.h -EXECUTE_PROCESS ( - COMMAND grep --perl-regex --only-matching "(?<=LIBYUV_VERSION )[0-9]+" include/libyuv/version.h - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE YUV_VERSION_NUMBER - OUTPUT_STRIP_TRAILING_WHITESPACE ) -SET ( YUV_VER_MAJOR 0 ) -SET ( YUV_VER_MINOR 0 ) -SET ( YUV_VER_PATCH ${YUV_VERSION_NUMBER} ) -SET ( YUV_VERSION ${YUV_VER_MAJOR}.${YUV_VER_MINOR}.${YUV_VER_PATCH} ) -MESSAGE ( "Building ver.: ${YUV_VERSION}" ) - -# is this a 32-bit or 64-bit build? 
-IF ( CMAKE_SIZEOF_VOID_P EQUAL 8 ) - SET ( YUV_BIT_SIZE 64 ) -ELSEIF ( CMAKE_SIZEOF_VOID_P EQUAL 4 ) - SET ( YUV_BIT_SIZE 32 ) -ELSE () - MESSAGE ( FATAL_ERROR "CMAKE_SIZEOF_VOID_P=${CMAKE_SIZEOF_VOID_P}" ) -ENDIF () - -# detect if this is a ARM build -STRING (FIND "${CMAKE_CXX_COMPILER}" "arm-linux-gnueabihf-g++" pos) -IF ( ${pos} EQUAL -1 ) - SET ( YUV_CROSS_COMPILE_FOR_ARM7 FALSE ) -ELSE () - MESSAGE ( "Cross compiling for ARM7" ) - SET ( YUV_CROSS_COMPILE_FOR_ARM7 TRUE ) -ENDIF () -STRING (FIND "${CMAKE_SYSTEM_PROCESSOR}" "arm" pos) -IF ( ${pos} EQUAL -1 ) - SET ( YUV_COMPILE_FOR_ARM7 FALSE ) -ELSE () - MESSAGE ( "Compiling for ARM" ) - SET ( YUV_COMPILE_FOR_ARM7 TRUE ) -ENDIF () - -# setup the sytem name, such as "x86-32", "amd-64", and "arm-32 -IF ( ${YUV_CROSS_COMPILE_FOR_ARM7} OR ${YUV_COMPILE_FOR_ARM7} ) - SET ( YUV_SYSTEM_NAME "armhf-${YUV_BIT_SIZE}" ) -ELSE () - IF ( YUV_BIT_SIZE EQUAL 32 ) - SET ( YUV_SYSTEM_NAME "x86-${YUV_BIT_SIZE}" ) - ELSE () - SET ( YUV_SYSTEM_NAME "amd-${YUV_BIT_SIZE}" ) - ENDIF () -ENDIF () -MESSAGE ( "Packaging for: ${YUV_SYSTEM_NAME}" ) - -# define all the variables needed by CPack to create .deb and .rpm packages -SET ( CPACK_PACKAGE_VENDOR "Frank Barchard" ) -SET ( CPACK_PACKAGE_CONTACT "fbarchard@chromium.org" ) -SET ( CPACK_PACKAGE_VERSION ${YUV_VERSION} ) -SET ( CPACK_PACKAGE_VERSION_MAJOR ${YUV_VER_MAJOR} ) -SET ( CPACK_PACKAGE_VERSION_MINOR ${YUV_VER_MINOR} ) -SET ( CPACK_PACKAGE_VERSION_PATCH ${YUV_VER_PATCH} ) -SET ( CPACK_RESOURCE_FILE_LICENSE ${PROJECT_SOURCE_DIR}/LICENSE ) -SET ( CPACK_SYSTEM_NAME "linux-${YUV_SYSTEM_NAME}" ) -SET ( CPACK_PACKAGE_NAME "libyuv" ) -SET ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "YUV library" ) -SET ( CPACK_PACKAGE_DESCRIPTION "YUV library and YUV conversion tool" ) -SET ( CPACK_DEBIAN_PACKAGE_SECTION "other" ) -SET ( CPACK_DEBIAN_PACKAGE_PRIORITY "optional" ) -SET ( CPACK_DEBIAN_PACKAGE_MAINTAINER "Frank Barchard " ) -SET ( CPACK_GENERATOR "DEB;RPM" ) - -# create the .deb and .rpm files (you'll need build-essential and rpm tools) -INCLUDE( CPack ) - diff --git a/thirdparty/libyuv/CMakeLists.txt b/thirdparty/libyuv/CMakeLists.txt deleted file mode 100644 index f25ce12..0000000 --- a/thirdparty/libyuv/CMakeLists.txt +++ /dev/null @@ -1,86 +0,0 @@ -# CMakeLists for libyuv -# Originally created for "roxlu build system" to compile libyuv on windows -# Run with -DTEST=ON to build unit tests - -PROJECT(YUV C CXX) # "C" is required even for C++ projects -CMAKE_MINIMUM_REQUIRED(VERSION 2.8) -OPTION(TEST "Built unit tests" OFF) - -SET(ly_base_dir ${PROJECT_SOURCE_DIR}) -SET(ly_src_dir ${ly_base_dir}/source) -SET(ly_inc_dir ${ly_base_dir}/include) -SET(ly_tst_dir ${ly_base_dir}/unit_test) -SET(ly_lib_name yuv) -SET(ly_lib_static ${ly_lib_name}) -SET(ly_lib_shared ${ly_lib_name}_shared) - -FILE(GLOB_RECURSE ly_source_files ${ly_src_dir}/*.cc) -LIST(SORT ly_source_files) - -FILE(GLOB_RECURSE ly_unittest_sources ${ly_tst_dir}/*.cc) -LIST(SORT ly_unittest_sources) - -INCLUDE_DIRECTORIES(BEFORE ${ly_inc_dir}) - -# this creates the static library (.a) -ADD_LIBRARY(${ly_lib_static} STATIC ${ly_source_files}) - -# this creates the shared library (.so) -ADD_LIBRARY(${ly_lib_shared} SHARED ${ly_source_files}) -SET_TARGET_PROPERTIES(${ly_lib_shared} PROPERTIES OUTPUT_NAME "${ly_lib_name}") -SET_TARGET_PROPERTIES(${ly_lib_shared} PROPERTIES PREFIX "lib") - -# this creates the conversion tool -# ADD_EXECUTABLE(yuvconvert ${ly_base_dir}/util/yuvconvert.cc) -# TARGET_LINK_LIBRARIES(yuvconvert ${ly_lib_static}) -# INCLUDE(FindJPEG) - -# 
if(JPEG_FOUND) -# include_directories(${JPEG_INCLUDE_DIR}) -# target_link_libraries(yuvconvert ${JPEG_LIBRARY}) -# add_definitions(-DHAVE_JPEG) -# endif() -if(TEST) - find_library(GTEST_LIBRARY gtest) - - if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND") - set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources") - - if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc) - message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}") - set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc) - add_library(gtest STATIC ${gtest_sources}) - include_directories(${GTEST_SRC_DIR}) - include_directories(${GTEST_SRC_DIR}/include) - set(GTEST_LIBRARY gtest) - else() - message(FATAL_ERROR "TEST is set but unable to find gtest library") - endif() - endif() - - add_executable(libyuv_unittest ${ly_unittest_sources}) - target_link_libraries(libyuv_unittest ${ly_lib_name} ${GTEST_LIBRARY}) - find_library(PTHREAD_LIBRARY pthread) - - if(NOT PTHREAD_LIBRARY STREQUAL "PTHREAD_LIBRARY-NOTFOUND") - target_link_libraries(libyuv_unittest pthread) - endif() - - if(JPEG_FOUND) - target_link_libraries(libyuv_unittest ${JPEG_LIBRARY}) - endif() - - if(NACL AND NACL_LIBC STREQUAL "newlib") - target_link_libraries(libyuv_unittest glibc-compat) - endif() -endif() - -# install the conversion tool, .so, .a, and all the header files -# INSTALL(PROGRAMS ${CMAKE_BINARY_DIR}/yuvconvert DESTINATION bin) -INSTALL(TARGETS ${ly_lib_static} DESTINATION lib) - -# INSTALL ( TARGETS ${ly_lib_shared} LIBRARY DESTINATION lib RUNTIME DESTINATION bin ) -INSTALL(DIRECTORY ${PROJECT_SOURCE_DIR}/include/ DESTINATION include) - -# create the .deb and .rpm packages using cpack -INCLUDE(CM_linux_packages.cmake) diff --git a/thirdparty/libyuv/DIR_METADATA b/thirdparty/libyuv/DIR_METADATA deleted file mode 100644 index 8bc04f1..0000000 --- a/thirdparty/libyuv/DIR_METADATA +++ /dev/null @@ -1,3 +0,0 @@ -monorail { - component: "Internals>Images>Codecs" -} diff --git a/thirdparty/libyuv/LICENSE b/thirdparty/libyuv/LICENSE deleted file mode 100644 index c911747..0000000 --- a/thirdparty/libyuv/LICENSE +++ /dev/null @@ -1,29 +0,0 @@ -Copyright 2011 The LibYuv Project Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - - * Neither the name of Google nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/thirdparty/libyuv/OWNERS b/thirdparty/libyuv/OWNERS deleted file mode 100644 index a96669f..0000000 --- a/thirdparty/libyuv/OWNERS +++ /dev/null @@ -1,10 +0,0 @@ -mbonadei@chromium.org -fbarchard@chromium.org -magjed@chromium.org -pbos@chromium.org - -per-file *.gn=mbonadei@chromium.org -per-file .gitignore=* -per-file AUTHORS=* -per-file DEPS=* -per-file PRESUBMIT.py=mbonadei@chromium.org diff --git a/thirdparty/libyuv/PATENTS b/thirdparty/libyuv/PATENTS deleted file mode 100644 index 64aa5c9..0000000 --- a/thirdparty/libyuv/PATENTS +++ /dev/null @@ -1,24 +0,0 @@ -Additional IP Rights Grant (Patents) - -"This implementation" means the copyrightable works distributed by -Google as part of the LibYuv code package. - -Google hereby grants to you a perpetual, worldwide, non-exclusive, -no-charge, irrevocable (except as stated in this section) patent -license to make, have made, use, offer to sell, sell, import, -transfer, and otherwise run, modify and propagate the contents of this -implementation of the LibYuv code package, where such license applies -only to those patent claims, both currently owned by Google and -acquired in the future, licensable by Google that are necessarily -infringed by this implementation of the LibYuv code package. This -grant does not include claims that would be infringed only as a -consequence of further modification of this implementation. If you or -your agent or exclusive licensee institute or order or agree to the -institution of patent litigation against any entity (including a -cross-claim or counterclaim in a lawsuit) alleging that this -implementation of the LibYuv code package or any code incorporated -within this implementation of the LibYuv code package constitutes -direct or contributory patent infringement, or inducement of patent -infringement, then any patent rights granted to you under this License -for this implementation of the LibYuv code package shall terminate as -of the date such litigation is filed. \ No newline at end of file diff --git a/thirdparty/libyuv/PRESUBMIT.py b/thirdparty/libyuv/PRESUBMIT.py deleted file mode 100644 index b867239..0000000 --- a/thirdparty/libyuv/PRESUBMIT.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2017 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. 
- - -def _CommonChecks(input_api, output_api): - """Checks common to both upload and commit.""" - results = [] - results.extend(input_api.canned_checks.RunPylint(input_api, output_api, - files_to_skip=(r'^base[\\\/].*\.py$', - r'^build[\\\/].*\.py$', - r'^buildtools[\\\/].*\.py$', - r'^ios[\\\/].*\.py$', - r'^out.*[\\\/].*\.py$', - r'^testing[\\\/].*\.py$', - r'^third_party[\\\/].*\.py$', - r'^tools[\\\/].*\.py$', - # TODO(kjellander): should arguably be checked. - r'^tools_libyuv[\\\/]valgrind[\\\/].*\.py$', - r'^xcodebuild.*[\\\/].*\.py$',), - disabled_warnings=['F0401', # Failed to import x - 'E0611', # No package y in x - 'W0232', # Class has no __init__ method - ], - pylintrc='pylintrc')) - return results - - -def CheckChangeOnUpload(input_api, output_api): - results = [] - results.extend(_CommonChecks(input_api, output_api)) - results.extend( - input_api.canned_checks.CheckGNFormatted(input_api, output_api)) - return results - - -def CheckChangeOnCommit(input_api, output_api): - results = [] - results.extend(_CommonChecks(input_api, output_api)) - results.extend(input_api.canned_checks.CheckOwners(input_api, output_api)) - results.extend(input_api.canned_checks.CheckChangeWasUploaded( - input_api, output_api)) - results.extend(input_api.canned_checks.CheckChangeHasDescription( - input_api, output_api)) - return results diff --git a/thirdparty/libyuv/README.chromium b/thirdparty/libyuv/README.chromium deleted file mode 100644 index a493527..0000000 --- a/thirdparty/libyuv/README.chromium +++ /dev/null @@ -1,8 +0,0 @@ -Name: libyuv -URL: http://code.google.com/p/libyuv/ -Version: 1787 -License: BSD -License File: LICENSE - -Description: -libyuv is an open source project that includes YUV conversion and scaling functionality. diff --git a/thirdparty/libyuv/README.md b/thirdparty/libyuv/README.md deleted file mode 100644 index db70b7f..0000000 --- a/thirdparty/libyuv/README.md +++ /dev/null @@ -1,18 +0,0 @@ -**libyuv** is an open source project that includes YUV scaling and conversion functionality. - -* Scale YUV to prepare content for compression, with point, bilinear or box filter. -* Convert to YUV from webcam formats for compression. -* Convert to RGB formats for rendering/effects. -* Rotate by 90/180/270 degrees to adjust for mobile devices in portrait mode. -* Optimized for SSSE3/AVX2 on x86/x64. -* Optimized for Neon on Arm. -* Optimized for MSA on Mips. - -### Development - -See [Getting started][1] for instructions on how to get started developing. - -You can also browse the [docs directory][2] for more documentation. - -[1]: ./docs/getting_started.md -[2]: ./docs/ diff --git a/thirdparty/libyuv/build_overrides/build.gni b/thirdparty/libyuv/build_overrides/build.gni deleted file mode 100644 index 473aea5..0000000 --- a/thirdparty/libyuv/build_overrides/build.gni +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2016 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -# Variable that can be used to support multiple build scenarios, like having -# Chromium specific targets in a client project's GN file etc. -build_with_chromium = false - -# Some non-Chromium builds don't support building java targets. 
-enable_java_templates = true - -# Allow using custom suppressions files (currently not used by libyuv). -asan_suppressions_file = "//build/sanitizers/asan_suppressions.cc" -lsan_suppressions_file = "//build/sanitizers/lsan_suppressions.cc" -tsan_suppressions_file = "//build/sanitizers/tsan_suppressions.cc" - -msan_blacklist_path = - rebase_path("//tools_libyuv/msan/blacklist.txt", root_build_dir) -ubsan_blacklist_path = - rebase_path("//tools_libyuv/ubsan/blacklist.txt", root_build_dir) -ubsan_vptr_blacklist_path = - rebase_path("//tools_libyuv/ubsan/vptr_blacklist.txt", root_build_dir) - -# For Chromium, Android 32-bit non-component, non-clang builds hit a 4GiB size -# limit, making them requiring symbol_level=2. WebRTC doesn't hit that problem -# so we just ignore that assert. See https://crbug.com/648948 for more info. -ignore_elf32_limitations = true - -# Use bundled hermetic Xcode installation maintained by Chromium, -# except for local iOS builds where it is unsupported. -if (host_os == "mac") { - _result = exec_script("//build/mac/should_use_hermetic_xcode.py", - [ target_os ], - "value") - assert(_result != 2, - "Do not allow building targets with the default" + - "hermetic toolchain if the minimum OS version is not met.") - use_system_xcode = _result == 0 -} - -declare_args() { - # Tracing support requires //third_party/perfetto. - enable_base_tracing = false - use_perfetto_client_library = false - - # Allows googletest to pretty-print various absl types. - # Defined here rather than in gtest.gni to match chromium. - gtest_enable_absl_printers = true -} diff --git a/thirdparty/libyuv/build_overrides/gtest.gni b/thirdparty/libyuv/build_overrides/gtest.gni deleted file mode 100644 index d3c3f68..0000000 --- a/thirdparty/libyuv/build_overrides/gtest.gni +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2016 The LibYuv project authors. All Rights Reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -# Include support for registering main function in multi-process tests. -gtest_include_multiprocess = true - -# Include support for platform-specific operations across unit tests. -gtest_include_platform_test = true - -# Exclude support for testing Objective C code on OS X and iOS. -gtest_include_objc_support = true - -# Exclude support for flushing coverage files on iOS. -gtest_include_ios_coverage = true diff --git a/thirdparty/libyuv/cleanup_links.py b/thirdparty/libyuv/cleanup_links.py deleted file mode 100644 index ba29078..0000000 --- a/thirdparty/libyuv/cleanup_links.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python -# Copyright 2017 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -# This is a copy of the file from WebRTC in: -# https://chromium.googlesource.com/external/webrtc/+/master/cleanup_links.py - -"""Script to cleanup symlinks created from setup_links.py. 
- -Before 177567c518b121731e507e9b9c4049c4dc96e4c8 (#15754) we had a Chromium -checkout which we created symlinks into. In order to do clean syncs after -landing that change, this script cleans up any old symlinks, avoiding annoying -manual cleanup needed in order to complete gclient sync. -""" - -import logging -import optparse -import os -import shelve -import subprocess -import sys - - -ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) -LINKS_DB = 'links' - -# Version management to make future upgrades/downgrades easier to support. -SCHEMA_VERSION = 1 - -class WebRTCLinkSetup(object): - def __init__(self, links_db, dry_run=False): - self._dry_run = dry_run - self._links_db = links_db - - def CleanupLinks(self): - logging.debug('CleanupLinks') - for source, link_path in self._links_db.iteritems(): - if source == 'SCHEMA_VERSION': - continue - if os.path.islink(link_path) or sys.platform.startswith('win'): - # os.path.islink() always returns false on Windows - # See http://bugs.python.org/issue13143. - logging.debug('Removing link to %s at %s', source, link_path) - if not self._dry_run: - if os.path.exists(link_path): - if sys.platform.startswith('win') and os.path.isdir(link_path): - subprocess.check_call(['rmdir', '/q', '/s', link_path], - shell=True) - else: - os.remove(link_path) - del self._links_db[source] - - -def _initialize_database(filename): - links_database = shelve.open(filename) - # Wipe the database if this version of the script ends up looking at a - # newer (future) version of the links db, just to be sure. - version = links_database.get('SCHEMA_VERSION') - if version and version != SCHEMA_VERSION: - logging.info('Found database with schema version %s while this script only ' - 'supports %s. Wiping previous database contents.', version, - SCHEMA_VERSION) - links_database.clear() - links_database['SCHEMA_VERSION'] = SCHEMA_VERSION - return links_database - - -def main(): - parser = optparse.OptionParser() - parser.add_option('-d', '--dry-run', action='store_true', default=False, - help='Print what would be done, but don\'t perform any ' - 'operations. This will automatically set logging to ' - 'verbose.') - parser.add_option('-v', '--verbose', action='store_const', - const=logging.DEBUG, default=logging.INFO, - help='Print verbose output for debugging.') - options, _ = parser.parse_args() - - if options.dry_run: - options.verbose = logging.DEBUG - logging.basicConfig(format='%(message)s', level=options.verbose) - - # Work from the root directory of the checkout. - script_dir = os.path.dirname(os.path.abspath(__file__)) - os.chdir(script_dir) - - # The database file gets .db appended on some platforms. - db_filenames = [LINKS_DB, LINKS_DB + '.db'] - if any(os.path.isfile(f) for f in db_filenames): - links_database = _initialize_database(LINKS_DB) - try: - symlink_creator = WebRTCLinkSetup(links_database, options.dry_run) - symlink_creator.CleanupLinks() - finally: - for f in db_filenames: - if os.path.isfile(f): - os.remove(f) - return 0 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/thirdparty/libyuv/codereview.settings b/thirdparty/libyuv/codereview.settings deleted file mode 100644 index b226fae..0000000 --- a/thirdparty/libyuv/codereview.settings +++ /dev/null @@ -1,5 +0,0 @@ -# This file is used by `git cl` to get repository specific information. 
-CODE_REVIEW_SERVER: codereview.chromium.org -GERRIT_HOST: True -PROJECT: libyuv -VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/ diff --git a/thirdparty/libyuv/download_vs_toolchain.py b/thirdparty/libyuv/download_vs_toolchain.py deleted file mode 100644 index 49d0693..0000000 --- a/thirdparty/libyuv/download_vs_toolchain.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2014 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -# This script is used to run the vs_toolchain.py script to download the -# Visual Studio toolchain. It's just a temporary measure while waiting for the -# Chrome team to move find_depot_tools into src/build to get rid of these -# workarounds (similar one in gyp_libyuv). - -import os -import sys - - -checkout_root = os.path.dirname(os.path.realpath(__file__)) -sys.path.insert(0, os.path.join(checkout_root, 'build')) -sys.path.insert(0, os.path.join(checkout_root, 'tools', 'find_depot_tools')) - - -import vs_toolchain # pylint: disable=wrong-import-position - - -if __name__ == '__main__': - sys.exit(vs_toolchain.main()) diff --git a/thirdparty/libyuv/include/libyuv.h b/thirdparty/libyuv/include/libyuv.h deleted file mode 100644 index a06e123..0000000 --- a/thirdparty/libyuv/include/libyuv.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_H_ -#define INCLUDE_LIBYUV_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/compare.h" -#include "libyuv/convert.h" -#include "libyuv/convert_argb.h" -#include "libyuv/convert_from.h" -#include "libyuv/convert_from_argb.h" -#include "libyuv/cpu_id.h" -#include "libyuv/mjpeg_decoder.h" -#include "libyuv/planar_functions.h" -#include "libyuv/rotate.h" -#include "libyuv/rotate_argb.h" -#include "libyuv/row.h" -#include "libyuv/scale.h" -#include "libyuv/scale_argb.h" -#include "libyuv/scale_row.h" -#include "libyuv/scale_uv.h" -#include "libyuv/version.h" -#include "libyuv/video_common.h" - -#endif // INCLUDE_LIBYUV_H_ diff --git a/thirdparty/libyuv/include/libyuv/basic_types.h b/thirdparty/libyuv/include/libyuv/basic_types.h deleted file mode 100644 index 1bea67f..0000000 --- a/thirdparty/libyuv/include/libyuv/basic_types.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ -#define INCLUDE_LIBYUV_BASIC_TYPES_H_ - -#include <stddef.h> // For size_t and NULL - -#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG) -#define INT_TYPES_DEFINED - -#if defined(_MSC_VER) && (_MSC_VER < 1600) -#include <sys/types.h> // for uintptr_t on x86 -typedef unsigned __int64 uint64_t; -typedef __int64 int64_t; -typedef unsigned int uint32_t; -typedef int int32_t; -typedef unsigned short uint16_t; -typedef short int16_t; -typedef unsigned char uint8_t; -typedef signed char int8_t; -#else -#include <stdint.h> // for uintptr_t and C99 types -#endif // defined(_MSC_VER) && (_MSC_VER < 1600) -// Types are deprecated. Enable this macro for legacy types. -#ifdef LIBYUV_LEGACY_TYPES -typedef uint64_t uint64; -typedef int64_t int64; -typedef uint32_t uint32; -typedef int32_t int32; -typedef uint16_t uint16; -typedef int16_t int16; -typedef uint8_t uint8; -typedef int8_t int8; -#endif // LIBYUV_LEGACY_TYPES -#endif // INT_TYPES_DEFINED - -#if !defined(LIBYUV_API) -#if defined(_WIN32) || defined(__CYGWIN__) -#if defined(LIBYUV_BUILDING_SHARED_LIBRARY) -#define LIBYUV_API __declspec(dllexport) -#elif defined(LIBYUV_USING_SHARED_LIBRARY) -#define LIBYUV_API __declspec(dllimport) -#else -#define LIBYUV_API -#endif // LIBYUV_BUILDING_SHARED_LIBRARY -#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ - (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ - defined(LIBYUV_USING_SHARED_LIBRARY)) -#define LIBYUV_API __attribute__((visibility("default"))) -#else -#define LIBYUV_API -#endif // __GNUC__ -#endif // LIBYUV_API - -// TODO(fbarchard): Remove bool macros. -#define LIBYUV_BOOL int -#define LIBYUV_FALSE 0 -#define LIBYUV_TRUE 1 - -#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/thirdparty/libyuv/include/libyuv/compare.h b/thirdparty/libyuv/include/libyuv/compare.h deleted file mode 100644 index 3353ad7..0000000 --- a/thirdparty/libyuv/include/libyuv/compare.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_COMPARE_H_ -#define INCLUDE_LIBYUV_COMPARE_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Compute a hash for specified memory. Seed of 5381 recommended. -LIBYUV_API -uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed); - -// Hamming Distance -LIBYUV_API -uint64_t ComputeHammingDistance(const uint8_t* src_a, - const uint8_t* src_b, - int count); - -// Scan an opaque argb image and return fourcc based on alpha offset. -// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. -LIBYUV_API -uint32_t ARGBDetect(const uint8_t* argb, - int stride_argb, - int width, - int height); - -// Sum Square Error - used to compute Mean Square Error or PSNR.
-LIBYUV_API -uint64_t ComputeSumSquareError(const uint8_t* src_a, - const uint8_t* src_b, - int count); - -LIBYUV_API -uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height); - -static const int kMaxPsnr = 128; - -LIBYUV_API -double SumSquareErrorToPsnr(uint64_t sse, uint64_t count); - -LIBYUV_API -double CalcFramePsnr(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height); - -LIBYUV_API -double I420Psnr(const uint8_t* src_y_a, - int stride_y_a, - const uint8_t* src_u_a, - int stride_u_a, - const uint8_t* src_v_a, - int stride_v_a, - const uint8_t* src_y_b, - int stride_y_b, - const uint8_t* src_u_b, - int stride_u_b, - const uint8_t* src_v_b, - int stride_v_b, - int width, - int height); - -LIBYUV_API -double CalcFrameSsim(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height); - -LIBYUV_API -double I420Ssim(const uint8_t* src_y_a, - int stride_y_a, - const uint8_t* src_u_a, - int stride_u_a, - const uint8_t* src_v_a, - int stride_v_a, - const uint8_t* src_y_b, - int stride_y_b, - const uint8_t* src_u_b, - int stride_u_b, - const uint8_t* src_v_b, - int stride_v_b, - int width, - int height); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_COMPARE_H_ diff --git a/thirdparty/libyuv/include/libyuv/compare_row.h b/thirdparty/libyuv/include/libyuv/compare_row.h deleted file mode 100644 index 18c5fa4..0000000 --- a/thirdparty/libyuv/include/libyuv/compare_row.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_ -#define INCLUDE_LIBYUV_COMPARE_ROW_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ - _MSC_VER >= 1700 -#define VISUALC_HAS_AVX2 1 -#endif // VisualStudio >= 2012 - -// clang >= 3.4.0 required for AVX2. 
-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) -#define CLANG_HAS_AVX2 1 -#endif // clang >= 3.4 -#endif // __clang__ - -// The following are available for Visual C and GCC: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86)) -#define HAS_HASHDJB2_SSE41 -#define HAS_SUMSQUAREERROR_SSE2 -#define HAS_HAMMINGDISTANCE_SSE42 -#endif - -// The following are available for Visual C and clangcl 32 bit: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ - defined(_MSC_VER) && !defined(__clang__) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) -#define HAS_HASHDJB2_AVX2 -#define HAS_SUMSQUAREERROR_AVX2 -#endif - -// The following are available for GCC and clangcl: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) -#define HAS_HAMMINGDISTANCE_SSSE3 -#endif - -// The following are available for GCC and clangcl: -#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \ - (defined(__x86_64__) || defined(__i386__)) -#define HAS_HAMMINGDISTANCE_AVX2 -#endif - -// The following are available for Neon: -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_SUMSQUAREERROR_NEON -#define HAS_HAMMINGDISTANCE_NEON -#endif - -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#define HAS_HAMMINGDISTANCE_MSA -#define HAS_SUMSQUAREERROR_MSA -#endif - -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) -#define HAS_HAMMINGDISTANCE_MMI -#define HAS_SUMSQUAREERROR_MMI -#endif - -uint32_t HammingDistance_C(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_SSSE3(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_AVX2(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_MSA(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_C(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_AVX2(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_MSA(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count); - -uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed); -uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed); -uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_COMPARE_ROW_H_ diff --git a/thirdparty/libyuv/include/libyuv/convert.h b/thirdparty/libyuv/include/libyuv/convert.h deleted file mode 100644 index 93e7550..0000000 --- a/thirdparty/libyuv/include/libyuv/convert.h +++ /dev/null @@ -1,860 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_H_ -#define INCLUDE_LIBYUV_CONVERT_H_ - -#include "libyuv/basic_types.h" - -#include "libyuv/rotate.h" // For enum RotationMode. - -// TODO(fbarchard): fix WebRTC source to include following libyuv headers: -#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620 -#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620 -#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618 - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Convert I444 to I420. -LIBYUV_API -int I444ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I444 to NV12. -LIBYUV_API -int I444ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert I444 to NV21. -LIBYUV_API -int I444ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Convert I422 to I420. -LIBYUV_API -int I422ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I422 to I444. -LIBYUV_API -int I422ToI444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I422 to NV21. -LIBYUV_API -int I422ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Copy I420 to I420. -#define I420ToI420 I420Copy -LIBYUV_API -int I420Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I420 to I444. 
-LIBYUV_API -int I420ToI444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Copy I010 to I010 -#define I010ToI010 I010Copy -#define H010ToH010 I010Copy -LIBYUV_API -int I010Copy(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert 10 bit YUV to 8 bit -#define H010ToH420 I010ToI420 -LIBYUV_API -int I010ToI420(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define H210ToH422 I210ToI422 -LIBYUV_API -int I210ToI422(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define H410ToH444 I410ToI444 -LIBYUV_API -int I410ToI444(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define H012ToH420 I012ToI420 -LIBYUV_API -int I012ToI420(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define H212ToH422 I212ToI422 -LIBYUV_API -int I212ToI422(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define H412ToH444 I412ToI444 -LIBYUV_API -int I412ToI444(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define I412ToI012 I410ToI010 -#define H410ToH010 I410ToI010 -#define H412ToH012 I410ToI010 -LIBYUV_API -int I410ToI010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define I212ToI012 I210ToI010 -#define H210ToH010 I210ToI010 -#define H212ToH012 I210ToI010 -LIBYUV_API -int I210ToI010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I010 
to I410 -LIBYUV_API -int I010ToI410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I012 to I412 -#define I012ToI412 I010ToI410 - -// Convert I210 to I410 -LIBYUV_API -int I210ToI410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I212 to I412 -#define I212ToI412 I210ToI410 - -// Convert I010 to P010 -LIBYUV_API -int I010ToP010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert I210 to P210 -LIBYUV_API -int I210ToP210(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert I012 to P012 -LIBYUV_API -int I012ToP012(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert I212 to P212 -LIBYUV_API -int I212ToP212(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert I400 (grey) to I420. -LIBYUV_API -int I400ToI420(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I400 (grey) to NV21. -LIBYUV_API -int I400ToNV21(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -#define J400ToJ420 I400ToI420 - -// Convert NV12 to I420. -LIBYUV_API -int NV12ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert NV21 to I420. -LIBYUV_API -int NV21ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert NV12 to NV24. -LIBYUV_API -int NV12ToNV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert NV16 to NV24. -LIBYUV_API -int NV16ToNV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert P010 to P410. 
-LIBYUV_API -int P010ToP410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert P012 to P412. -#define P012ToP412 P010ToP410 - -// Convert P016 to P416. -#define P016ToP416 P010ToP410 - -// Convert P210 to P410. -LIBYUV_API -int P210ToP410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert P212 to P412. -#define P212ToP412 P210ToP410 - -// Convert P216 to P416. -#define P216ToP416 P210ToP410 - -// Convert YUY2 to I420. -LIBYUV_API -int YUY2ToI420(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert UYVY to I420. -LIBYUV_API -int UYVYToI420(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert AYUV to NV12. -LIBYUV_API -int AYUVToNV12(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert AYUV to NV21. -LIBYUV_API -int AYUVToNV21(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Convert Android420 to I420. -LIBYUV_API -int Android420ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// ARGB little endian (bgra in memory) to I420. -LIBYUV_API -int ARGBToI420(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// BGRA little endian (argb in memory) to I420. -LIBYUV_API -int BGRAToI420(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// ABGR little endian (rgba in memory) to I420. -LIBYUV_API -int ABGRToI420(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGBA little endian (abgr in memory) to I420. -LIBYUV_API -int RGBAToI420(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB little endian (bgr in memory) to I420. -LIBYUV_API -int RGB24ToI420(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB little endian (bgr in memory) to J420. 
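All of the packed-RGB-to-I420 entry points above share the same calling pattern; the sketch below (the helper name and plane layout are illustrative, not part of the header) shows ARGBToI420 writing into a single contiguous I420 buffer:

#include <stddef.h>
#include <stdint.h>
#include "libyuv/convert.h"

/* dst_i420 must hold width*height + 2 * ((width+1)/2) * ((height+1)/2) bytes:
 * a full-resolution Y plane followed by quarter-resolution U and V planes. */
int ArgbFrameToI420(const uint8_t* argb, int width, int height,
                    uint8_t* dst_i420) {
  int y_stride = width;
  int uv_stride = (width + 1) / 2;
  int uv_height = (height + 1) / 2;
  uint8_t* dst_y = dst_i420;
  uint8_t* dst_u = dst_y + (size_t)y_stride * height;
  uint8_t* dst_v = dst_u + (size_t)uv_stride * uv_height;
  return ARGBToI420(argb, width * 4,   /* 4 bytes per ARGB pixel. */
                    dst_y, y_stride,
                    dst_u, uv_stride,
                    dst_v, uv_stride,
                    width, height);    /* Returns 0 on success. */
}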
-LIBYUV_API -int RGB24ToJ420(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB big endian (rgb in memory) to I420. -LIBYUV_API -int RAWToI420(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB big endian (rgb in memory) to J420. -LIBYUV_API -int RAWToJ420(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB16 (RGBP fourcc) little endian to I420. -LIBYUV_API -int RGB565ToI420(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB15 (RGBO fourcc) little endian to I420. -LIBYUV_API -int ARGB1555ToI420(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB12 (R444 fourcc) little endian to I420. -LIBYUV_API -int ARGB4444ToI420(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB little endian (bgr in memory) to J400. -LIBYUV_API -int RGB24ToJ400(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height); - -// RGB big endian (rgb in memory) to J400. -LIBYUV_API -int RAWToJ400(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height); - -// src_width/height provided by capture. -// dst_width/height for clipping determine final size. -LIBYUV_API -int MJPGToI420(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int src_width, - int src_height, - int dst_width, - int dst_height); - -// JPEG to NV21 -LIBYUV_API -int MJPGToNV21(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int src_width, - int src_height, - int dst_width, - int dst_height); - -// JPEG to NV12 -LIBYUV_API -int MJPGToNV12(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int src_width, - int src_height, - int dst_width, - int dst_height); - -// Query size of MJPG in pixels. -LIBYUV_API -int MJPGSize(const uint8_t* sample, - size_t sample_size, - int* width, - int* height); - -// Convert camera sample to I420 with cropping, rotation and vertical flip. -// "src_size" is needed to parse MJPG. -// "dst_stride_y" number of bytes in a row of the dst_y plane. -// Normally this would be the same as dst_width, with recommended alignment -// to 16 bytes for better efficiency. -// If rotation of 90 or 270 is used, stride is affected. The caller should -// allocate the I420 buffer according to rotation. -// "dst_stride_u" number of bytes in a row of the dst_u plane. -// Normally this would be the same as (dst_width + 1) / 2, with -// recommended alignment to 16 bytes for better efficiency. 
-// If rotation of 90 or 270 is used, stride is affected. -// "crop_x" and "crop_y" are starting position for cropping. -// To center, crop_x = (src_width - dst_width) / 2 -// crop_y = (src_height - dst_height) / 2 -// "src_width" / "src_height" is size of src_frame in pixels. -// "src_height" can be negative indicating a vertically flipped image source. -// "crop_width" / "crop_height" is the size to crop the src to. -// Must be less than or equal to src_width/src_height -// Cropping parameters are pre-rotation. -// "rotation" can be 0, 90, 180 or 270. -// "fourcc" is a fourcc. ie 'I420', 'YUY2' -// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. -LIBYUV_API -int ConvertToI420(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_CONVERT_H_ diff --git a/thirdparty/libyuv/include/libyuv/convert_argb.h b/thirdparty/libyuv/include/libyuv/convert_argb.h deleted file mode 100644 index eb4ebd5..0000000 --- a/thirdparty/libyuv/include/libyuv/convert_argb.h +++ /dev/null @@ -1,1974 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ -#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ - -#include "libyuv/basic_types.h" - -#include "libyuv/rotate.h" // For enum RotationMode. - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Conversion matrix for YUV to RGB -LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601 -LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // BT.601 full -LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709 -LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full -LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020 -LIBYUV_API extern const struct YuvConstants kYuvV2020Constants; // BT.2020 full - -// Conversion matrix for YVU to BGR -LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601 -LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // BT.601 full -LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709 -LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full -LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020 -LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full - -// Macros for end swapped destination Matrix conversions. -// Swap UV and pass mirrored kYvuJPEGConstants matrix. -// TODO(fbarchard): Add macro for each Matrix function. 
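As a usage sketch for the ConvertToI420 entry point documented above: center-cropping a packed YUY2 capture frame, assuming FOURCC_YUY2 comes from libyuv/video_common.h and kRotate0 from rotate.h (neither is declared in this header), and that the destination planes are already allocated:

#include <stddef.h>
#include <stdint.h>
#include "libyuv/convert.h"
#include "libyuv/video_common.h"  /* Assumed location of FOURCC_YUY2. */

/* Center-crop a YUY2 camera frame to crop_w x crop_h and convert to I420. */
int CenterCropYuy2ToI420(const uint8_t* sample, size_t sample_size,
                         int src_width, int src_height,
                         int crop_w, int crop_h,
                         uint8_t* dst_y, int dst_stride_y,
                         uint8_t* dst_u, int dst_stride_u,
                         uint8_t* dst_v, int dst_stride_v) {
  int crop_x = (src_width - crop_w) / 2;
  int crop_y = (src_height - crop_h) / 2;
  return ConvertToI420(sample, sample_size,
                       dst_y, dst_stride_y,
                       dst_u, dst_stride_u,
                       dst_v, dst_stride_v,
                       crop_x, crop_y,
                       src_width, src_height,
                       crop_w, crop_h,
                       kRotate0,       /* No rotation. */
                       FOURCC_YUY2);   /* Source pixel format. */
}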
-#define kYuvI601ConstantsVU kYvuI601Constants -#define kYuvJPEGConstantsVU kYvuJPEGConstants -#define kYuvH709ConstantsVU kYvuH709Constants -#define kYuvF709ConstantsVU kYvuF709Constants -#define kYuv2020ConstantsVU kYvu2020Constants -#define kYuvV2020ConstantsVU kYvuV2020Constants - -#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \ - NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) -#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \ - NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) -#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ - NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) -#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ - NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) -#define I010ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ - I010ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I210ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ - I210ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I410ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ - I410ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I010ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ - I010ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I210ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ - I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ - I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) -#define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I422AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) -#define I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I444AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) -#define I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I010AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) -#define I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I210AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) -#define I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I410AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) - -// Alias. -#define ARGBToARGB ARGBCopy - -// Copy ARGB to ARGB. -LIBYUV_API -int ARGBCopy(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I420 to ARGB. -LIBYUV_API -int I420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I420 to ABGR. -LIBYUV_API -int I420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert J420 to ARGB. -LIBYUV_API -int J420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert J420 to ABGR. -LIBYUV_API -int J420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert H420 to ARGB. 
-LIBYUV_API -int H420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert H420 to ABGR. -LIBYUV_API -int H420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert U420 to ARGB. -LIBYUV_API -int U420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert U420 to ABGR. -LIBYUV_API -int U420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I422 to ARGB. -LIBYUV_API -int I422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I422 to ABGR. -LIBYUV_API -int I422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert J422 to ARGB. -LIBYUV_API -int J422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert J422 to ABGR. -LIBYUV_API -int J422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert H422 to ARGB. -LIBYUV_API -int H422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert H422 to ABGR. -LIBYUV_API -int H422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert U422 to ARGB. -LIBYUV_API -int U422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert U422 to ABGR. -LIBYUV_API -int U422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I444 to ARGB. -LIBYUV_API -int I444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I444 to ABGR. 
-LIBYUV_API -int I444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert J444 to ARGB. -LIBYUV_API -int J444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert J444 to ABGR. -LIBYUV_API -int J444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert H444 to ARGB. -LIBYUV_API -int H444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert H444 to ABGR. -LIBYUV_API -int H444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert U444 to ARGB. -LIBYUV_API -int U444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert U444 to ABGR. -LIBYUV_API -int U444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I010 to ARGB. -LIBYUV_API -int I010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I010 to ABGR. -LIBYUV_API -int I010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert H010 to ARGB. -LIBYUV_API -int H010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert H010 to ABGR. -LIBYUV_API -int H010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert U010 to ARGB. -LIBYUV_API -int U010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert U010 to ABGR. -LIBYUV_API -int U010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I210 to ARGB. 
-LIBYUV_API -int I210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I210 to ABGR. -LIBYUV_API -int I210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert H210 to ARGB. -LIBYUV_API -int H210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert H210 to ABGR. -LIBYUV_API -int H210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert U210 to ARGB. -LIBYUV_API -int U210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert U210 to ABGR. -LIBYUV_API -int U210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I420 with Alpha to preattenuated ARGB. -LIBYUV_API -int I420AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate); - -// Convert I420 with Alpha to preattenuated ABGR. -LIBYUV_API -int I420AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate); - -// Convert I422 with Alpha to preattenuated ARGB. -LIBYUV_API -int I422AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate); - -// Convert I422 with Alpha to preattenuated ABGR. -LIBYUV_API -int I422AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate); - -// Convert I444 with Alpha to preattenuated ARGB. -LIBYUV_API -int I444AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate); - -// Convert I444 with Alpha to preattenuated ABGR. 
-LIBYUV_API -int I444AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate); - -// Convert I400 (grey) to ARGB. Reverse of ARGBToI400. -LIBYUV_API -int I400ToARGB(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert J400 (jpeg grey) to ARGB. -LIBYUV_API -int J400ToARGB(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Alias. -#define YToARGB I400ToARGB - -// Convert NV12 to ARGB. -LIBYUV_API -int NV12ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert NV21 to ARGB. -LIBYUV_API -int NV21ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert NV12 to ABGR. -LIBYUV_API -int NV12ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert NV21 to ABGR. -LIBYUV_API -int NV21ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert NV12 to RGB24. -LIBYUV_API -int NV12ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -// Convert NV21 to RGB24. -LIBYUV_API -int NV21ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -// Convert NV21 to YUV24. -LIBYUV_API -int NV21ToYUV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_yuv24, - int dst_stride_yuv24, - int width, - int height); - -// Convert NV12 to RAW. -LIBYUV_API -int NV12ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -// Convert NV21 to RAW. -LIBYUV_API -int NV21ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -// Convert YUY2 to ARGB. -LIBYUV_API -int YUY2ToARGB(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert UYVY to ARGB. -LIBYUV_API -int UYVYToARGB(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I010 to AR30. -LIBYUV_API -int I010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert H010 to AR30. 
-LIBYUV_API -int H010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert I010 to AB30. -LIBYUV_API -int I010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert H010 to AB30. -LIBYUV_API -int H010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert U010 to AR30. -LIBYUV_API -int U010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert U010 to AB30. -LIBYUV_API -int U010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert I210 to AR30. -LIBYUV_API -int I210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert I210 to AB30. -LIBYUV_API -int I210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert H210 to AR30. -LIBYUV_API -int H210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert H210 to AB30. -LIBYUV_API -int H210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert U210 to AR30. -LIBYUV_API -int U210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert U210 to AB30. -LIBYUV_API -int U210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// BGRA little endian (argb in memory) to ARGB. -LIBYUV_API -int BGRAToARGB(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// ABGR little endian (rgba in memory) to ARGB. -LIBYUV_API -int ABGRToARGB(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// RGBA little endian (abgr in memory) to ARGB. -LIBYUV_API -int RGBAToARGB(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Deprecated function name. -#define BG24ToARGB RGB24ToARGB - -// RGB little endian (bgr in memory) to ARGB. 
-LIBYUV_API -int RGB24ToARGB(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// RGB big endian (rgb in memory) to ARGB. -LIBYUV_API -int RAWToARGB(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// RGB big endian (rgb in memory) to RGBA. -LIBYUV_API -int RAWToRGBA(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height); - -// RGB16 (RGBP fourcc) little endian to ARGB. -LIBYUV_API -int RGB565ToARGB(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// RGB15 (RGBO fourcc) little endian to ARGB. -LIBYUV_API -int ARGB1555ToARGB(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// RGB12 (R444 fourcc) little endian to ARGB. -LIBYUV_API -int ARGB4444ToARGB(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Aliases -#define AB30ToARGB AR30ToABGR -#define AB30ToABGR AR30ToARGB -#define AB30ToAR30 AR30ToAB30 - -// Convert AR30 To ARGB. -LIBYUV_API -int AR30ToARGB(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert AR30 To ABGR. -LIBYUV_API -int AR30ToABGR(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert AR30 To AB30. -LIBYUV_API -int AR30ToAB30(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert AR64 to ARGB. -LIBYUV_API -int AR64ToARGB(const uint16_t* src_ar64, - int src_stride_ar64, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert AB64 to ABGR. -#define AB64ToABGR AR64ToARGB - -// Convert AB64 to ARGB. -LIBYUV_API -int AB64ToARGB(const uint16_t* src_ab64, - int src_stride_ab64, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert AR64 to ABGR. -#define AR64ToABGR AB64ToARGB - -// Convert AR64 To AB64. -LIBYUV_API -int AR64ToAB64(const uint16_t* src_ar64, - int src_stride_ar64, - uint16_t* dst_ab64, - int dst_stride_ab64, - int width, - int height); - -// Convert AB64 To AR64. -#define AB64ToAR64 AR64ToAB64 - -// src_width/height provided by capture -// dst_width/height for clipping determine final size. -LIBYUV_API -int MJPGToARGB(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_argb, - int dst_stride_argb, - int src_width, - int src_height, - int dst_width, - int dst_height); - -// Convert Android420 to ARGB. -LIBYUV_API -int Android420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert Android420 to ABGR. -LIBYUV_API -int Android420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert NV12 to RGB565. 
-LIBYUV_API -int NV12ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -// Convert I422 to BGRA. -LIBYUV_API -int I422ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height); - -// Convert I422 to ABGR. -LIBYUV_API -int I422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I422 to RGBA. -LIBYUV_API -int I422ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height); - -LIBYUV_API -int I420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -LIBYUV_API -int I420ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height); - -LIBYUV_API -int I420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -LIBYUV_API -int I420ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height); - -LIBYUV_API -int I420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -LIBYUV_API -int I420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -LIBYUV_API -int H420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -LIBYUV_API -int H420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -LIBYUV_API -int J420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -LIBYUV_API -int J420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -LIBYUV_API -int I420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -LIBYUV_API -int 
J420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -LIBYUV_API -int H420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -LIBYUV_API -int I422ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). -// Values in dither matrix from 0 to 7 recommended. -// The order of the dither matrix is first byte is upper left. - -LIBYUV_API -int I420ToRGB565Dither(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const uint8_t* dither4x4, - int width, - int height); - -LIBYUV_API -int I420ToARGB1555(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height); - -LIBYUV_API -int I420ToARGB4444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb4444, - int dst_stride_argb4444, - int width, - int height); - -// Convert I420 to AR30. -LIBYUV_API -int I420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert I420 to AB30. -LIBYUV_API -int I420ToAB30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert H420 to AR30. -LIBYUV_API -int H420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert H420 to AB30. -LIBYUV_API -int H420ToAB30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert I420 to ARGB with matrix. -LIBYUV_API -int I420ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I422 to ARGB with matrix. -LIBYUV_API -int I422ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I444 to ARGB with matrix. 
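A minimal sketch of the I420ToRGB565Dither entry point documented above; the helper name and the 0..7 ordered-dither table are illustrative values, not taken from the header:

#include <stdint.h>
#include "libyuv/convert_argb.h"

/* One possible 4x4 ordered-dither table, values kept in the recommended
 * 0..7 range; the first byte is the upper-left cell. */
static const uint8_t kDither4x4[16] = {
    0, 4, 1, 5,
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
};

int I420FrameToRgb565Dithered(const uint8_t* y, int y_stride,
                              const uint8_t* u, int u_stride,
                              const uint8_t* v, int v_stride,
                              uint8_t* rgb565, int rgb565_stride,
                              int width, int height) {
  return I420ToRGB565Dither(y, y_stride, u, u_stride, v, v_stride,
                            rgb565, rgb565_stride, kDither4x4,
                            width, height);
}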
-LIBYUV_API -int I444ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit 420 YUV to ARGB with matrix. -LIBYUV_API -int I010ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit 420 YUV to ARGB with matrix. -LIBYUV_API -int I210ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit 444 YUV to ARGB with matrix. -LIBYUV_API -int I410ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit YUV to ARGB with matrix. -LIBYUV_API -int I010ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// multiply 12 bit yuv into high bits to allow any number of bits. -LIBYUV_API -int I012ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 12 bit YUV to ARGB with matrix. -LIBYUV_API -int I012ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit 422 YUV to ARGB with matrix. -LIBYUV_API -int I210ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit 444 YUV to ARGB with matrix. -LIBYUV_API -int I410ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert P010 to ARGB with matrix. -LIBYUV_API -int P010ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert P210 to ARGB with matrix. -LIBYUV_API -int P210ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert P010 to AR30 with matrix. 
-LIBYUV_API -int P010ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert P210 to AR30 with matrix. -LIBYUV_API -int P210ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// P012 and P010 use most significant bits so the conversion is the same. -// Convert P012 to ARGB with matrix. -#define P012ToARGBMatrix P010ToARGBMatrix -// Convert P012 to AR30 with matrix. -#define P012ToAR30Matrix P010ToAR30Matrix -// Convert P212 to ARGB with matrix. -#define P212ToARGBMatrix P210ToARGBMatrix -// Convert P212 to AR30 with matrix. -#define P212ToAR30Matrix P210ToAR30Matrix - -// Convert P016 to ARGB with matrix. -#define P016ToARGBMatrix P010ToARGBMatrix -// Convert P016 to AR30 with matrix. -#define P016ToAR30Matrix P010ToAR30Matrix -// Convert P216 to ARGB with matrix. -#define P216ToARGBMatrix P210ToARGBMatrix -// Convert P216 to AR30 with matrix. -#define P216ToAR30Matrix P210ToAR30Matrix - -// Convert I420 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I420AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert I422 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I422AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert I444 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I444AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert I010 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I010AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert I210 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I210AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert I410 with Alpha to preattenuated ARGB with matrix. 
-LIBYUV_API -int I410AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert NV12 to ARGB with matrix. -LIBYUV_API -int NV12ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert NV21 to ARGB with matrix. -LIBYUV_API -int NV21ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert NV12 to RGB565 with matrix. -LIBYUV_API -int NV12ToRGB565Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert NV12 to RGB24 with matrix. -LIBYUV_API -int NV12ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert NV21 to RGB24 with matrix. -LIBYUV_API -int NV21ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert Android420 to ARGB with matrix. -LIBYUV_API -int Android420ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I422 to RGBA with matrix. -LIBYUV_API -int I422ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I422 to RGBA with matrix. -LIBYUV_API -int I420ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I420 to RGB24 with matrix. -LIBYUV_API -int I420ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I420 to RGB565 with specified color matrix. -LIBYUV_API -int I420ToRGB565Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I420 to AR30 with matrix. 
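The Matrix variants above take one of the exported YuvConstants tables explicitly; as a sketch (helper name is illustrative), a BT.709 limited-range I420 frame can be converted to RGB24 by passing kYuvH709Constants:

#include <stdint.h>
#include "libyuv/convert_argb.h"

int I420Bt709ToRgb24(const uint8_t* y, int y_stride,
                     const uint8_t* u, int u_stride,
                     const uint8_t* v, int v_stride,
                     uint8_t* rgb24, int rgb24_stride,
                     int width, int height) {
  return I420ToRGB24Matrix(y, y_stride, u, u_stride, v, v_stride,
                           rgb24, rgb24_stride,
                           &kYuvH709Constants,  /* BT.709 coefficients. */
                           width, height);
}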
-LIBYUV_API -int I420ToAR30Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I400 (grey) to ARGB. Reverse of ARGBToI400. -LIBYUV_API -int I400ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert camera sample to ARGB with cropping, rotation and vertical flip. -// "sample_size" is needed to parse MJPG. -// "dst_stride_argb" number of bytes in a row of the dst_argb plane. -// Normally this would be the same as dst_width, with recommended alignment -// to 16 bytes for better efficiency. -// If rotation of 90 or 270 is used, stride is affected. The caller should -// allocate the I420 buffer according to rotation. -// "dst_stride_u" number of bytes in a row of the dst_u plane. -// Normally this would be the same as (dst_width + 1) / 2, with -// recommended alignment to 16 bytes for better efficiency. -// If rotation of 90 or 270 is used, stride is affected. -// "crop_x" and "crop_y" are starting position for cropping. -// To center, crop_x = (src_width - dst_width) / 2 -// crop_y = (src_height - dst_height) / 2 -// "src_width" / "src_height" is size of src_frame in pixels. -// "src_height" can be negative indicating a vertically flipped image source. -// "crop_width" / "crop_height" is the size to crop the src to. -// Must be less than or equal to src_width/src_height -// Cropping parameters are pre-rotation. -// "rotation" can be 0, 90, 180 or 270. -// "fourcc" is a fourcc. ie 'I420', 'YUY2' -// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. -LIBYUV_API -int ConvertToARGB(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_argb, - int dst_stride_argb, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ diff --git a/thirdparty/libyuv/include/libyuv/convert_from.h b/thirdparty/libyuv/include/libyuv/convert_from.h deleted file mode 100644 index 32f42a6..0000000 --- a/thirdparty/libyuv/include/libyuv/convert_from.h +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ -#define INCLUDE_LIBYUV_CONVERT_FROM_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/rotate.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// See Also convert.h for conversions from formats to I420. - -// Convert 8 bit YUV to 10 bit. -#define H420ToH010 I420ToI010 -LIBYUV_API -int I420ToI010(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert 8 bit YUV to 12 bit. 
-#define H420ToH012 I420ToI012 -LIBYUV_API -int I420ToI012(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -LIBYUV_API -int I420ToI422(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -LIBYUV_API -int I420ToI444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. -LIBYUV_API -int I400Copy(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -LIBYUV_API -int I420ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -LIBYUV_API -int I420ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -LIBYUV_API -int I420ToYUY2(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height); - -LIBYUV_API -int I420ToUYVY(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height); - -// The following are from convert_argb.h -// DEPRECATED: The prototypes will be removed in future. Use convert_argb.h - -// Convert I420 to ARGB. -LIBYUV_API -int I420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I420 to ABGR. -LIBYUV_API -int I420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I420 to specified format. -// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the -// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. 
-LIBYUV_API -int ConvertFromI420(const uint8_t* y, - int y_stride, - const uint8_t* u, - int u_stride, - const uint8_t* v, - int v_stride, - uint8_t* dst_sample, - int dst_sample_stride, - int width, - int height, - uint32_t fourcc); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ diff --git a/thirdparty/libyuv/include/libyuv/convert_from_argb.h b/thirdparty/libyuv/include/libyuv/convert_from_argb.h deleted file mode 100644 index bf48786..0000000 --- a/thirdparty/libyuv/include/libyuv/convert_from_argb.h +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ -#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Copy ARGB to ARGB. -#define ARGBToARGB ARGBCopy -LIBYUV_API -int ARGBCopy(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert ARGB To BGRA. -LIBYUV_API -int ARGBToBGRA(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height); - -// Convert ARGB To ABGR. -LIBYUV_API -int ARGBToABGR(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert ARGB To RGBA. -LIBYUV_API -int ARGBToRGBA(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height); - -// Aliases -#define ARGBToAB30 ABGRToAR30 -#define ABGRToAB30 ARGBToAR30 - -// Convert ABGR To AR30. -LIBYUV_API -int ABGRToAR30(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert ARGB To AR30. -LIBYUV_API -int ARGBToAR30(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Aliases -#define ABGRToRGB24 ARGBToRAW -#define ABGRToRAW ARGBToRGB24 - -// Convert ARGB To RGB24. -LIBYUV_API -int ARGBToRGB24(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -// Convert ARGB To RAW. -LIBYUV_API -int ARGBToRAW(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -// Convert ARGB To RGB565. -LIBYUV_API -int ARGBToRGB565(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). -// Values in dither matrix from 0 to 7 recommended. -// The order of the dither matrix is first byte is upper left. -// TODO(fbarchard): Consider pointer to 2d array for dither4x4. -// const uint8_t(*dither)[4][4]; -LIBYUV_API -int ARGBToRGB565Dither(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const uint8_t* dither4x4, - int width, - int height); - -// Convert ARGB To ARGB1555. 
-LIBYUV_API -int ARGBToARGB1555(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height); - -// Convert ARGB To ARGB4444. -LIBYUV_API -int ARGBToARGB4444(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb4444, - int dst_stride_argb4444, - int width, - int height); - -// Convert ARGB To I444. -LIBYUV_API -int ARGBToI444(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert ARGB to AR64. -LIBYUV_API -int ARGBToAR64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height); - -// Convert ABGR to AB64. -#define ABGRToAB64 ARGBToAR64 - -// Convert ARGB to AB64. -LIBYUV_API -int ARGBToAB64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ab64, - int dst_stride_ab64, - int width, - int height); - -// Convert ABGR to AR64. -#define ABGRToAR64 ARGBToAB64 - -// Convert ARGB To I422. -LIBYUV_API -int ARGBToI422(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert ARGB To I420. (also in convert.h) -LIBYUV_API -int ARGBToI420(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert ARGB to J420. (JPeg full range I420). -LIBYUV_API -int ARGBToJ420(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert ARGB to J422. -LIBYUV_API -int ARGBToJ422(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert ARGB to J400. (JPeg full range). -LIBYUV_API -int ARGBToJ400(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height); - -// Convert RGBA to J400. (JPeg full range). -LIBYUV_API -int RGBAToJ400(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height); - -// Convert ARGB to I400. -LIBYUV_API -int ARGBToI400(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB) -LIBYUV_API -int ARGBToG(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_g, - int dst_stride_g, - int width, - int height); - -// Convert ARGB To NV12. -LIBYUV_API -int ARGBToNV12(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert ARGB To NV21. -LIBYUV_API -int ARGBToNV21(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Convert ABGR To NV12. -LIBYUV_API -int ABGRToNV12(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert ABGR To NV21. 
-LIBYUV_API -int ABGRToNV21(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Convert ARGB To YUY2. -LIBYUV_API -int ARGBToYUY2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height); - -// Convert ARGB To UYVY. -LIBYUV_API -int ARGBToUYVY(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ diff --git a/thirdparty/libyuv/include/libyuv/cpu_id.h b/thirdparty/libyuv/include/libyuv/cpu_id.h deleted file mode 100644 index 3e27cc1..0000000 --- a/thirdparty/libyuv/include/libyuv/cpu_id.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CPU_ID_H_ -#define INCLUDE_LIBYUV_CPU_ID_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Internal flag to indicate cpuid requires initialization. -static const int kCpuInitialized = 0x1; - -// These flags are only valid on ARM processors. -static const int kCpuHasARM = 0x2; -static const int kCpuHasNEON = 0x4; -// 0x8 reserved for future ARM flag. - -// These flags are only valid on x86 processors. -static const int kCpuHasX86 = 0x10; -static const int kCpuHasSSE2 = 0x20; -static const int kCpuHasSSSE3 = 0x40; -static const int kCpuHasSSE41 = 0x80; -static const int kCpuHasSSE42 = 0x100; // unused at this time. -static const int kCpuHasAVX = 0x200; -static const int kCpuHasAVX2 = 0x400; -static const int kCpuHasERMS = 0x800; -static const int kCpuHasFMA3 = 0x1000; -static const int kCpuHasF16C = 0x2000; -static const int kCpuHasGFNI = 0x4000; -static const int kCpuHasAVX512BW = 0x8000; -static const int kCpuHasAVX512VL = 0x10000; -static const int kCpuHasAVX512VBMI = 0x20000; -static const int kCpuHasAVX512VBMI2 = 0x40000; -static const int kCpuHasAVX512VBITALG = 0x80000; -static const int kCpuHasAVX512VPOPCNTDQ = 0x100000; - -// These flags are only valid on MIPS processors. -static const int kCpuHasMIPS = 0x200000; -static const int kCpuHasMSA = 0x400000; -static const int kCpuHasMMI = 0x800000; - -// Optional init function. TestCpuFlag does an auto-init. -// Returns cpu_info flags. -LIBYUV_API -int InitCpuFlags(void); - -// Detect CPU has SSE2 etc. -// Test_flag parameter should be one of kCpuHas constants above. -// Returns non-zero if instruction set is detected -static __inline int TestCpuFlag(int test_flag) { - LIBYUV_API extern int cpu_info_; -#ifdef __ATOMIC_RELAXED - int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED); -#else - int cpu_info = cpu_info_; -#endif - return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag; -} - -// Internal function for parsing /proc/cpuinfo. -LIBYUV_API -int ArmCpuCaps(const char* cpuinfo_name); -LIBYUV_API -int MipsCpuCaps(const char* cpuinfo_name); - -// For testing, allow CPU flags to be disabled. -// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. 
-// MaskCpuFlags(-1) to enable all cpu specific optimizations. -// MaskCpuFlags(1) to disable all cpu specific optimizations. -// MaskCpuFlags(0) to reset state so next call will auto init. -// Returns cpu_info flags. -LIBYUV_API -int MaskCpuFlags(int enable_flags); - -// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags| -// should be a valid combination of the kCpuHas constants above and include -// kCpuInitialized. Use this method when running in a sandboxed process where -// the detection code might fail (as it might access /proc/cpuinfo). In such -// cases the cpu_info can be obtained from a non sandboxed process by calling -// InitCpuFlags() and passed to the sandboxed process (via command line -// parameters, IPC...) which can then call this method to initialize the CPU -// flags. -// Notes: -// - when specifying 0 for |cpu_flags|, the auto initialization is enabled -// again. -// - enabling CPU features that are not supported by the CPU will result in -// undefined behavior. -// TODO(fbarchard): consider writing a helper function that translates from -// other library CPU info to libyuv CPU info and add a .md doc that explains -// CPU detection. -static __inline void SetCpuFlags(int cpu_flags) { - LIBYUV_API extern int cpu_info_; -#ifdef __ATOMIC_RELAXED - __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED); -#else - cpu_info_ = cpu_flags; -#endif -} - -// Low level cpuid for X86. Returns zeros on other CPUs. -// eax is the info type that you want. -// ecx is typically the cpu number, and should normally be zero. -LIBYUV_API -void CpuId(int info_eax, int info_ecx, int* cpu_info); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/thirdparty/libyuv/include/libyuv/macros_msa.h b/thirdparty/libyuv/include/libyuv/macros_msa.h deleted file mode 100644 index 4e232b6..0000000 --- a/thirdparty/libyuv/include/libyuv/macros_msa.h +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */
-
-#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
-#define INCLUDE_LIBYUV_MACROS_MSA_H_
-
-#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
-#include <msa.h>
-#include <stdint.h>
-
-#if (__mips_isa_rev >= 6)
-#define LW(psrc) \
- ({ \
- const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
- uint32_t val_m; \
- asm volatile("lw %[val_m], %[psrc_lw_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_lw_m] "m"(*psrc_lw_m)); \
- val_m; \
- })
-
-#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
- uint64_t val_m = 0; \
- asm volatile("ld %[val_m], %[psrc_ld_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_ld_m] "m"(*psrc_ld_m)); \
- val_m; \
- })
-#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- val0_m = LW(psrc_ld_m); \
- val1_m = LW(psrc_ld_m + 4); \
- val_m = (uint64_t)(val1_m); /* NOLINT */ \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \
- val_m; \
- })
-#endif // (__mips == 64)
-
-#define SW(val, pdst) \
- ({ \
- uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
- uint32_t val_m = (val); \
- asm volatile("sw %[val_m], %[pdst_sw_m] \n" \
- : [pdst_sw_m] "=m"(*pdst_sw_m) \
- : [val_m] "r"(val_m)); \
- })
-
-#if (__mips == 64)
-#define SD(val, pdst) \
- ({ \
- uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
- uint64_t val_m = (val); \
- asm volatile("sd %[val_m], %[pdst_sd_m] \n" \
- : [pdst_sd_m] "=m"(*pdst_sd_m) \
- : [val_m] "r"(val_m)); \
- })
-#else // !(__mips == 64)
-#define SD(val, pdst) \
- ({ \
- uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
- uint32_t val0_m, val1_m; \
- val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
- val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
- SW(val0_m, pdst_sd_m); \
- SW(val1_m, pdst_sd_m + 4); \
- })
-#endif // !(__mips == 64)
-#else // !(__mips_isa_rev >= 6)
-#define LW(psrc) \
- ({ \
- const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
- uint32_t val_m; \
- asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_lw_m] "m"(*psrc_lw_m)); \
- val_m; \
- })
-
-#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
- uint64_t val_m = 0; \
- asm volatile("uld %[val_m], %[psrc_ld_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_ld_m] "m"(*psrc_ld_m)); \
- val_m; \
- })
-#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- val0_m = LW(psrc_ld_m); \
- val1_m = LW(psrc_ld_m + 4); \
- val_m = (uint64_t)(val1_m); /* NOLINT */ \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \
- val_m; \
- })
-#endif // (__mips == 64)
-
-#define SW(val, pdst) \
- ({ \
- uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
- uint32_t val_m = (val); \
- asm volatile("usw %[val_m], %[pdst_sw_m] \n" \
- : [pdst_sw_m] "=m"(*pdst_sw_m) \
- : [val_m] "r"(val_m)); \
- })
-
-#define SD(val, pdst) \
- ({ \
- uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
- uint32_t val0_m, val1_m; \
- val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
- val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
- SW(val0_m, pdst_sd_m); \
- SW(val1_m, pdst_sd_m + 4); \
- })
-#endif // (__mips_isa_rev >= 6)
-
-// TODO(fbarchard): Consider removing __VAR_ARGS versions.
-#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ -#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__) - -#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ -#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__) - -#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ -#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) - -#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ -#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) - -/* Description : Load two vectors with 16 'byte' sized elements - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Load 16 byte elements in 'out0' from (psrc) - Load 16 byte elements in 'out1' from (psrc + stride) -*/ -#define LD_B2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ - } -#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__) - -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ - } -#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__) - -/* Description : Store two vectors with stride each having 16 'byte' sized - elements - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 16 byte elements from 'in0' to (pdst) - Store 16 byte elements from 'in1' to (pdst + stride) -*/ -#define ST_B2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ - } -#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) - -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) - -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) -*/ -#define ST_H2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ - } -#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) - -// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. -/* Description : Shuffle byte vector elements as per mask vector - Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'in0' & 'in1' are copied selectively to - 'out0' as per control vector 'mask0' -*/ -#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ - } -#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) - -/* Description : Interleave both left and right half of input vectors - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of byte elements from 'in0' and 'in1' are - interleaved and written to 'out0' -*/ -#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ - } -#define ILVRL_B2_UB(...) 
ILVRL_B2(v16u8, __VA_ARGS__) - -#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ - -#endif // INCLUDE_LIBYUV_MACROS_MSA_H_ diff --git a/thirdparty/libyuv/include/libyuv/mjpeg_decoder.h b/thirdparty/libyuv/include/libyuv/mjpeg_decoder.h deleted file mode 100644 index 275f8d4..0000000 --- a/thirdparty/libyuv/include/libyuv/mjpeg_decoder.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ -#define INCLUDE_LIBYUV_MJPEG_DECODER_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -// NOTE: For a simplified public API use convert.h MJPGToI420(). - -struct jpeg_common_struct; -struct jpeg_decompress_struct; -struct jpeg_source_mgr; - -namespace libyuv { - -#ifdef __cplusplus -extern "C" { -#endif - -LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size); - -#ifdef __cplusplus -} // extern "C" -#endif - -static const uint32_t kUnknownDataSize = 0xFFFFFFFF; - -enum JpegSubsamplingType { - kJpegYuv420, - kJpegYuv422, - kJpegYuv444, - kJpegYuv400, - kJpegUnknown -}; - -struct Buffer { - const uint8_t* data; - int len; -}; - -struct BufferVector { - Buffer* buffers; - int len; - int pos; -}; - -struct SetJmpErrorMgr; - -// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are -// simply independent JPEG images with a fixed huffman table (which is omitted). -// It is rarely used in video transmission, but is common as a camera capture -// format, especially in Logitech devices. This class implements a decoder for -// MJPEG frames. -// -// See http://tools.ietf.org/html/rfc2435 -class LIBYUV_API MJpegDecoder { - public: - typedef void (*CallbackFunction)(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows); - - static const int kColorSpaceUnknown; - static const int kColorSpaceGrayscale; - static const int kColorSpaceRgb; - static const int kColorSpaceYCbCr; - static const int kColorSpaceCMYK; - static const int kColorSpaceYCCK; - - MJpegDecoder(); - ~MJpegDecoder(); - - // Loads a new frame, reads its headers, and determines the uncompressed - // image format. - // Returns LIBYUV_TRUE if image looks valid and format is supported. - // If return value is LIBYUV_TRUE, then the values for all the following - // getters are populated. - // src_len is the size of the compressed mjpeg frame in bytes. - LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len); - - // Returns width of the last loaded frame in pixels. - int GetWidth(); - - // Returns height of the last loaded frame in pixels. - int GetHeight(); - - // Returns format of the last loaded frame. The return value is one of the - // kColorSpace* constants. - int GetColorSpace(); - - // Number of color components in the color space. - int GetNumComponents(); - - // Sample factors of the n-th component. - int GetHorizSampFactor(int component); - - int GetVertSampFactor(int component); - - int GetHorizSubSampFactor(int component); - - int GetVertSubSampFactor(int component); - - // Public for testability. - int GetImageScanlinesPerImcuRow(); - - // Public for testability. 
- int GetComponentScanlinesPerImcuRow(int component); - - // Width of a component in bytes. - int GetComponentWidth(int component); - - // Height of a component. - int GetComponentHeight(int component); - - // Width of a component in bytes with padding for DCTSIZE. Public for testing. - int GetComponentStride(int component); - - // Size of a component in bytes. - int GetComponentSize(int component); - - // Call this after LoadFrame() if you decide you don't want to decode it - // after all. - LIBYUV_BOOL UnloadFrame(); - - // Decodes the entire image into a one-buffer-per-color-component format. - // dst_width must match exactly. dst_height must be <= to image height; if - // less, the image is cropped. "planes" must have size equal to at least - // GetNumComponents() and they must point to non-overlapping buffers of size - // at least GetComponentSize(i). The pointers in planes are incremented - // to point to after the end of the written data. - // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. - LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height); - - // Decodes the entire image and passes the data via repeated calls to a - // callback function. Each call will get the data for a whole number of - // image scanlines. - // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. - LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, - void* opaque, - int dst_width, - int dst_height); - - // The helper function which recognizes the jpeg sub-sampling type. - static JpegSubsamplingType JpegSubsamplingTypeHelper( - int* subsample_x, - int* subsample_y, - int number_of_components); - - private: - void AllocOutputBuffers(int num_outbufs); - void DestroyOutputBuffers(); - - LIBYUV_BOOL StartDecode(); - LIBYUV_BOOL FinishDecode(); - - void SetScanlinePointers(uint8_t** data); - LIBYUV_BOOL DecodeImcuRow(); - - int GetComponentScanlinePadding(int component); - - // A buffer holding the input data for a frame. - Buffer buf_; - BufferVector buf_vec_; - - jpeg_decompress_struct* decompress_struct_; - jpeg_source_mgr* source_mgr_; - SetJmpErrorMgr* error_mgr_; - - // LIBYUV_TRUE iff at least one component has scanline padding. (i.e., - // GetComponentScanlinePadding() != 0.) - LIBYUV_BOOL has_scanline_padding_; - - // Temporaries used to point to scanline outputs. - int num_outbufs_; // Outermost size of all arrays below. - uint8_t*** scanlines_; - int* scanlines_sizes_; - // Temporary buffer used for decoding when we can't decode directly to the - // output buffers. Large enough for just one iMCU row. - uint8_t** databuf_; - int* databuf_strides_; -}; - -} // namespace libyuv - -#endif // __cplusplus -#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ diff --git a/thirdparty/libyuv/include/libyuv/planar_functions.h b/thirdparty/libyuv/include/libyuv/planar_functions.h deleted file mode 100644 index fdecdee..0000000 --- a/thirdparty/libyuv/include/libyuv/planar_functions.h +++ /dev/null @@ -1,1055 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ -#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ - -#include "libyuv/basic_types.h" - -// TODO(fbarchard): Remove the following headers includes. -#include "libyuv/convert.h" -#include "libyuv/convert_argb.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// TODO(fbarchard): Move cpu macros to row.h -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#define HAS_ARGBAFFINEROW_SSE2 -#endif - -// Copy a plane of data. -LIBYUV_API -void CopyPlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -LIBYUV_API -void CopyPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height); - -LIBYUV_API -void Convert16To8Plane(const uint16_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int scale, // 16384 for 10 bits - int width, - int height); - -LIBYUV_API -void Convert8To16Plane(const uint8_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int scale, // 1024 for 10 bits - int width, - int height); - -// Set a plane of data to a 32 bit value. -LIBYUV_API -void SetPlane(uint8_t* dst_y, - int dst_stride_y, - int width, - int height, - uint32_t value); - -// Split interleaved UV plane into separate U and V planes. -LIBYUV_API -void SplitUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Merge separate U and V planes into one interleaved UV plane. -LIBYUV_API -void MergeUVPlane(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Split interleaved msb UV plane into separate lsb U and V planes. -LIBYUV_API -void SplitUVPlane_16(const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height, - int depth); - -// Merge separate lsb U and V planes into one interleaved msb UV plane. -LIBYUV_API -void MergeUVPlane_16(const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height, - int depth); - -// Convert lsb plane to msb plane -LIBYUV_API -void ConvertToMSBPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height, - int depth); - -// Convert msb plane to lsb plane -LIBYUV_API -void ConvertToLSBPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height, - int depth); - -// Scale U and V to half width and height and merge into interleaved UV plane. -// width and height are source size, allowing odd sizes. -// Use for converting I444 or I422 to NV12. 
-LIBYUV_API -void HalfMergeUVPlane(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Swap U and V channels in interleaved UV plane. -LIBYUV_API -void SwapUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Split interleaved RGB plane into separate R, G and B planes. -LIBYUV_API -void SplitRGBPlane(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -// Merge separate R, G and B planes into one interleaved RGB plane. -LIBYUV_API -void MergeRGBPlane(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - uint8_t* dst_rgb, - int dst_stride_rgb, - int width, - int height); - -// Split interleaved ARGB plane into separate R, G, B and A planes. -// dst_a can be NULL to discard alpha plane. -LIBYUV_API -void SplitARGBPlane(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - uint8_t* dst_a, - int dst_stride_a, - int width, - int height); - -// Merge separate R, G, B and A planes into one interleaved ARGB plane. -// src_a can be NULL to fill opaque value to alpha. -LIBYUV_API -void MergeARGBPlane(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Merge separate 'depth' bit R, G and B planes stored in lsb -// into one interleaved XR30 plane. -// depth should in range [10, 16] -LIBYUV_API -void MergeXR30Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height, - int depth); - -// Merge separate 'depth' bit R, G, B and A planes stored in lsb -// into one interleaved AR64 plane. -// src_a can be NULL to fill opaque value to alpha. -// depth should in range [1, 16] -LIBYUV_API -void MergeAR64Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height, - int depth); - -// Merge separate 'depth' bit R, G, B and A planes stored in lsb -// into one interleaved ARGB plane. -// src_a can be NULL to fill opaque value to alpha. -// depth should in range [8, 16] -LIBYUV_API -void MergeARGB16To8Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int depth); - -// Copy I400. Supports inverting. -LIBYUV_API -int I400ToI400(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -#define J400ToJ400 I400ToI400 - -// Copy I422 to I422. 
-#define I422ToI422 I422Copy -LIBYUV_API -int I422Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Copy I444 to I444. -#define I444ToI444 I444Copy -LIBYUV_API -int I444Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Copy NV12. Supports inverting. -int NV12Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Copy NV21. Supports inverting. -int NV21Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Convert YUY2 to I422. -LIBYUV_API -int YUY2ToI422(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert UYVY to I422. -LIBYUV_API -int UYVYToI422(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -LIBYUV_API -int YUY2ToNV12(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -LIBYUV_API -int UYVYToNV12(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert NV21 to NV12. -LIBYUV_API -int NV21ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -LIBYUV_API -int YUY2ToY(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Convert I420 to I400. (calls CopyPlane ignoring u/v). -LIBYUV_API -int I420ToI400(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Alias -#define J420ToJ400 I420ToI400 -#define I420ToI420Mirror I420Mirror - -// I420 mirror. -LIBYUV_API -int I420Mirror(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Alias -#define I400ToI400Mirror I400Mirror - -// I400 mirror. A single plane is mirrored horizontally. -// Pass negative height to achieve 180 degree rotation. -LIBYUV_API -int I400Mirror(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Alias -#define NV12ToNV12Mirror NV12Mirror - -// NV12 mirror. 
-LIBYUV_API -int NV12Mirror(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Alias -#define ARGBToARGBMirror ARGBMirror - -// ARGB mirror. -LIBYUV_API -int ARGBMirror(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Alias -#define RGB24ToRGB24Mirror RGB24Mirror - -// RGB24 mirror. -LIBYUV_API -int RGB24Mirror(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -// Mirror a plane of data. -LIBYUV_API -void MirrorPlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Mirror a plane of UV data. -LIBYUV_API -void MirrorUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Alias -#define RGB24ToRAW RAWToRGB24 - -LIBYUV_API -int RAWToRGB24(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -// Draw a rectangle into I420. -LIBYUV_API -int I420Rect(uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int x, - int y, - int width, - int height, - int value_y, - int value_u, - int value_v); - -// Draw a rectangle into ARGB. -LIBYUV_API -int ARGBRect(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height, - uint32_t value); - -// Convert ARGB to gray scale ARGB. -LIBYUV_API -int ARGBGrayTo(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Make a rectangle of ARGB gray scale. -LIBYUV_API -int ARGBGray(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height); - -// Make a rectangle of ARGB Sepia tone. -LIBYUV_API -int ARGBSepia(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height); - -// Apply a matrix rotation to each ARGB pixel. -// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. -// The first 4 coefficients apply to B, G, R, A and produce B of the output. -// The next 4 coefficients apply to B, G, R, A and produce G of the output. -// The next 4 coefficients apply to B, G, R, A and produce R of the output. -// The last 4 coefficients apply to B, G, R, A and produce A of the output. -LIBYUV_API -int ARGBColorMatrix(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - const int8_t* matrix_argb, - int width, - int height); - -// Deprecated. Use ARGBColorMatrix instead. -// Apply a matrix rotation to each ARGB pixel. -// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. -// The first 4 coefficients apply to B, G, R, A and produce B of the output. -// The next 4 coefficients apply to B, G, R, A and produce G of the output. -// The last 4 coefficients apply to B, G, R, A and produce R of the output. -LIBYUV_API -int RGBColorMatrix(uint8_t* dst_argb, - int dst_stride_argb, - const int8_t* matrix_rgb, - int dst_x, - int dst_y, - int width, - int height); - -// Apply a color table each ARGB pixel. -// Table contains 256 ARGB values. 
-LIBYUV_API
-int ARGBColorTable(uint8_t* dst_argb,
- int dst_stride_argb,
- const uint8_t* table_argb,
- int dst_x,
- int dst_y,
- int width,
- int height);
-
-// Apply a color table to each ARGB pixel but preserve destination alpha.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int RGBColorTable(uint8_t* dst_argb,
- int dst_stride_argb,
- const uint8_t* table_argb,
- int dst_x,
- int dst_y,
- int width,
- int height);
-
-// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
-// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
-// RGB (YJ style) and C is an 8 bit color component (R, G or B).
-LIBYUV_API
-int ARGBLumaColorTable(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const uint8_t* luma,
- int width,
- int height);
-
-// Apply a 3 term polynomial to ARGB values.
-// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
-// coefficients for b, g, r and a. The 3rd row is coefficients for b squared,
-// g squared, r squared and a squared. The 4th row is coefficients for b to
-// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and
-// result clamped to 0 to 255.
-// A polynomial approximation can be derived using software such as 'R'.
-
-LIBYUV_API
-int ARGBPolynomial(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const float* poly,
- int width,
- int height);
-
-// Convert plane of 16 bit shorts to half floats.
-// Source values are multiplied by scale before storing as half float.
-LIBYUV_API
-int HalfFloatPlane(const uint16_t* src_y,
- int src_stride_y,
- uint16_t* dst_y,
- int dst_stride_y,
- float scale,
- int width,
- int height);
-
-// Convert a buffer of bytes to floats, scale the values and store as floats.
-LIBYUV_API
-int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
-
-// Quantize a rectangle of ARGB. Alpha unaffected.
-// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
-// interval_size should be a value between 1 and 255.
-// interval_offset should be a value between 0 and 255.
-LIBYUV_API
-int ARGBQuantize(uint8_t* dst_argb,
- int dst_stride_argb,
- int scale,
- int interval_size,
- int interval_offset,
- int dst_x,
- int dst_y,
- int width,
- int height);
-
-// Copy ARGB to ARGB.
-LIBYUV_API
-int ARGBCopy(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Copy Alpha channel of ARGB to alpha of ARGB.
-LIBYUV_API
-int ARGBCopyAlpha(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Extract the alpha channel from ARGB.
-LIBYUV_API
-int ARGBExtractAlpha(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_a,
- int dst_stride_a,
- int width,
- int height);
-
-// Copy Y channel to Alpha of ARGB.
-LIBYUV_API
-int ARGBCopyYToAlpha(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-typedef void (*ARGBBlendRow)(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width);
-
-// Get function to Alpha Blend ARGB pixels and store to destination.
-LIBYUV_API
-ARGBBlendRow GetARGBBlend();
-
-// Alpha Blend ARGB images and store to destination.
-// Source is pre-multiplied by alpha using ARGBAttenuate.
-// Alpha of destination is set to 255.
-LIBYUV_API -int ARGBBlend(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Alpha Blend plane and store to destination. -// Source is not pre-multiplied by alpha. -LIBYUV_API -int BlendPlane(const uint8_t* src_y0, - int src_stride_y0, - const uint8_t* src_y1, - int src_stride_y1, - const uint8_t* alpha, - int alpha_stride, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Alpha Blend YUV images and store to destination. -// Source is not pre-multiplied by alpha. -// Alpha is full width x height and subsampled to half size to apply to UV. -LIBYUV_API -int I420Blend(const uint8_t* src_y0, - int src_stride_y0, - const uint8_t* src_u0, - int src_stride_u0, - const uint8_t* src_v0, - int src_stride_v0, - const uint8_t* src_y1, - int src_stride_y1, - const uint8_t* src_u1, - int src_stride_u1, - const uint8_t* src_v1, - int src_stride_v1, - const uint8_t* alpha, - int alpha_stride, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. -LIBYUV_API -int ARGBMultiply(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Add ARGB image with ARGB image. Saturates to 255. -LIBYUV_API -int ARGBAdd(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. -LIBYUV_API -int ARGBSubtract(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I422 to YUY2. -LIBYUV_API -int I422ToYUY2(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height); - -// Convert I422 to UYVY. -LIBYUV_API -int I422ToUYVY(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height); - -// Convert unattentuated ARGB to preattenuated ARGB. -LIBYUV_API -int ARGBAttenuate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert preattentuated ARGB to unattenuated ARGB. -LIBYUV_API -int ARGBUnattenuate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Internal function - do not call directly. -// Computes table of cumulative sum for image where the value is the sum -// of all values above and to the left of the entry. Used by ARGBBlur. -LIBYUV_API -int ARGBComputeCumulativeSum(const uint8_t* src_argb, - int src_stride_argb, - int32_t* dst_cumsum, - int dst_stride32_cumsum, - int width, - int height); - -// Blur ARGB image. -// dst_cumsum table of width * (height + 1) * 16 bytes aligned to -// 16 byte boundary. -// dst_stride32_cumsum is number of ints in a row (width * 4). -// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. 
-// Blur is optimized for radius of 5 (11x11) or less. -LIBYUV_API -int ARGBBlur(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int32_t* dst_cumsum, - int dst_stride32_cumsum, - int width, - int height, - int radius); - -// Gaussian 5x5 blur a float plane. -// Coefficients of 1, 4, 6, 4, 1. -// Each destination pixel is a blur of the 5x5 -// pixels from the source. -// Source edges are clamped. -LIBYUV_API -int GaussPlane_F32(const float* src, - int src_stride, - float* dst, - int dst_stride, - int width, - int height); - -// Multiply ARGB image by ARGB value. -LIBYUV_API -int ARGBShade(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - uint32_t value); - -// Interpolate between two images using specified amount of interpolation -// (0 to 255) and store to destination. -// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0 -// and 255 means 1% src0 and 99% src1. -LIBYUV_API -int InterpolatePlane(const uint8_t* src0, - int src_stride0, - const uint8_t* src1, - int src_stride1, - uint8_t* dst, - int dst_stride, - int width, - int height, - int interpolation); - -// Interpolate between two ARGB images using specified amount of interpolation -// Internally calls InterpolatePlane with width * 4 (bpp). -LIBYUV_API -int ARGBInterpolate(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int interpolation); - -// Interpolate between two YUV images using specified amount of interpolation -// Internally calls InterpolatePlane on each plane where the U and V planes -// are half width and half height. -LIBYUV_API -int I420Interpolate(const uint8_t* src0_y, - int src0_stride_y, - const uint8_t* src0_u, - int src0_stride_u, - const uint8_t* src0_v, - int src0_stride_v, - const uint8_t* src1_y, - int src1_stride_y, - const uint8_t* src1_u, - int src1_stride_u, - const uint8_t* src1_v, - int src1_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - int interpolation); - -// Row function for copying pixels from a source with a slope to a row -// of destination. Useful for scaling, rotation, mirror, texture mapping. -LIBYUV_API -void ARGBAffineRow_C(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width); -// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h -LIBYUV_API -void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width); - -// Shuffle ARGB channel order. e.g. BGRA to ARGB. -// shuffler is 16 bytes. -LIBYUV_API -int ARGBShuffle(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_argb, - int dst_stride_argb, - const uint8_t* shuffler, - int width, - int height); - -// Shuffle AR64 channel order. e.g. AR64 to AB64. -// shuffler is 16 bytes. -LIBYUV_API -int AR64Shuffle(const uint16_t* src_ar64, - int src_stride_ar64, - uint16_t* dst_ar64, - int dst_stride_ar64, - const uint8_t* shuffler, - int width, - int height); - -// Sobel ARGB effect with planar output. -LIBYUV_API -int ARGBSobelToPlane(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Sobel ARGB effect. 
-LIBYUV_API -int ARGBSobel(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. -LIBYUV_API -int ARGBSobelXY(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ diff --git a/thirdparty/libyuv/include/libyuv/rotate.h b/thirdparty/libyuv/include/libyuv/rotate.h deleted file mode 100644 index 3088822..0000000 --- a/thirdparty/libyuv/include/libyuv/rotate.h +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROTATE_H_ -#define INCLUDE_LIBYUV_ROTATE_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Supported rotation. -typedef enum RotationMode { - kRotate0 = 0, // No rotation. - kRotate90 = 90, // Rotate 90 degrees clockwise. - kRotate180 = 180, // Rotate 180 degrees. - kRotate270 = 270, // Rotate 270 degrees clockwise. - - // Deprecated. - kRotateNone = 0, - kRotateClockwise = 90, - kRotateCounterClockwise = 270, -} RotationModeEnum; - -// Rotate I420 frame. -LIBYUV_API -int I420Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode); - -// Rotate I444 frame. -LIBYUV_API -int I444Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode); - -// Rotate NV12 input and store in I420. -LIBYUV_API -int NV12ToI420Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode); - -// Rotate a plane by 0, 90, 180, or 270. -LIBYUV_API -int RotatePlane(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height, - enum RotationMode mode); - -// Rotate planes by 90, 180, 270. Deprecated. -LIBYUV_API -void RotatePlane90(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height); - -LIBYUV_API -void RotatePlane180(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height); - -LIBYUV_API -void RotatePlane270(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height); - -// Rotations for when U and V are interleaved. -// These functions take one input pointer and -// split the data into two buffers while -// rotating them. Deprecated. 
-LIBYUV_API -void RotateUV90(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -LIBYUV_API -void RotateUV180(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -LIBYUV_API -void RotateUV270(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -// The 90 and 270 functions are based on transposes. -// Doing a transpose with reversing the read/write -// order will result in a rotation by +- 90 degrees. -// Deprecated. -LIBYUV_API -void TransposePlane(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height); - -LIBYUV_API -void TransposeUV(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_ROTATE_H_ diff --git a/thirdparty/libyuv/include/libyuv/rotate_argb.h b/thirdparty/libyuv/include/libyuv/rotate_argb.h deleted file mode 100644 index 2043294..0000000 --- a/thirdparty/libyuv/include/libyuv/rotate_argb.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ -#define INCLUDE_LIBYUV_ROTATE_ARGB_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/rotate.h" // For RotationMode. - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Rotate ARGB frame -LIBYUV_API -int ARGBRotate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int src_width, - int src_height, - enum RotationMode mode); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ diff --git a/thirdparty/libyuv/include/libyuv/rotate_row.h b/thirdparty/libyuv/include/libyuv/rotate_row.h deleted file mode 100644 index 5a9cf93..0000000 --- a/thirdparty/libyuv/include/libyuv/rotate_row.h +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ -#define INCLUDE_LIBYUV_ROTATE_ROW_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// The following are available for Visual C 32 bit: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ - !defined(__clang__) -#define HAS_TRANSPOSEWX8_SSSE3 -#define HAS_TRANSPOSEUVWX8_SSE2 -#endif - -// The following are available for GCC 32 or 64 bit: -#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) -#define HAS_TRANSPOSEWX8_SSSE3 -#endif - -// The following are available for 64 bit GCC: -#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) -#define HAS_TRANSPOSEWX8_FAST_SSSE3 -#define HAS_TRANSPOSEUVWX8_SSE2 -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_TRANSPOSEWX8_NEON -#define HAS_TRANSPOSEUVWX8_NEON -#endif - -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#define HAS_TRANSPOSEWX16_MSA -#define HAS_TRANSPOSEUVWX16_MSA -#endif - -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) -#define HAS_TRANSPOSEWX8_MMI -#define HAS_TRANSPOSEUVWX8_MMI -#endif - -void TransposeWxH_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height); - -void TransposeWx8_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Fast_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx16_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); - -void TransposeWx8_Any_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Any_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Any_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx16_Any_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); - -void TransposeUVWxH_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -void TransposeUVWx8_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, 
- int dst_stride_b, - int width); -void TransposeUVWx16_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); - -void TransposeUVWx8_Any_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx8_Any_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx8_Any_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx16_Any_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ diff --git a/thirdparty/libyuv/include/libyuv/row.h b/thirdparty/libyuv/include/libyuv/row.h deleted file mode 100644 index 6c3f81e..0000000 --- a/thirdparty/libyuv/include/libyuv/row.h +++ /dev/null @@ -1,5274 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROW_H_ -#define INCLUDE_LIBYUV_ROW_H_ - -#include // For malloc. - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// clang >= 3.5.0 required for Arm64. -#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON) -#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5)) -#define LIBYUV_DISABLE_NEON -#endif // clang >= 3.5 -#endif // __clang__ - -// GCC >= 4.7.0 required for AVX2. -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) -#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) -#define GCC_HAS_AVX2 1 -#endif // GNUC >= 4.7 -#endif // __GNUC__ - -// clang >= 3.4.0 required for AVX2. -#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) -#define CLANG_HAS_AVX2 1 -#endif // clang >= 3.4 -#endif // __clang__ - -// clang >= 6.0.0 required for AVX512. -#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -// clang in xcode follows a different versioning scheme. -// TODO(fbarchard): fix xcode 9 ios b/789. -#if (__clang_major__ >= 7) && !defined(__APPLE__) -#define CLANG_HAS_AVX512 1 -#endif // clang >= 7 -#endif // __clang__ - -// Visual C 2012 required for AVX2. 
-#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ - _MSC_VER >= 1700 -#define VISUALC_HAS_AVX2 1 -#endif // VisualStudio >= 2012 - -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -// Conversions: -#define HAS_ABGRTOUVROW_SSSE3 -#define HAS_ABGRTOYROW_SSSE3 -#define HAS_ARGB1555TOARGBROW_SSE2 -#define HAS_ARGB4444TOARGBROW_SSE2 -#define HAS_ARGBEXTRACTALPHAROW_SSE2 -#define HAS_ARGBSETROW_X86 -#define HAS_ARGBSHUFFLEROW_SSSE3 -#define HAS_ARGBTOARGB1555ROW_SSE2 -#define HAS_ARGBTOARGB4444ROW_SSE2 -#define HAS_ARGBTORAWROW_SSSE3 -#define HAS_ARGBTORGB24ROW_SSSE3 -#define HAS_ARGBTORGB565DITHERROW_SSE2 -#define HAS_ARGBTORGB565ROW_SSE2 -#define HAS_ARGBTOUV444ROW_SSSE3 -#define HAS_ARGBTOUVJROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 -#define HAS_ARGBTOYJROW_SSSE3 -#define HAS_ARGBTOYROW_SSSE3 -#define HAS_BGRATOUVROW_SSSE3 -#define HAS_BGRATOYROW_SSSE3 -#define HAS_COPYROW_ERMS -#define HAS_COPYROW_SSE2 -#define HAS_H422TOARGBROW_SSSE3 -#define HAS_HALFFLOATROW_SSE2 -#define HAS_I422TOARGB1555ROW_SSSE3 -#define HAS_I422TOARGB4444ROW_SSSE3 -#define HAS_I422TOARGBROW_SSSE3 -#define HAS_I422TORGB24ROW_SSSE3 -#define HAS_I422TORGB565ROW_SSSE3 -#define HAS_I422TORGBAROW_SSSE3 -#define HAS_I422TOUYVYROW_SSE2 -#define HAS_I422TOYUY2ROW_SSE2 -#define HAS_I444TOARGBROW_SSSE3 -#define HAS_J400TOARGBROW_SSE2 -#define HAS_J422TOARGBROW_SSSE3 -#define HAS_MERGEUVROW_SSE2 -#define HAS_MIRRORROW_SSSE3 -#define HAS_MIRRORSPLITUVROW_SSSE3 -#define HAS_NV12TOARGBROW_SSSE3 -#define HAS_NV12TORGB24ROW_SSSE3 -#define HAS_NV12TORGB565ROW_SSSE3 -#define HAS_NV21TOARGBROW_SSSE3 -#define HAS_NV21TORGB24ROW_SSSE3 -#define HAS_RAWTOARGBROW_SSSE3 -#define HAS_RAWTORGB24ROW_SSSE3 -#define HAS_RAWTOYROW_SSSE3 -#define HAS_RGB24TOARGBROW_SSSE3 -#define HAS_RGB24TOYROW_SSSE3 -#define HAS_RGB24TOYJROW_SSSE3 -#define HAS_RAWTOYJROW_SSSE3 -#define HAS_RGB565TOARGBROW_SSE2 -#define HAS_RGBATOUVROW_SSSE3 -#define HAS_RGBATOYROW_SSSE3 -#define HAS_SETROW_ERMS -#define HAS_SETROW_X86 -#define HAS_SPLITUVROW_SSE2 -#define HAS_UYVYTOARGBROW_SSSE3 -#define HAS_UYVYTOUV422ROW_SSE2 -#define HAS_UYVYTOUVROW_SSE2 -#define HAS_UYVYTOYROW_SSE2 -#define HAS_YUY2TOARGBROW_SSSE3 -#define HAS_YUY2TOUV422ROW_SSE2 -#define HAS_YUY2TOUVROW_SSE2 -#define HAS_YUY2TOYROW_SSE2 - -// Effects: -#define HAS_ARGBADDROW_SSE2 -#define HAS_ARGBAFFINEROW_SSE2 -#define HAS_ARGBATTENUATEROW_SSSE3 -#define HAS_ARGBBLENDROW_SSSE3 -#define HAS_ARGBCOLORMATRIXROW_SSSE3 -#define HAS_ARGBCOLORTABLEROW_X86 -#define HAS_ARGBCOPYALPHAROW_SSE2 -#define HAS_ARGBCOPYYTOALPHAROW_SSE2 -#define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 -#define HAS_ARGBMIRRORROW_SSE2 -#define HAS_ARGBMULTIPLYROW_SSE2 -#define HAS_ARGBPOLYNOMIALROW_SSE2 -#define HAS_ARGBQUANTIZEROW_SSE2 -#define HAS_ARGBSEPIAROW_SSSE3 -#define HAS_ARGBSHADEROW_SSE2 -#define HAS_ARGBSUBTRACTROW_SSE2 -#define HAS_ARGBUNATTENUATEROW_SSE2 -#define HAS_BLENDPLANEROW_SSSE3 -#define HAS_COMPUTECUMULATIVESUMROW_SSE2 -#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -#define HAS_INTERPOLATEROW_SSSE3 -#define HAS_RGBCOLORTABLEROW_X86 -#define HAS_SOBELROW_SSE2 -#define HAS_SOBELTOPLANEROW_SSE2 -#define HAS_SOBELXROW_SSE2 -#define HAS_SOBELXYROW_SSE2 -#define HAS_SOBELYROW_SSE2 - -// The following functions fail on gcc/clang 32 bit with fpic and framepointer. -// caveat: clangcl uses row_win.cc which works. 
-#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ - defined(_MSC_VER) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_SSSE3 -#define HAS_I444ALPHATOARGBROW_SSSE3 -#endif -#endif - -// The following are available on all x86 platforms, but -// require VS2012, clang 3.4 or gcc 4.7. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ - defined(GCC_HAS_AVX2)) -#define HAS_ARGBCOPYALPHAROW_AVX2 -#define HAS_ARGBCOPYYTOALPHAROW_AVX2 -#define HAS_ARGBEXTRACTALPHAROW_AVX2 -#define HAS_ARGBMIRRORROW_AVX2 -#define HAS_ARGBPOLYNOMIALROW_AVX2 -#define HAS_ARGBSHUFFLEROW_AVX2 -#define HAS_ARGBTORGB565DITHERROW_AVX2 -#define HAS_ARGBTOUVJROW_AVX2 -#define HAS_ARGBTOUVROW_AVX2 -#define HAS_ARGBTOYJROW_AVX2 -#define HAS_ARGBTOYROW_AVX2 -#define HAS_RGB24TOYJROW_AVX2 -#define HAS_RAWTOYJROW_AVX2 -#define HAS_COPYROW_AVX -#define HAS_H422TOARGBROW_AVX2 -#define HAS_HALFFLOATROW_AVX2 -// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast -#define HAS_I422TOARGB1555ROW_AVX2 -#define HAS_I422TOARGB4444ROW_AVX2 -#define HAS_I422TOARGBROW_AVX2 -#define HAS_I422TORGB24ROW_AVX2 -#define HAS_I422TORGB565ROW_AVX2 -#define HAS_I422TORGBAROW_AVX2 -#define HAS_I444TOARGBROW_AVX2 -#define HAS_INTERPOLATEROW_AVX2 -#define HAS_J422TOARGBROW_AVX2 -#define HAS_MERGEUVROW_AVX2 -#define HAS_MIRRORROW_AVX2 -#define HAS_NV12TOARGBROW_AVX2 -#define HAS_NV12TORGB24ROW_AVX2 -#define HAS_NV12TORGB565ROW_AVX2 -#define HAS_NV21TOARGBROW_AVX2 -#define HAS_NV21TORGB24ROW_AVX2 -#define HAS_SPLITUVROW_AVX2 -#define HAS_UYVYTOARGBROW_AVX2 -#define HAS_UYVYTOUV422ROW_AVX2 -#define HAS_UYVYTOUVROW_AVX2 -#define HAS_UYVYTOYROW_AVX2 -#define HAS_YUY2TOARGBROW_AVX2 -#define HAS_YUY2TOUV422ROW_AVX2 -#define HAS_YUY2TOUVROW_AVX2 -#define HAS_YUY2TOYROW_AVX2 - -// Effects: -#define HAS_ARGBADDROW_AVX2 -#define HAS_ARGBATTENUATEROW_AVX2 -#define HAS_ARGBMULTIPLYROW_AVX2 -#define HAS_ARGBSUBTRACTROW_AVX2 -#define HAS_ARGBUNATTENUATEROW_AVX2 -#define HAS_BLENDPLANEROW_AVX2 - -#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ - defined(_MSC_VER) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_AVX2 -#define HAS_I444ALPHATOARGBROW_AVX2 -#endif -#endif - -// The following are available for AVX2 Visual C 32 bit: -// TODO(fbarchard): Port to gcc. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(VISUALC_HAS_AVX2) -#define HAS_ARGB1555TOARGBROW_AVX2 -#define HAS_ARGB4444TOARGBROW_AVX2 -#define HAS_ARGBTOARGB1555ROW_AVX2 -#define HAS_ARGBTOARGB4444ROW_AVX2 -#define HAS_ARGBTORGB565ROW_AVX2 -#define HAS_J400TOARGBROW_AVX2 -#define HAS_RGB565TOARGBROW_AVX2 -#endif - -// The following are also available on x64 Visual C. 
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \ - (!defined(__clang__) || defined(__SSSE3__)) -#define HAS_I444ALPHATOARGBROW_SSSE3 -#define HAS_I444TOARGBROW_SSSE3 -#define HAS_I422ALPHATOARGBROW_SSSE3 -#define HAS_I422TOARGBROW_SSSE3 -#endif - -// The following are available for gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) -#define HAS_ABGRTOAR30ROW_SSSE3 -#define HAS_ARGBTOAR30ROW_SSSE3 -#define HAS_ARGBTOAR64ROW_SSSE3 -#define HAS_ARGBTOAB64ROW_SSSE3 -#define HAS_AR64TOARGBROW_SSSE3 -#define HAS_AB64TOARGBROW_SSSE3 -#define HAS_CONVERT16TO8ROW_SSSE3 -#define HAS_CONVERT8TO16ROW_SSE2 -#define HAS_HALFMERGEUVROW_SSSE3 -#define HAS_I210TOAR30ROW_SSSE3 -#define HAS_I210TOARGBROW_SSSE3 -#define HAS_I212TOAR30ROW_SSSE3 -#define HAS_I212TOARGBROW_SSSE3 -#define HAS_I400TOARGBROW_SSE2 -#define HAS_I422TOAR30ROW_SSSE3 -#define HAS_I410TOAR30ROW_SSSE3 -#define HAS_I410TOARGBROW_SSSE3 -#define HAS_MERGEARGBROW_SSE2 -#define HAS_MERGEXRGBROW_SSE2 -#define HAS_MERGERGBROW_SSSE3 -#define HAS_MIRRORUVROW_SSSE3 -#define HAS_P210TOAR30ROW_SSSE3 -#define HAS_P210TOARGBROW_SSSE3 -#define HAS_P410TOAR30ROW_SSSE3 -#define HAS_P410TOARGBROW_SSSE3 -#define HAS_RAWTORGBAROW_SSSE3 -#define HAS_RGB24MIRRORROW_SSSE3 -#define HAS_RGBATOYJROW_SSSE3 -#define HAS_SPLITARGBROW_SSE2 -#define HAS_SPLITARGBROW_SSSE3 -#define HAS_SPLITXRGBROW_SSE2 -#define HAS_SPLITXRGBROW_SSSE3 -#define HAS_SPLITRGBROW_SSSE3 -#define HAS_SWAPUVROW_SSSE3 - -#if defined(__x86_64__) || !defined(__pic__) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I210ALPHATOARGBROW_SSSE3 -#define HAS_I410ALPHATOARGBROW_SSSE3 -#endif -#endif - -// The following are available for AVX2 gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) -#define HAS_ABGRTOAR30ROW_AVX2 -#define HAS_ABGRTOUVROW_AVX2 -#define HAS_ABGRTOYROW_AVX2 -#define HAS_ARGBTOAR30ROW_AVX2 -#define HAS_ARGBTORAWROW_AVX2 -#define HAS_ARGBTORGB24ROW_AVX2 -#define HAS_ARGBTOAR64ROW_AVX2 -#define HAS_ARGBTOAB64ROW_AVX2 -#define HAS_AR64TOARGBROW_AVX2 -#define HAS_AB64TOARGBROW_AVX2 -#define HAS_CONVERT16TO8ROW_AVX2 -#define HAS_CONVERT8TO16ROW_AVX2 -#define HAS_DIVIDEROW_16_AVX2 -#define HAS_HALFMERGEUVROW_AVX2 -#define HAS_MERGEAR64ROW_AVX2 -#define HAS_MERGEARGB16TO8ROW_AVX2 -#define HAS_MERGEARGBROW_AVX2 -#define HAS_MERGEXR30ROW_AVX2 -#define HAS_MERGEXR64ROW_AVX2 -#define HAS_MERGEXRGB16TO8ROW_AVX2 -#define HAS_MERGEXRGBROW_AVX2 -#define HAS_I210TOAR30ROW_AVX2 -#define HAS_I210TOARGBROW_AVX2 -#define HAS_I212TOAR30ROW_AVX2 -#define HAS_I212TOARGBROW_AVX2 -#define HAS_I400TOARGBROW_AVX2 -#define HAS_I410TOAR30ROW_AVX2 -#define HAS_I410TOARGBROW_AVX2 -#define HAS_P210TOAR30ROW_AVX2 -#define HAS_P210TOARGBROW_AVX2 -#define HAS_P410TOAR30ROW_AVX2 -#define HAS_P410TOARGBROW_AVX2 -#define HAS_I422TOAR30ROW_AVX2 -#define HAS_I422TOUYVYROW_AVX2 -#define HAS_I422TOYUY2ROW_AVX2 -#define HAS_MERGEUVROW_16_AVX2 -#define HAS_MIRRORUVROW_AVX2 -#define HAS_MULTIPLYROW_16_AVX2 -#define HAS_RGBATOYJROW_AVX2 -#define HAS_SPLITARGBROW_AVX2 -#define HAS_SPLITXRGBROW_AVX2 -#define HAS_SPLITUVROW_16_AVX2 -#define HAS_SWAPUVROW_AVX2 -// TODO(fbarchard): Fix AVX2 version of YUV24 -// #define HAS_NV21TOYUV24ROW_AVX2 - -#if defined(__x86_64__) || 
!defined(__pic__) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I210ALPHATOARGBROW_AVX2 -#define HAS_I410ALPHATOARGBROW_AVX2 -#endif -#endif - -// The following are available for AVX512 clang x86 platforms: -// TODO(fbarchard): Port to GCC and Visual C -// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789 -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX512)) -#define HAS_ARGBTORGB24ROW_AVX512VBMI -#endif - -// The following are available on Neon platforms: -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_ABGRTOUVROW_NEON -#define HAS_ABGRTOYROW_NEON -#define HAS_ARGB1555TOARGBROW_NEON -#define HAS_ARGB1555TOUVROW_NEON -#define HAS_ARGB1555TOYROW_NEON -#define HAS_ARGB4444TOARGBROW_NEON -#define HAS_ARGB4444TOUVROW_NEON -#define HAS_ARGB4444TOYROW_NEON -#define HAS_ARGBEXTRACTALPHAROW_NEON -#define HAS_ARGBSETROW_NEON -#define HAS_ARGBTOARGB1555ROW_NEON -#define HAS_ARGBTOARGB4444ROW_NEON -#define HAS_ARGBTORAWROW_NEON -#define HAS_ARGBTORGB24ROW_NEON -#define HAS_ARGBTORGB565DITHERROW_NEON -#define HAS_ARGBTORGB565ROW_NEON -#define HAS_ARGBTOAR64ROW_NEON -#define HAS_ARGBTOAB64ROW_NEON -#define HAS_AR64TOARGBROW_NEON -#define HAS_AB64TOARGBROW_NEON -#define HAS_ARGBTOUV444ROW_NEON -#define HAS_ARGBTOUVJROW_NEON -#define HAS_ARGBTOUVROW_NEON -#define HAS_ARGBTOYJROW_NEON -#define HAS_ARGBTOYROW_NEON -#define HAS_AYUVTOUVROW_NEON -#define HAS_AYUVTOVUROW_NEON -#define HAS_AYUVTOYROW_NEON -#define HAS_BGRATOUVROW_NEON -#define HAS_BGRATOYROW_NEON -#define HAS_BYTETOFLOATROW_NEON -#define HAS_COPYROW_NEON -#define HAS_DIVIDEROW_16_NEON -#define HAS_HALFFLOATROW_NEON -#define HAS_HALFMERGEUVROW_NEON -#define HAS_I400TOARGBROW_NEON -#define HAS_I444ALPHATOARGBROW_NEON -#define HAS_I422ALPHATOARGBROW_NEON -#define HAS_I422TOARGB1555ROW_NEON -#define HAS_I422TOARGB4444ROW_NEON -#define HAS_I422TOARGBROW_NEON -#define HAS_I422TORGB24ROW_NEON -#define HAS_I422TORGB565ROW_NEON -#define HAS_I422TORGBAROW_NEON -#define HAS_I422TOUYVYROW_NEON -#define HAS_I422TOYUY2ROW_NEON -#define HAS_I444TOARGBROW_NEON -#define HAS_J400TOARGBROW_NEON -#define HAS_MERGEAR64ROW_NEON -#define HAS_MERGEARGB16TO8ROW_NEON -#define HAS_MERGEARGBROW_NEON -#define HAS_MERGEXR30ROW_NEON -#define HAS_MERGEXR64ROW_NEON -#define HAS_MERGEXRGB16TO8ROW_NEON -#define HAS_MERGEXRGBROW_NEON -#define HAS_MERGEUVROW_NEON -#define HAS_MERGEUVROW_16_NEON -#define HAS_MIRRORROW_NEON -#define HAS_MIRRORUVROW_NEON -#define HAS_MIRRORSPLITUVROW_NEON -#define HAS_MULTIPLYROW_16_NEON -#define HAS_NV12TOARGBROW_NEON -#define HAS_NV12TORGB24ROW_NEON -#define HAS_NV12TORGB565ROW_NEON -#define HAS_NV21TOARGBROW_NEON -#define HAS_NV21TORGB24ROW_NEON -#define HAS_NV21TOYUV24ROW_NEON -#define HAS_RAWTOARGBROW_NEON -#define HAS_RAWTORGB24ROW_NEON -#define HAS_RAWTORGBAROW_NEON -#define HAS_RAWTOUVROW_NEON -#define HAS_RAWTOYJROW_NEON -#define HAS_RAWTOYROW_NEON -#define HAS_RGB24TOARGBROW_NEON -#define HAS_RGB24TOUVROW_NEON -#define HAS_RGB24TOYJROW_NEON -#define HAS_RGB24TOYROW_NEON -#define HAS_RGB565TOARGBROW_NEON -#define HAS_RGB565TOUVROW_NEON -#define HAS_RGB565TOYROW_NEON -#define HAS_RGBATOUVROW_NEON -#define HAS_RGBATOYJROW_NEON -#define HAS_RGBATOYROW_NEON -#define HAS_SETROW_NEON -#define HAS_SPLITARGBROW_NEON -#define HAS_SPLITXRGBROW_NEON -#define HAS_SPLITRGBROW_NEON -#define 
HAS_SPLITUVROW_NEON -#define HAS_SPLITUVROW_16_NEON -#define HAS_SWAPUVROW_NEON -#define HAS_UYVYTOARGBROW_NEON -#define HAS_UYVYTOUV422ROW_NEON -#define HAS_UYVYTOUVROW_NEON -#define HAS_UYVYTOYROW_NEON -#define HAS_YUY2TOARGBROW_NEON -#define HAS_YUY2TOUV422ROW_NEON -#define HAS_YUY2TOUVROW_NEON -#define HAS_YUY2TOYROW_NEON - -// Effects: -#define HAS_ARGBADDROW_NEON -#define HAS_ARGBATTENUATEROW_NEON -#define HAS_ARGBBLENDROW_NEON -#define HAS_ARGBCOLORMATRIXROW_NEON -#define HAS_ARGBGRAYROW_NEON -#define HAS_ARGBMIRRORROW_NEON -#define HAS_RGB24MIRRORROW_NEON -#define HAS_ARGBMULTIPLYROW_NEON -#define HAS_ARGBQUANTIZEROW_NEON -#define HAS_ARGBSEPIAROW_NEON -#define HAS_ARGBSHADEROW_NEON -#define HAS_ARGBSHUFFLEROW_NEON -#define HAS_ARGBSUBTRACTROW_NEON -#define HAS_INTERPOLATEROW_NEON -#define HAS_SOBELROW_NEON -#define HAS_SOBELTOPLANEROW_NEON -#define HAS_SOBELXROW_NEON -#define HAS_SOBELXYROW_NEON -#define HAS_SOBELYROW_NEON -#endif - -// The following are available on AArch64 platforms: -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -#define HAS_SCALESUMSAMPLES_NEON -#define HAS_GAUSSROW_F32_NEON -#define HAS_GAUSSCOL_F32_NEON - -#endif -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#define HAS_ABGRTOUVROW_MSA -#define HAS_ABGRTOYROW_MSA -#define HAS_ARGB1555TOARGBROW_MSA -#define HAS_ARGB1555TOUVROW_MSA -#define HAS_ARGB1555TOYROW_MSA -#define HAS_ARGB4444TOARGBROW_MSA -#define HAS_ARGBADDROW_MSA -#define HAS_ARGBATTENUATEROW_MSA -#define HAS_ARGBBLENDROW_MSA -#define HAS_ARGBCOLORMATRIXROW_MSA -#define HAS_ARGBEXTRACTALPHAROW_MSA -#define HAS_ARGBGRAYROW_MSA -#define HAS_ARGBMIRRORROW_MSA -#define HAS_ARGBMULTIPLYROW_MSA -#define HAS_ARGBQUANTIZEROW_MSA -#define HAS_ARGBSEPIAROW_MSA -#define HAS_ARGBSETROW_MSA -#define HAS_ARGBSHADEROW_MSA -#define HAS_ARGBSHUFFLEROW_MSA -#define HAS_ARGBSUBTRACTROW_MSA -#define HAS_ARGBTOARGB1555ROW_MSA -#define HAS_ARGBTOARGB4444ROW_MSA -#define HAS_ARGBTORAWROW_MSA -#define HAS_ARGBTORGB24ROW_MSA -#define HAS_ARGBTORGB565DITHERROW_MSA -#define HAS_ARGBTORGB565ROW_MSA -#define HAS_ARGBTOUV444ROW_MSA -#define HAS_ARGBTOUVJROW_MSA -#define HAS_ARGBTOUVROW_MSA -#define HAS_ARGBTOYJROW_MSA -#define HAS_ARGBTOYROW_MSA -#define HAS_BGRATOUVROW_MSA -#define HAS_BGRATOYROW_MSA -#define HAS_HALFFLOATROW_MSA -#define HAS_I400TOARGBROW_MSA -#define HAS_I422TOUYVYROW_MSA -#define HAS_I422TOYUY2ROW_MSA -#define HAS_INTERPOLATEROW_MSA -#define HAS_J400TOARGBROW_MSA -#define HAS_MERGEUVROW_MSA -#define HAS_MIRRORROW_MSA -#define HAS_MIRRORUVROW_MSA -#define HAS_MIRRORSPLITUVROW_MSA -#define HAS_RAWTOARGBROW_MSA -#define HAS_RAWTORGB24ROW_MSA -#define HAS_RAWTOUVROW_MSA -#define HAS_RAWTOYROW_MSA -#define HAS_RGB24TOARGBROW_MSA -#define HAS_RGB24TOUVROW_MSA -#define HAS_RGB24TOYROW_MSA -#define HAS_RGB565TOARGBROW_MSA -#define HAS_RGB565TOUVROW_MSA -#define HAS_RGB565TOYROW_MSA -#define HAS_RGBATOUVROW_MSA -#define HAS_RGBATOYROW_MSA -#define HAS_SETROW_MSA -#define HAS_SOBELROW_MSA -#define HAS_SOBELTOPLANEROW_MSA -#define HAS_SOBELXROW_MSA -#define HAS_SOBELXYROW_MSA -#define HAS_SOBELYROW_MSA -#define HAS_SPLITUVROW_MSA -#define HAS_UYVYTOUVROW_MSA -#define HAS_UYVYTOYROW_MSA -#define HAS_YUY2TOUV422ROW_MSA -#define HAS_YUY2TOUVROW_MSA -#define HAS_YUY2TOYROW_MSA -#endif - -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) -#define HAS_ABGRTOUVROW_MMI -#define HAS_ABGRTOYROW_MMI -#define HAS_ARGB1555TOARGBROW_MMI -#define HAS_ARGB1555TOUVROW_MMI -#define HAS_ARGB1555TOYROW_MMI -#define HAS_ARGB4444TOARGBROW_MMI 
-#define HAS_ARGB4444TOUVROW_MMI -#define HAS_ARGB4444TOYROW_MMI -#define HAS_ARGBADDROW_MMI -#define HAS_ARGBATTENUATEROW_MMI -#define HAS_ARGBBLENDROW_MMI -#define HAS_ARGBCOLORMATRIXROW_MMI -#define HAS_ARGBCOPYALPHAROW_MMI -#define HAS_ARGBCOPYYTOALPHAROW_MMI -#define HAS_ARGBEXTRACTALPHAROW_MMI -#define HAS_ARGBGRAYROW_MMI -#define HAS_ARGBMIRRORROW_MMI -#define HAS_ARGBMULTIPLYROW_MMI -#define HAS_ARGBSEPIAROW_MMI -#define HAS_ARGBSETROW_MMI -#define HAS_ARGBSHADEROW_MMI -#define HAS_ARGBSHUFFLEROW_MMI -#define HAS_ARGBSUBTRACTROW_MMI -#define HAS_ARGBTOARGB1555ROW_MMI -#define HAS_ARGBTOARGB4444ROW_MMI -#define HAS_ARGBTORAWROW_MMI -#define HAS_ARGBTORGB24ROW_MMI -#define HAS_ARGBTORGB565DITHERROW_MMI -#define HAS_ARGBTORGB565ROW_MMI -#define HAS_ARGBTOUV444ROW_MMI -#define HAS_ARGBTOUVJROW_MMI -#define HAS_ARGBTOUVROW_MMI -#define HAS_ARGBTOYJROW_MMI -#define HAS_ARGBTOYROW_MMI -#define HAS_BGRATOUVROW_MMI -#define HAS_BGRATOYROW_MMI -#define HAS_BLENDPLANEROW_MMI -#define HAS_COMPUTECUMULATIVESUMROW_MMI -#define HAS_CUMULATIVESUMTOAVERAGEROW_MMI -#define HAS_HALFFLOATROW_MMI -#define HAS_I400TOARGBROW_MMI -#define HAS_I422TOUYVYROW_MMI -#define HAS_I422TOYUY2ROW_MMI -#define HAS_INTERPOLATEROW_MMI -#define HAS_J400TOARGBROW_MMI -#define HAS_MERGERGBROW_MMI -#define HAS_MERGEUVROW_MMI -#define HAS_MIRRORROW_MMI -#define HAS_MIRRORSPLITUVROW_MMI -#define HAS_RAWTOARGBROW_MMI -#define HAS_RAWTORGB24ROW_MMI -#define HAS_RAWTOUVROW_MMI -#define HAS_RAWTOYROW_MMI -#define HAS_RGB24TOARGBROW_MMI -#define HAS_RGB24TOUVROW_MMI -#define HAS_RGB24TOYROW_MMI -#define HAS_RGB565TOARGBROW_MMI -#define HAS_RGB565TOUVROW_MMI -#define HAS_RGB565TOYROW_MMI -#define HAS_RGBATOUVROW_MMI -#define HAS_RGBATOYROW_MMI -#define HAS_SOBELROW_MMI -#define HAS_SOBELTOPLANEROW_MMI -#define HAS_SOBELXROW_MMI -#define HAS_SOBELXYROW_MMI -#define HAS_SOBELYROW_MMI -#define HAS_SPLITRGBROW_MMI -#define HAS_SPLITUVROW_MMI -#define HAS_UYVYTOUVROW_MMI -#define HAS_UYVYTOYROW_MMI -#define HAS_YUY2TOUV422ROW_MMI -#define HAS_YUY2TOUVROW_MMI -#define HAS_YUY2TOYROW_MMI -#endif - -#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) -#if defined(VISUALC_HAS_AVX2) -#define SIMD_ALIGNED(var) __declspec(align(32)) var -#else -#define SIMD_ALIGNED(var) __declspec(align(16)) var -#endif -#define LIBYUV_NOINLINE __declspec(noinline) -typedef __declspec(align(16)) int16_t vec16[8]; -typedef __declspec(align(16)) int32_t vec32[4]; -typedef __declspec(align(16)) float vecf32[4]; -typedef __declspec(align(16)) int8_t vec8[16]; -typedef __declspec(align(16)) uint16_t uvec16[8]; -typedef __declspec(align(16)) uint32_t uvec32[4]; -typedef __declspec(align(16)) uint8_t uvec8[16]; -typedef __declspec(align(32)) int16_t lvec16[16]; -typedef __declspec(align(32)) int32_t lvec32[8]; -typedef __declspec(align(32)) int8_t lvec8[32]; -typedef __declspec(align(32)) uint16_t ulvec16[16]; -typedef __declspec(align(32)) uint32_t ulvec32[8]; -typedef __declspec(align(32)) uint8_t ulvec8[32]; -#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) -// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. 
-#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) -#define SIMD_ALIGNED(var) var __attribute__((aligned(32))) -#else -#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) -#endif -#define LIBYUV_NOINLINE __attribute__((noinline)) -typedef int16_t __attribute__((vector_size(16))) vec16; -typedef int32_t __attribute__((vector_size(16))) vec32; -typedef float __attribute__((vector_size(16))) vecf32; -typedef int8_t __attribute__((vector_size(16))) vec8; -typedef uint16_t __attribute__((vector_size(16))) uvec16; -typedef uint32_t __attribute__((vector_size(16))) uvec32; -typedef uint8_t __attribute__((vector_size(16))) uvec8; -typedef int16_t __attribute__((vector_size(32))) lvec16; -typedef int32_t __attribute__((vector_size(32))) lvec32; -typedef int8_t __attribute__((vector_size(32))) lvec8; -typedef uint16_t __attribute__((vector_size(32))) ulvec16; -typedef uint32_t __attribute__((vector_size(32))) ulvec32; -typedef uint8_t __attribute__((vector_size(32))) ulvec8; -#else -#define SIMD_ALIGNED(var) var -#define LIBYUV_NOINLINE -typedef int16_t vec16[8]; -typedef int32_t vec32[4]; -typedef float vecf32[4]; -typedef int8_t vec8[16]; -typedef uint16_t uvec16[8]; -typedef uint32_t uvec32[4]; -typedef uint8_t uvec8[16]; -typedef int16_t lvec16[16]; -typedef int32_t lvec32[8]; -typedef int8_t lvec8[32]; -typedef uint16_t ulvec16[16]; -typedef uint32_t ulvec32[8]; -typedef uint8_t ulvec8[32]; -#endif - -#if defined(__aarch64__) || defined(__arm__) -// This struct is for ARM color conversion. -struct YuvConstants { - uvec8 kUVCoeff; - vec16 kRGBCoeffBias; -}; -#else -// This struct is for Intel color conversion. -struct YuvConstants { - uint8_t kUVToB[32]; - uint8_t kUVToG[32]; - uint8_t kUVToR[32]; - int16_t kYToRgb[16]; - int16_t kYBiasToRgb[16]; -}; - -// Offsets into YuvConstants structure -#define KUVTOB 0 -#define KUVTOG 32 -#define KUVTOR 64 -#define KYTORGB 96 -#define KYBIASTORGB 128 - -#endif - -#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) - -#define align_buffer_64(var, size) \ - uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \ - uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ - -#define free_aligned_buffer_64(var) \ - free(var##_mem); \ - var = 0 - -#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) -#define OMITFP -#else -#define OMITFP __attribute__((optimize("omit-frame-pointer"))) -#endif - -// NaCL macros for GCC x86 and x64. -#if defined(__native_client__) -#define LABELALIGN ".p2align 5\n" -#else -#define LABELALIGN -#endif - -// Intel Code Analizer markers. Insert IACA_START IACA_END around code to be -// measured and then run with iaca -64 libyuv_unittest. -// IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within -// inline assembly blocks. 
-// example of iaca: -// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest - -#if defined(__x86_64__) || defined(__i386__) - -#define IACA_ASM_START \ - ".byte 0x0F, 0x0B\n" \ - " movl $111, %%ebx\n" \ - ".byte 0x64, 0x67, 0x90\n" - -#define IACA_ASM_END \ - " movl $222, %%ebx\n" \ - ".byte 0x64, 0x67, 0x90\n" \ - ".byte 0x0F, 0x0B\n" - -#define IACA_SSC_MARK(MARK_ID) \ - __asm__ __volatile__("\n\t movl $" #MARK_ID \ - ", %%ebx" \ - "\n\t .byte 0x64, 0x67, 0x90" \ - : \ - : \ - : "memory"); - -#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B"); - -#else /* Visual C */ -#define IACA_UD_BYTES \ - { __asm _emit 0x0F __asm _emit 0x0B } - -#define IACA_SSC_MARK(x) \ - { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 } - -#define IACA_VC64_START __writegsbyte(111, 111); -#define IACA_VC64_END __writegsbyte(222, 222); -#endif - -#define IACA_START \ - { \ - IACA_UD_BYTES \ - IACA_SSC_MARK(111) \ - } -#define IACA_END \ - { \ - IACA_SSC_MARK(222) \ - IACA_UD_BYTES \ - } - -void I444ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToYUV24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width); -void 
YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); - -void I422ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_MSA(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); - -void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); -void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); -void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, 
uint8_t* dst_y, int width); -void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); -void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); -void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); -void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); -void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); -void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); -void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); -void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); -void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); -void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_MSA(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_MSA(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_MMI(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_NEON(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_NEON(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_NEON(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_NEON(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_MSA(const uint8_t* src_rgb, - int 
src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_MMI(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); -void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); -void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); -void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); -void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); -void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); -void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width); -void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); -void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width); -void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width); -void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); -void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width); -void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width); -void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width); -void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width); - -void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void 
ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); -void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); -void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); -void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); 
-void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void BGRAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB565ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGB1555ToYRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGBToUVRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_AVX2(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_NEON(const uint8_t* 
src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - 
uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_C(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void ARGBToUV444Row_C(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); - -void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorSplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorSplitUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorSplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorSplitUVRow_C(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void 
ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); - -void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width); -void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width); -void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); -void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void SplitUVRow_C(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_AVX2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void MergeUVRow_C(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_AVX2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_NEON(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_MSA(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_MMI(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void MergeUVRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void MergeUVRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void MergeUVRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void MergeUVRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); - -void HalfMergeUVRow_C(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width); - -void HalfMergeUVRow_NEON(const uint8_t* src_u, - int 
src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width); - -void HalfMergeUVRow_SSSE3(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width); - -void HalfMergeUVRow_AVX2(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width); - -void SplitRGBRow_C(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_SSSE3(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_MMI(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); - -void MergeRGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeRGBRow_SSSE3(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeRGBRow_MMI(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void MergeRGBRow_Any_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeRGBRow_Any_MMI(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeARGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width); -void MergeARGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width); -void MergeARGBRow_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width); -void MergeARGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width); -void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - int width); -void MergeARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - int width); -void MergeARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - int width); -void SplitARGBRow_C(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - 
uint8_t* dst_a, - int width); -void SplitARGBRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void MergeXRGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width); -void MergeXRGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width); -void MergeXRGBRow_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width); -void MergeXRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width); -void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void MergeXRGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void MergeXRGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void SplitXRGBRow_C(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); - -void MergeXR30Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width); -void MergeAR64Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width); -void MergeARGB16To8Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width); -void MergeXR64Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width); -void MergeXRGB16To8Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - 
uint8_t* dst_argb, - int depth, - int width); -void MergeXR30Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width); -void MergeAR64Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width); -void MergeARGB16To8Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width); -void MergeXR64Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width); -void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width); -void MergeXR30Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width); -void MergeXR30Row_10_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int /* depth */, - int width); -void MergeAR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width); -void MergeARGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width); -void MergeXR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width); -void MergeXRGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width); -void MergeXR30Row_Any_AVX2(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeAR64Row_Any_AVX2(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - const uint16_t* a_buf, - uint16_t* dst_ptr, - int depth, - int width); -void MergeXR64Row_Any_AVX2(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint16_t* dst_ptr, - int depth, - int width); -void MergeARGB16To8Row_Any_AVX2(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeXRGB16To8Row_Any_AVX2(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeXR30Row_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeXR30Row_10_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeAR64Row_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - const uint16_t* a_buf, - uint16_t* dst_ptr, - int depth, - int width); -void MergeARGB16To8Row_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeXR64Row_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint16_t* dst_ptr, - int depth, - int width); -void MergeXRGB16To8Row_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint8_t* dst_ptr, - int depth, - int width); - -void 
MergeUVRow_16_C(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); -void MergeUVRow_16_AVX2(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); -void MergeUVRow_16_Any_AVX2(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); -void MergeUVRow_16_NEON(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); -void MergeUVRow_16_Any_NEON(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); - -void SplitUVRow_16_C(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width); -void SplitUVRow_16_AVX2(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width); -void SplitUVRow_16_Any_AVX2(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width); -void SplitUVRow_16_NEON(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width); -void SplitUVRow_16_Any_NEON(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width); - -void MultiplyRow_16_C(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void MultiplyRow_16_AVX2(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void MultiplyRow_16_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); -void MultiplyRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); - -void DivideRow_16_C(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void DivideRow_16_AVX2(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void DivideRow_16_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); -void DivideRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void DivideRow_16_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); - -void Convert8To16Row_C(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void Convert8To16Row_SSE2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void Convert8To16Row_AVX2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); -void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); - -void Convert16To8Row_C(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width); -void Convert16To8Row_SSSE3(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width); -void Convert16To8Row_AVX2(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width); -void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int scale, - int width); -void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int scale, - int width); - -void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); -void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); -void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); -void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); -void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); -void CopyRow_Any_SSE2(const uint8_t* 
src_ptr, uint8_t* dst_ptr, int width); -void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); - -void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); - -void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBCopyAlphaRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width); -void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width); -void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_a, - int width); -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, - uint8_t* dst_a, - int width); -void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, - uint8_t* dst_a, - int width); -void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, - uint8_t* dst_a, - int width); -void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBExtractAlphaRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBCopyYToAlphaRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void SetRow_C(uint8_t* dst, uint8_t v8, int width); -void SetRow_MSA(uint8_t* dst, uint8_t v8, int width); -void SetRow_X86(uint8_t* dst, uint8_t v8, int width); -void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width); -void SetRow_NEON(uint8_t* dst, uint8_t v8, int width); -void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width); -void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width); - -void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width); -void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width); -void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width); -void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width); -void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width); -void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width); -void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width); -void ARGBSetRow_Any_MMI(uint8_t* dst_ptr, uint32_t v32, int width); - -// ARGBShufflers for BGRAToARGB etc. 
-void ARGBShuffleRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); -void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); -void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); -void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); -void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); - -void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width); -void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width); -void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width); -void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width); -void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width); - -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width); -void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width); -void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width); -void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width); -void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width); -void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width); -void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width); -void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width); -void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width); -void ARGB4444ToARGBRow_MSA(const uint8_t* 
src_argb4444, - uint8_t* dst_argb, - int width); -void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width); -void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width); -void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); -void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width); -void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width); -void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width); -void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width); -void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); -void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width); - -void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB565ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* 
dst_ptr, - int width); - -void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); - -void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width); - -void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width); - -void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width); -void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, - uint8_t* dst, - const uint32_t dither4, - int width); -void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, - uint8_t* dst, - const uint32_t dither4, - int width); - -void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); - -void ARGBToRGB24Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb24, - int width); -void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width); -void ARGBToRGB565Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb565, - int width); -void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb1555, - int width); -void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb4444, - int width); -void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width); -void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width); - -void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width); - -void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB4444Row_C(const uint8_t* 
src_argb, uint8_t* dst_rgb, int width); -void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); -void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); - -void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width); -void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width); -void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width); -void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width); -void AR64ShuffleRow_C(const uint8_t* src_ar64, - uint8_t* dst_ar64, - const uint8_t* shuffler, - int width); -void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width); -void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width); -void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width); -void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width); -void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width); -void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width); -void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width); -void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width); -void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width); -void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width); -void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width); -void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width); -void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void ARGBToAB64Row_Any_SSSE3(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void AR64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void AB64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToAR64Row_Any_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void ARGBToAB64Row_Any_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void AR64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void AB64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToAR64Row_Any_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void ARGBToAB64Row_Any_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void AR64ToARGBRow_Any_NEON(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void J400ToARGBRow_Any_MMI(const uint8_t* src_ptr, 
uint8_t* dst_ptr, int width); - -void I444ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - const uint16_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - const uint16_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToYUV24Row_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width); -void YUY2ToARGBRow_C(const uint8_t* src_yuy2, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_C(const uint8_t* src_uyvy, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_argb, 
- const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); - -void I422ToRGBARow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); - -void I422ToAR30Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* 
u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); 
-void NV12ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToYUV24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width); -void NV12ToRGB565Row_AVX2(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); - -void P210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); - -void I422ToRGBARow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_SSSE3(const uint8_t* 
src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* 
yuvconstants, - int width); -void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width); -void NV12ToRGB565Row_Any_SSSE3(const uint8_t* 
y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_Any_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_Any_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const 
uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); - -void I400ToARGBRow_C(const uint8_t* src_y, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_AVX2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_NEON(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_MSA(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_MMI(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* param, - int width); -void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* param, - int width); -void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* param, - int width); -void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); - -// ARGB preattenuated alpha blend. -void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBBlendRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBBlendRow_MSA(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBBlendRow_MMI(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBBlendRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); - -// Unattenuated planar alpha blend. -void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width); -void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void BlendPlaneRow_AVX2(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width); -void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void BlendPlaneRow_MMI(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width); -void BlendPlaneRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void BlendPlaneRow_C(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width); - -// ARGB multiply images. Same API as Blend, but these require -// pointer and width alignment for SSE2. 
-void ARGBMultiplyRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBMultiplyRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); - -// ARGB add images. -void ARGBAddRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBAddRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBAddRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBAddRow_MSA(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBAddRow_MMI(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); - -// ARGB subtract images. Same API as Blend, but these require -// pointer and width alignment for SSE2. 
-void ARGBSubtractRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBSubtractRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBSubtractRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBSubtractRow_MSA(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBSubtractRow_MMI(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); - -void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); -void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); - -void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); -void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void 
ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); - -void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB565Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565DitherRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); - -void I444ToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToYUV24Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const 
struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_Any_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_Any_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_Any_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_Any_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_Any_MSA(const uint8_t* 
y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); - -void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_C(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_C(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void 
YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_NEON(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_MSA(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUVRow_MMI(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_C(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_C(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int 
width); -void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width); -void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); -void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width); -void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width); -void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToUVRow_C(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width); -void AYUVToVURow_C(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width); -void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToUVRow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width); -void AYUVToVURow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width); -void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_vu, - int width); -void AYUVToVURow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_vu, - int width); - -void I422ToYUY2Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width); -void I422ToUYVYRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width); -void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width); -void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToYUY2Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width); -void I422ToUYVYRow_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToYUY2Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width); -void I422ToUYVYRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToYUY2Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width); -void I422ToYUY2Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* 
dst_yuy2, - int width); -void I422ToUYVYRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToUYVYRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); - -// Effects related row functions. -void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBAttenuateRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBAttenuateRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBAttenuateRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -// Inverse table for unattenuate, shared by C and SSE2. 
-extern const uint32_t fixed_invtbl8[256]; -void ARGBUnattenuateRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width); - -void ARGBSepiaRow_C(uint8_t* dst_argb, int width); -void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); -void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); -void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width); -void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width); - -void ARGBColorMatrixRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); -void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); -void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); -void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); -void ARGBColorMatrixRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); - -void ARGBColorTableRow_C(uint8_t* dst_argb, - const uint8_t* table_argb, - int width); -void ARGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width); - -void RGBColorTableRow_C(uint8_t* dst_argb, - const uint8_t* table_argb, - int width); -void RGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width); - -void ARGBQuantizeRow_C(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width); -void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width); -void ARGBQuantizeRow_NEON(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width); -void ARGBQuantizeRow_MSA(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width); - -void ARGBShadeRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); -void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); -void ARGBShadeRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); -void ARGBShadeRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); -void ARGBShadeRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); - -// Used for blur. 
-void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count); -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width); - -void ComputeCumulativeSumRow_MMI(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width); - -void CumulativeSumToAverageRow_C(const int32_t* tl, - const int32_t* bl, - int w, - int area, - uint8_t* dst, - int count); -void ComputeCumulativeSumRow_C(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width); - -LIBYUV_API -void ARGBAffineRow_C(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width); -LIBYUV_API -void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* src_dudv, - int width); - -// Used for I420Scale, ARGBScale, and ARGBInterpolate. -void InterpolateRow_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction); -void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); -void InterpolateRow_AVX2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); -void InterpolateRow_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); -void InterpolateRow_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction); -void InterpolateRow_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction); -void InterpolateRow_Any_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); -void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); -void InterpolateRow_Any_AVX2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); -void InterpolateRow_Any_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); -void InterpolateRow_Any_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); - -void InterpolateRow_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction); - -// Sobel images. 
-void SobelXRow_C(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); -void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); -void SobelXRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); -void SobelXRow_MSA(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); -void SobelXRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); -void SobelYRow_C(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); -void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); -void SobelYRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); -void SobelYRow_MSA(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); -void SobelYRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); -void SobelRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelToPlaneRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); -void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); -void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); -void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); -void SobelToPlaneRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); -void SobelXYRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelXYRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelXYRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelXYRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf, - 
const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelToPlaneRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelXYRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelXYRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelXYRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelXYRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); - -void ARGBPolynomialRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width); -void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width); -void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width); - -// Scale and convert to half float. -void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); -void HalfFloatRow_AVX2(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); -void HalfFloatRow_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloat1Row_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloat1Row_Any_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_NEON(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); -void HalfFloat1Row_NEON(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); -void HalfFloatRow_MSA(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_MSA(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); -void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width); -void ByteToFloatRow_NEON(const uint8_t* src, - float* dst, - float scale, - int width); -void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr, - float* dst_ptr, - float param, - int width); - -void ARGBLumaColorTableRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff); -void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff); - -float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); -float ScaleMaxSamples_NEON(const float* src, - float* dst, - float scale, - int width); -float ScaleSumSamples_C(const float* src, float* dst, float scale, int width); -float ScaleSumSamples_NEON(const float* src, - float* dst, - float scale, - int width); -void ScaleSamples_C(const float* src, float* dst, float scale, int width); -void ScaleSamples_NEON(const float* src, float* dst, float scale, int width); - -void I210ToARGBRow_MMI(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - 
const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_Any_MMI(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - 
int width); -void NV12ToRGB565Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); - -void GaussRow_F32_NEON(const float* src, float* dst, int width); -void GaussRow_F32_C(const float* src, float* dst, int width); - -void GaussCol_F32_NEON(const float* src0, - const float* src1, - const float* src2, - const float* src3, - const float* src4, - float* dst, - int width); - -void GaussCol_F32_C(const float* src0, - const float* src1, - const float* src2, - const float* src3, - const float* src4, - float* dst, - int width); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_ROW_H_ diff --git a/thirdparty/libyuv/include/libyuv/scale.h b/thirdparty/libyuv/include/libyuv/scale.h deleted file mode 100644 index 3d4b600..0000000 --- a/thirdparty/libyuv/include/libyuv/scale.h +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_SCALE_H_ -#define INCLUDE_LIBYUV_SCALE_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Supported filtering. -typedef enum FilterMode { - kFilterNone = 0, // Point sample; Fastest. - kFilterLinear = 1, // Filter horizontally only. - kFilterBilinear = 2, // Faster than box, but lower quality scaling down. - kFilterBox = 3 // Highest quality. -} FilterModeEnum; - -// Scale a YUV plane. -LIBYUV_API -void ScalePlane(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering); - -LIBYUV_API -void ScalePlane_16(const uint16_t* src, - int src_stride, - int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Sample is expected to be in the low 12 bits. -LIBYUV_API -void ScalePlane_12(const uint16_t* src, - int src_stride, - int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Scales a YUV 4:2:0 image from the src width and height to the -// dst width and height. -// If filtering is kFilterNone, a simple nearest-neighbor algorithm is -// used. This produces basic (blocky) quality at the fastest speed. 
-// If filtering is kFilterBilinear, interpolation is used to produce a better -// quality image, at the expense of speed. -// If filtering is kFilterBox, averaging is used to produce ever better -// quality image, at further expense of speed. -// Returns 0 if successful. - -LIBYUV_API -int I420Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -LIBYUV_API -int I420Scale_16(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -LIBYUV_API -int I420Scale_12(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Scales a YUV 4:4:4 image from the src width and height to the -// dst width and height. -// If filtering is kFilterNone, a simple nearest-neighbor algorithm is -// used. This produces basic (blocky) quality at the fastest speed. -// If filtering is kFilterBilinear, interpolation is used to produce a better -// quality image, at the expense of speed. -// If filtering is kFilterBox, averaging is used to produce ever better -// quality image, at further expense of speed. -// Returns 0 if successful. - -LIBYUV_API -int I444Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -LIBYUV_API -int I444Scale_16(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -LIBYUV_API -int I444Scale_12(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Scales an NV12 image from the src width and height to the -// dst width and height. -// If filtering is kFilterNone, a simple nearest-neighbor algorithm is -// used. This produces basic (blocky) quality at the fastest speed. -// If filtering is kFilterBilinear, interpolation is used to produce a better -// quality image, at the expense of speed. -// kFilterBox is not supported for the UV channel and will be treated as -// bilinear. -// Returns 0 if successful. 
- -LIBYUV_API -int NV12Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering); - -#ifdef __cplusplus -// Legacy API. Deprecated. -LIBYUV_API -int Scale(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - int src_stride_y, - int src_stride_u, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - uint8_t* dst_u, - uint8_t* dst_v, - int dst_stride_y, - int dst_stride_u, - int dst_stride_v, - int dst_width, - int dst_height, - LIBYUV_BOOL interpolate); - -// For testing, allow disabling of specialized scalers. -LIBYUV_API -void SetUseReferenceImpl(LIBYUV_BOOL use); -#endif // __cplusplus - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/thirdparty/libyuv/include/libyuv/scale_argb.h b/thirdparty/libyuv/include/libyuv/scale_argb.h deleted file mode 100644 index 7641f18..0000000 --- a/thirdparty/libyuv/include/libyuv/scale_argb.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ -#define INCLUDE_LIBYUV_SCALE_ARGB_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/scale.h" // For FilterMode - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -LIBYUV_API -int ARGBScale(const uint8_t* src_argb, - int src_stride_argb, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Clipped scale takes destination rectangle coordinates for clip values. -LIBYUV_API -int ARGBScaleClip(const uint8_t* src_argb, - int src_stride_argb, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering); - -// Scale with YUV conversion to ARGB and clipping. -LIBYUV_API -int YUVToARGBScaleClip(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint32_t src_fourcc, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - uint32_t dst_fourcc, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ diff --git a/thirdparty/libyuv/include/libyuv/scale_row.h b/thirdparty/libyuv/include/libyuv/scale_row.h deleted file mode 100644 index 833af1c..0000000 --- a/thirdparty/libyuv/include/libyuv/scale_row.h +++ /dev/null @@ -1,1727 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ -#define INCLUDE_LIBYUV_SCALE_ROW_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/scale.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// GCC >= 4.7.0 required for AVX2. -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) -#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) -#define GCC_HAS_AVX2 1 -#endif // GNUC >= 4.7 -#endif // __GNUC__ - -// clang >= 3.4.0 required for AVX2. -#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) -#define CLANG_HAS_AVX2 1 -#endif // clang >= 3.4 -#endif // __clang__ - -// Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ - _MSC_VER >= 1700 -#define VISUALC_HAS_AVX2 1 -#endif // VisualStudio >= 2012 - -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#define HAS_FIXEDDIV1_X86 -#define HAS_FIXEDDIV_X86 -#define HAS_SCALEADDROW_SSE2 -#define HAS_SCALEARGBCOLS_SSE2 -#define HAS_SCALEARGBCOLSUP2_SSE2 -#define HAS_SCALEARGBFILTERCOLS_SSSE3 -#define HAS_SCALEARGBROWDOWN2_SSE2 -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -#define HAS_SCALECOLSUP2_SSE2 -#define HAS_SCALEFILTERCOLS_SSSE3 -#define HAS_SCALEROWDOWN2_SSSE3 -#define HAS_SCALEROWDOWN34_SSSE3 -#define HAS_SCALEROWDOWN38_SSSE3 -#define HAS_SCALEROWDOWN4_SSSE3 -#endif - -// The following are available for gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) -#define HAS_SCALEUVROWDOWN2BOX_SSSE3 -#define HAS_SCALEROWUP2LINEAR_SSE2 -#define HAS_SCALEROWUP2LINEAR_SSSE3 -#define HAS_SCALEROWUP2BILINEAR_SSE2 -#define HAS_SCALEROWUP2BILINEAR_SSSE3 -#define HAS_SCALEROWUP2LINEAR_12_SSSE3 -#define HAS_SCALEROWUP2BILINEAR_12_SSSE3 -#define HAS_SCALEROWUP2LINEAR_16_SSE2 -#define HAS_SCALEROWUP2BILINEAR_16_SSE2 -#define HAS_SCALEUVROWUP2LINEAR_SSSE3 -#define HAS_SCALEUVROWUP2BILINEAR_SSSE3 -#define HAS_SCALEUVROWUP2LINEAR_16_SSE2 -#define HAS_SCALEUVROWUP2BILINEAR_16_SSE2 -#endif - -// The following are available for gcc/clang x86 platforms, but -// require clang 3.4 or gcc 4.7. 
-// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) -#define HAS_SCALEUVROWDOWN2BOX_AVX2 -#define HAS_SCALEROWUP2LINEAR_AVX2 -#define HAS_SCALEROWUP2BILINEAR_AVX2 -#define HAS_SCALEROWUP2LINEAR_12_AVX2 -#define HAS_SCALEROWUP2BILINEAR_12_AVX2 -#define HAS_SCALEROWUP2LINEAR_16_AVX2 -#define HAS_SCALEROWUP2BILINEAR_16_AVX2 -#define HAS_SCALEUVROWUP2LINEAR_AVX2 -#define HAS_SCALEUVROWUP2BILINEAR_AVX2 -#define HAS_SCALEUVROWUP2LINEAR_16_AVX2 -#define HAS_SCALEUVROWUP2BILINEAR_16_AVX2 -#endif - -// The following are available on all x86 platforms, but -// require VS2012, clang 3.4 or gcc 4.7. -// The code supports NaCL but requires a new compiler and validator. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ - defined(GCC_HAS_AVX2)) -#define HAS_SCALEADDROW_AVX2 -#define HAS_SCALEROWDOWN2_AVX2 -#define HAS_SCALEROWDOWN4_AVX2 -#endif - -// The following are available on Neon platforms: -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_SCALEADDROW_NEON -#define HAS_SCALEARGBCOLS_NEON -#define HAS_SCALEARGBFILTERCOLS_NEON -#define HAS_SCALEARGBROWDOWN2_NEON -#define HAS_SCALEARGBROWDOWNEVEN_NEON -#define HAS_SCALEFILTERCOLS_NEON -#define HAS_SCALEROWDOWN2_NEON -#define HAS_SCALEROWDOWN34_NEON -#define HAS_SCALEROWDOWN38_NEON -#define HAS_SCALEROWDOWN4_NEON -#define HAS_SCALEUVROWDOWN2BOX_NEON -#define HAS_SCALEUVROWDOWNEVEN_NEON -#define HAS_SCALEROWUP2LINEAR_NEON -#define HAS_SCALEROWUP2BILINEAR_NEON -#define HAS_SCALEROWUP2LINEAR_12_NEON -#define HAS_SCALEROWUP2BILINEAR_12_NEON -#define HAS_SCALEROWUP2LINEAR_16_NEON -#define HAS_SCALEROWUP2BILINEAR_16_NEON -#define HAS_SCALEUVROWUP2LINEAR_NEON -#define HAS_SCALEUVROWUP2BILINEAR_NEON -#define HAS_SCALEUVROWUP2LINEAR_16_NEON -#define HAS_SCALEUVROWUP2BILINEAR_16_NEON -#endif - -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#define HAS_SCALEADDROW_MSA -#define HAS_SCALEARGBCOLS_MSA -#define HAS_SCALEARGBFILTERCOLS_MSA -#define HAS_SCALEARGBROWDOWN2_MSA -#define HAS_SCALEARGBROWDOWNEVEN_MSA -#define HAS_SCALEFILTERCOLS_MSA -#define HAS_SCALEROWDOWN2_MSA -#define HAS_SCALEROWDOWN34_MSA -#define HAS_SCALEROWDOWN38_MSA -#define HAS_SCALEROWDOWN4_MSA -#endif - -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) -#define HAS_FIXEDDIV1_MIPS -#define HAS_FIXEDDIV_MIPS -#define HAS_SCALEADDROW_16_MMI -#define HAS_SCALEADDROW_MMI -#define HAS_SCALEARGBCOLS_MMI -#define HAS_SCALEARGBCOLSUP2_MMI -#define HAS_SCALEARGBROWDOWN2_MMI -#define HAS_SCALEARGBROWDOWNEVEN_MMI -#define HAS_SCALECOLS_16_MMI -#define HAS_SCALECOLS_MMI -#define HAS_SCALEROWDOWN2_16_MMI -#define HAS_SCALEROWDOWN2_MMI -#define HAS_SCALEROWDOWN4_16_MMI -#define HAS_SCALEROWDOWN4_MMI -#define HAS_SCALEROWDOWN34_MMI -#endif - -// Scale ARGB vertically with bilinear interpolation. -void ScalePlaneVertical(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int y, - int dy, - int bpp, - enum FilterMode filtering); - -void ScalePlaneVertical_16(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_argb, - uint16_t* dst_argb, - int x, - int y, - int dy, - int wpp, - enum FilterMode filtering); - -// Simplify the filtering based on scale factors. 
-enum FilterMode ScaleFilterReduce(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_C(int num, int div); -int FixedDiv_X86(int num, int div); -int FixedDiv_MIPS(int num, int div); -// Divide num - 1 by div - 1 and return as 16.16 fixed point result. -int FixedDiv1_C(int num, int div); -int FixedDiv1_X86(int num, int div); -int FixedDiv1_MIPS(int num, int div); -#ifdef HAS_FIXEDDIV_X86 -#define FixedDiv FixedDiv_X86 -#define FixedDiv1 FixedDiv1_X86 -#elif defined HAS_FIXEDDIV_MIPS -#define FixedDiv FixedDiv_MIPS -#define FixedDiv1 FixedDiv1_MIPS -#else -#define FixedDiv FixedDiv_C -#define FixedDiv1 FixedDiv1_C -#endif - -// Compute slope values for stepping. -void ScaleSlope(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering, - int* x, - int* y, - int* dx, - int* dy); - -void ScaleRowDown2_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown2Linear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown2Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown4_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown4Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown34_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown34_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width); -void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* d, - int dst_width); -void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width); -void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* d, - int dst_width); - -void ScaleRowUp2_Linear_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void 
ScaleRowUp2_Linear_16_Any_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleCols_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleCols_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleColsUp2_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int, - int); -void ScaleColsUp2_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int, - int); -void ScaleFilterCols_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleFilterCols_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleFilterCols64_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x32, - int dx); -void ScaleFilterCols64_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x32, - int dx); -void ScaleRowDown38_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown38_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - int dst_width); -void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_16_C(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width); -void ScaleARGBRowDown2_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEven_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBCols_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBCols64_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x32, - int dx); -void ScaleARGBColsUp2_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int, - int); -void ScaleARGBFilterCols_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols64_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x32, - int dx); -void ScaleUVRowDown2_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Linear_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEven_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int 
dst_width); -void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); - -void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_Any_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVCols_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x, - int dx); -void ScaleUVCols64_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x32, - int dx); -void ScaleUVColsUp2_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int, - int); -void ScaleUVFilterCols_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x, - int dx); -void ScaleUVFilterCols64_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x32, - int dx); - -// Specialized scalers for x86. -void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowUp2_Linear_SSE2(const uint8_t* 
src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, 
- ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); -void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); - -void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); - -// ARGB Column functions -void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBCols_MSA(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int 
dx); -void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleARGBCols_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); - -// ARGB Row functions -void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Linear_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - 
ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEven_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); - -// UV Row functions -void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleUVRowDown2_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); 
-void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Linear_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEven_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - 
int32_t src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -// ScaleRowDown2Box also used by planar functions -// NEON downscalers with interpolation. - -// Note - not static due to reuse in convert for 444 to 420. 
-void ScaleRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); - -void ScaleRowDown4_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -// Down scale from 4 to 3 pixels. Use the neon multilane read/write -// to load up the every 4th pixel into a 4 different registers. -// Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -// 32 -> 12 -void ScaleRowDown38_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -// 32x3 -> 12x1 -void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -// 32 -> 12 -void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -// 32x3 -> 12x1 -void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void 
ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_Any_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_Any_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); - -void ScaleFilterCols_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); - -void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); - -void ScaleRowDown2_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown38_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleFilterCols_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleRowDown34_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown34_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width); -void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width); - -void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr, - 
ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleAddRow_Any_MSA(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); -void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowDown2_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown2Box_Odd_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_16_MMI(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width); -void ScaleColsUp2_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); - -void ScaleRowDown2_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleAddRow_Any_MMI(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ diff --git a/thirdparty/libyuv/include/libyuv/scale_uv.h b/thirdparty/libyuv/include/libyuv/scale_uv.h deleted file mode 100644 index 8e74e31..0000000 --- a/thirdparty/libyuv/include/libyuv/scale_uv.h 
+++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2020 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_SCALE_UV_H_ -#define INCLUDE_LIBYUV_SCALE_UV_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/scale.h" // For FilterMode - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -LIBYUV_API -int UVScale(const uint8_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint8_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Scale a 16 bit UV image. -// This function is currently incomplete, it can't handle all cases. -LIBYUV_API -int UVScale_16(const uint16_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint16_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_SCALE_UV_H_ diff --git a/thirdparty/libyuv/include/libyuv/version.h b/thirdparty/libyuv/include/libyuv/version.h deleted file mode 100644 index d720d48..0000000 --- a/thirdparty/libyuv/include/libyuv/version.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_VERSION_H_ -#define INCLUDE_LIBYUV_VERSION_H_ - -#define LIBYUV_VERSION 1787 - -#endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/thirdparty/libyuv/include/libyuv/video_common.h b/thirdparty/libyuv/include/libyuv/video_common.h deleted file mode 100644 index 32b8a52..0000000 --- a/thirdparty/libyuv/include/libyuv/video_common.h +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Common definitions for video, including fourcc and VideoFormat. - -#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ -#define INCLUDE_LIBYUV_VIDEO_COMMON_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -////////////////////////////////////////////////////////////////////////////// -// Definition of FourCC codes -////////////////////////////////////////////////////////////////////////////// - -// Convert four characters to a FourCC code. -// Needs to be a macro otherwise the OS X compiler complains when the kFormat* -// constants are used in a switch. 
-#ifdef __cplusplus -#define FOURCC(a, b, c, d) \ - ((static_cast<uint32_t>(a)) | (static_cast<uint32_t>(b) << 8) | \ - (static_cast<uint32_t>(c) << 16) | /* NOLINT */ \ - (static_cast<uint32_t>(d) << 24)) /* NOLINT */ -#else -#define FOURCC(a, b, c, d) \ - (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ - ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ -#endif - -// Some pages discussing FourCC codes: -// http://www.fourcc.org/yuv.php -// http://v4l2spec.bytesex.org/spec/book1.htm -// http://developer.apple.com/quicktime/icefloe/dispatch020.html -// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12 -// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt - -// FourCC codes grouped according to implementation efficiency. -// Primary formats should convert in 1 efficient step. -// Secondary formats are converted in 2 steps. -// Auxilliary formats call primary converters. -enum FourCC { - // 10 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. - FOURCC_I420 = FOURCC('I', '4', '2', '0'), - FOURCC_I422 = FOURCC('I', '4', '2', '2'), - FOURCC_I444 = FOURCC('I', '4', '4', '4'), - FOURCC_I400 = FOURCC('I', '4', '0', '0'), - FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), - FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), - FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), - FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), - FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420 - FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422 - - // 1 Secondary YUV format: row biplanar. deprecated. - FOURCC_M420 = FOURCC('M', '4', '2', '0'), - - // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc 2 64 bpp - FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), - FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), - FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), - FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. - FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit - FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel. - FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit - FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), - FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), - FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), - FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. - FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. - FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. - - // 1 Primary Compressed YUV format. - FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), - - // 14 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. - FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), - FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), - FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), - FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
- FOURCC_J420 = - FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J422 = - FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J444 = - FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J400 = - FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_F420 = FOURCC('F', '4', '2', '0'), // bt.709 full, unofficial fourcc - FOURCC_F422 = FOURCC('F', '4', '2', '2'), // bt.709 full, unofficial fourcc - FOURCC_F444 = FOURCC('F', '4', '4', '4'), // bt.709 full, unofficial fourcc - FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc - FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc - FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc - FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc - FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc - FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc - FOURCC_F010 = FOURCC('F', '0', '1', '0'), // bt.709 full range 10 bit 420 - FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420 - FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420 - FOURCC_F210 = FOURCC('F', '2', '1', '0'), // bt.709 full range 10 bit 422 - FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422 - FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422 - FOURCC_P010 = FOURCC('P', '0', '1', '0'), - FOURCC_P210 = FOURCC('P', '2', '1', '0'), - - // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. - FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. - FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. - FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. - FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2. - FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. - FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. - FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. - FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. - FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. - FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. - FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. - FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. - FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB - FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB - FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. - FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. - FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. - - // deprecated formats. Not supported, but defined for backward compatibility. - FOURCC_I411 = FOURCC('I', '4', '1', '1'), - FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), - FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), - FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), - FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), - FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), - FOURCC_H264 = FOURCC('H', '2', '6', '4'), - - // Match any fourcc. - FOURCC_ANY = -1, -}; - -enum FourCCBpp { - // Canonical fourcc codes used in our code. 
- FOURCC_BPP_I420 = 12, - FOURCC_BPP_I422 = 16, - FOURCC_BPP_I444 = 24, - FOURCC_BPP_I411 = 12, - FOURCC_BPP_I400 = 8, - FOURCC_BPP_NV21 = 12, - FOURCC_BPP_NV12 = 12, - FOURCC_BPP_YUY2 = 16, - FOURCC_BPP_UYVY = 16, - FOURCC_BPP_M420 = 12, // deprecated - FOURCC_BPP_Q420 = 12, - FOURCC_BPP_ARGB = 32, - FOURCC_BPP_BGRA = 32, - FOURCC_BPP_ABGR = 32, - FOURCC_BPP_RGBA = 32, - FOURCC_BPP_AR30 = 32, - FOURCC_BPP_AB30 = 32, - FOURCC_BPP_AR64 = 64, - FOURCC_BPP_AB64 = 64, - FOURCC_BPP_24BG = 24, - FOURCC_BPP_RAW = 24, - FOURCC_BPP_RGBP = 16, - FOURCC_BPP_RGBO = 16, - FOURCC_BPP_R444 = 16, - FOURCC_BPP_RGGB = 8, - FOURCC_BPP_BGGR = 8, - FOURCC_BPP_GRBG = 8, - FOURCC_BPP_GBRG = 8, - FOURCC_BPP_YV12 = 12, - FOURCC_BPP_YV16 = 16, - FOURCC_BPP_YV24 = 24, - FOURCC_BPP_YU12 = 12, - FOURCC_BPP_J420 = 12, - FOURCC_BPP_J400 = 8, - FOURCC_BPP_H420 = 12, - FOURCC_BPP_H422 = 16, - FOURCC_BPP_I010 = 15, - FOURCC_BPP_I210 = 20, - FOURCC_BPP_H010 = 15, - FOURCC_BPP_H210 = 20, - FOURCC_BPP_P010 = 15, - FOURCC_BPP_P210 = 20, - FOURCC_BPP_MJPG = 0, // 0 means unknown. - FOURCC_BPP_H264 = 0, - FOURCC_BPP_IYUV = 12, - FOURCC_BPP_YU16 = 16, - FOURCC_BPP_YU24 = 24, - FOURCC_BPP_YUYV = 16, - FOURCC_BPP_YUVS = 16, - FOURCC_BPP_HDYC = 16, - FOURCC_BPP_2VUY = 16, - FOURCC_BPP_JPEG = 1, - FOURCC_BPP_DMB1 = 1, - FOURCC_BPP_BA81 = 8, - FOURCC_BPP_RGB3 = 24, - FOURCC_BPP_BGR3 = 24, - FOURCC_BPP_CM32 = 32, - FOURCC_BPP_CM24 = 24, - - // Match any fourcc. - FOURCC_BPP_ANY = 0, // 0 means unknown. -}; - -// Converts fourcc aliases into canonical ones. -LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ diff --git a/thirdparty/libyuv/libyuv.gni b/thirdparty/libyuv/libyuv.gni deleted file mode 100644 index 8df40ba..0000000 --- a/thirdparty/libyuv/libyuv.gni +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2016 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -import("//build_overrides/build.gni") -import("//build/config/arm.gni") -import("//build/config/mips.gni") - -declare_args() { - libyuv_include_tests = !build_with_chromium - libyuv_disable_jpeg = false - libyuv_use_neon = - current_cpu == "arm64" || - (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)) - libyuv_use_msa = - (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_msa - libyuv_use_mmi = - (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_mmi -} diff --git a/thirdparty/libyuv/linux.mk b/thirdparty/libyuv/linux.mk deleted file mode 100644 index f5e73ea..0000000 --- a/thirdparty/libyuv/linux.mk +++ /dev/null @@ -1,97 +0,0 @@ -# This is a generic makefile for libyuv for gcc. 
-# make -f linux.mk CXX=clang++ - -CC?=gcc -CFLAGS?=-O2 -fomit-frame-pointer -CFLAGS+=-Iinclude/ - -CXX?=g++ -CXXFLAGS?=-O2 -fomit-frame-pointer -CXXFLAGS+=-Iinclude/ - -LOCAL_OBJ_FILES := \ - source/compare.o \ - source/compare_common.o \ - source/compare_gcc.o \ - source/compare_mmi.o \ - source/compare_msa.o \ - source/compare_neon.o \ - source/compare_neon64.o \ - source/compare_win.o \ - source/convert.o \ - source/convert_argb.o \ - source/convert_from.o \ - source/convert_from_argb.o \ - source/convert_jpeg.o \ - source/convert_to_argb.o \ - source/convert_to_i420.o \ - source/cpu_id.o \ - source/mjpeg_decoder.o \ - source/mjpeg_validate.o \ - source/planar_functions.o \ - source/rotate.o \ - source/rotate_any.o \ - source/rotate_argb.o \ - source/rotate_common.o \ - source/rotate_gcc.o \ - source/rotate_mmi.o \ - source/rotate_msa.o \ - source/rotate_neon.o \ - source/rotate_neon64.o \ - source/rotate_win.o \ - source/row_any.o \ - source/row_common.o \ - source/row_gcc.o \ - source/row_mmi.o \ - source/row_msa.o \ - source/row_neon.o \ - source/row_neon64.o \ - source/row_win.o \ - source/scale.o \ - source/scale_any.o \ - source/scale_argb.o \ - source/scale_common.o \ - source/scale_gcc.o \ - source/scale_mmi.o \ - source/scale_msa.o \ - source/scale_neon.o \ - source/scale_neon64.o \ - source/scale_uv.o \ - source/scale_win.o \ - source/video_common.o - -.cc.o: - $(CXX) -c $(CXXFLAGS) $*.cc -o $*.o - -.c.o: - $(CC) -c $(CFLAGS) $*.c -o $*.o - -all: libyuv.a i444tonv12_eg yuvconvert yuvconstants cpuid psnr - -libyuv.a: $(LOCAL_OBJ_FILES) - $(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES) - -# A C++ test utility that uses libyuv conversion. -yuvconvert: util/yuvconvert.cc libyuv.a - $(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/yuvconvert.cc libyuv.a - -# A C test utility that generates yuvconstants for yuv to rgb. -yuvconstants: util/yuvconstants.c libyuv.a - $(CXX) $(CXXFLAGS) -Iutil/ -lm -o $@ util/yuvconstants.c libyuv.a - -# A standalone test utility -psnr: util/psnr.cc - $(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc - -# A simple conversion example. -i444tonv12_eg: util/i444tonv12_eg.cc libyuv.a - $(CXX) $(CXXFLAGS) -o $@ util/i444tonv12_eg.cc libyuv.a - -# A C test utility that uses libyuv conversion from C. -# gcc 4.4 and older require -fno-exceptions to avoid link error on __gxx_personality_v0 -# CC=gcc-4.4 CXXFLAGS=-fno-exceptions CXX=g++-4.4 make -f linux.mk -cpuid: util/cpuid.c libyuv.a - $(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a - -clean: - /bin/rm -f source/*.o *.ii *.s libyuv.a i444tonv12_eg yuvconvert yuvconstants cpuid psnr diff --git a/thirdparty/libyuv/public.mk b/thirdparty/libyuv/public.mk deleted file mode 100644 index 1342307..0000000 --- a/thirdparty/libyuv/public.mk +++ /dev/null @@ -1,13 +0,0 @@ -# This file contains all the common make variables which are useful for -# anyone depending on this library. -# Note that dependencies on NDK are not directly listed since NDK auto adds -# them. - -LIBYUV_INCLUDES := $(LIBYUV_PATH)/include - -LIBYUV_C_FLAGS := - -LIBYUV_CPP_FLAGS := - -LIBYUV_LDLIBS := -LIBYUV_DEP_MODULES := diff --git a/thirdparty/libyuv/pylintrc b/thirdparty/libyuv/pylintrc deleted file mode 100644 index b8bea33..0000000 --- a/thirdparty/libyuv/pylintrc +++ /dev/null @@ -1,17 +0,0 @@ -[MESSAGES CONTROL] - -# Disable the message, report, category or checker with the given id(s). -# TODO(kjellander): Reduce this list to as small as possible. 
-disable=I0010,I0011,bad-continuation,broad-except,duplicate-code,eval-used,exec-used,fixme,invalid-name,missing-docstring,no-init,no-member,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-branches,too-many-function-args,too-many-instance-attributes,too-many-lines,too-many-locals,too-many-public-methods,too-many-return-statements,too-many-statements
-
-
-[REPORTS]
-
-# Don't write out full reports, just messages.
-reports=no
-
-
-[FORMAT]
-
-# We use two spaces for indents, instead of the usual four spaces or tab.
-indent-string='  '
diff --git a/thirdparty/libyuv/source/compare.cc b/thirdparty/libyuv/source/compare.cc
deleted file mode 100644
index e93aba1..0000000
--- a/thirdparty/libyuv/source/compare.cc
+++ /dev/null
@@ -1,440 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/compare.h"
-
-#include <float.h>
-#include <math.h>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include "libyuv/basic_types.h"
-#include "libyuv/compare_row.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/row.h"
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// hash seed of 5381 recommended.
-LIBYUV_API
-uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
-  const int kBlockSize = 1 << 15;  // 32768;
-  int remainder;
-  uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) =
-      HashDjb2_C;
-#if defined(HAS_HASHDJB2_SSE41)
-  if (TestCpuFlag(kCpuHasSSE41)) {
-    HashDjb2_SSE = HashDjb2_SSE41;
-  }
-#endif
-#if defined(HAS_HASHDJB2_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    HashDjb2_SSE = HashDjb2_AVX2;
-  }
-#endif
-
-  while (count >= (uint64_t)(kBlockSize)) {
-    seed = HashDjb2_SSE(src, kBlockSize, seed);
-    src += kBlockSize;
-    count -= kBlockSize;
-  }
-  remainder = (int)count & ~15;
-  if (remainder) {
-    seed = HashDjb2_SSE(src, remainder, seed);
-    src += remainder;
-    count -= remainder;
-  }
-  remainder = (int)count & 15;
-  if (remainder) {
-    seed = HashDjb2_C(src, remainder, seed);
-  }
-  return seed;
-}
-
-static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
-      return FOURCC_BGRA;
-    }
-    if (argb[3] != 255) {  // Fourth byte is not Alpha of 255, so not BGRA.
-      return FOURCC_ARGB;
-    }
-    if (argb[4] != 255) {  // Second pixel first byte is not Alpha of 255.
-      return FOURCC_BGRA;
-    }
-    if (argb[7] != 255) {  // Second pixel fourth byte is not Alpha of 255.
-      return FOURCC_ARGB;
-    }
-    argb += 8;
-  }
-  if (width & 1) {
-    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
-      return FOURCC_BGRA;
-    }
-    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
-      return FOURCC_ARGB;
-    }
-  }
-  return 0;
-}
-
-// Scan an opaque argb image and return fourcc based on alpha offset.
-// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
-LIBYUV_API
-uint32_t ARGBDetect(const uint8_t* argb,
-                    int stride_argb,
-                    int width,
-                    int height) {
-  uint32_t fourcc = 0;
-  int h;
-
-  // Coalesce rows.
- if (stride_argb == width * 4) { - width *= height; - height = 1; - stride_argb = 0; - } - for (h = 0; h < height && fourcc == 0; ++h) { - fourcc = ARGBDetectRow_C(argb, width); - argb += stride_argb; - } - return fourcc; -} - -// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. -// So actual maximum is 1 less loop, which is 64436 - 32 bytes. - -LIBYUV_API -uint64_t ComputeHammingDistance(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - const int kBlockSize = 1 << 15; // 32768; - const int kSimdSize = 64; - // SIMD for multiple of 64, and C for remainder - int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); - uint64_t diff = 0; - int i; - uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b, - int count) = HammingDistance_C; -#if defined(HAS_HAMMINGDISTANCE_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - HammingDistance = HammingDistance_NEON; - } -#endif -#if defined(HAS_HAMMINGDISTANCE_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - HammingDistance = HammingDistance_SSSE3; - } -#endif -#if defined(HAS_HAMMINGDISTANCE_SSE42) - if (TestCpuFlag(kCpuHasSSE42)) { - HammingDistance = HammingDistance_SSE42; - } -#endif -#if defined(HAS_HAMMINGDISTANCE_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - HammingDistance = HammingDistance_AVX2; - } -#endif -#if defined(HAS_HAMMINGDISTANCE_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - HammingDistance = HammingDistance_MMI; - } -#endif -#if defined(HAS_HAMMINGDISTANCE_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - HammingDistance = HammingDistance_MSA; - } -#endif - -#ifdef _OPENMP -#pragma omp parallel for reduction(+ : diff) -#endif - for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { - diff += HammingDistance(src_a + i, src_b + i, kBlockSize); - } - src_a += count & ~(kBlockSize - 1); - src_b += count & ~(kBlockSize - 1); - if (remainder) { - diff += HammingDistance(src_a, src_b, remainder); - src_a += remainder; - src_b += remainder; - } - remainder = count & (kSimdSize - 1); - if (remainder) { - diff += HammingDistance_C(src_a, src_b, remainder); - } - return diff; -} - -// TODO(fbarchard): Refactor into row function. -LIBYUV_API -uint64_t ComputeSumSquareError(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - // SumSquareError returns values 0 to 65535 for each squared difference. - // Up to 65536 of those can be summed and remain within a uint32_t. - // After each block of 65536 pixels, accumulate into a uint64_t. - const int kBlockSize = 65536; - int remainder = count & (kBlockSize - 1) & ~31; - uint64_t sse = 0; - int i; - uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, - int count) = SumSquareError_C; -#if defined(HAS_SUMSQUAREERROR_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SumSquareError = SumSquareError_NEON; - } -#endif -#if defined(HAS_SUMSQUAREERROR_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - // Note only used for multiples of 16 so count is not checked. - SumSquareError = SumSquareError_SSE2; - } -#endif -#if defined(HAS_SUMSQUAREERROR_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - // Note only used for multiples of 32 so count is not checked. 
- SumSquareError = SumSquareError_AVX2; - } -#endif -#if defined(HAS_SUMSQUAREERROR_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SumSquareError = SumSquareError_MMI; - } -#endif -#if defined(HAS_SUMSQUAREERROR_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SumSquareError = SumSquareError_MSA; - } -#endif -#ifdef _OPENMP -#pragma omp parallel for reduction(+ : sse) -#endif - for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { - sse += SumSquareError(src_a + i, src_b + i, kBlockSize); - } - src_a += count & ~(kBlockSize - 1); - src_b += count & ~(kBlockSize - 1); - if (remainder) { - sse += SumSquareError(src_a, src_b, remainder); - src_a += remainder; - src_b += remainder; - } - remainder = count & 31; - if (remainder) { - sse += SumSquareError_C(src_a, src_b, remainder); - } - return sse; -} - -LIBYUV_API -uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height) { - uint64_t sse = 0; - int h; - // Coalesce rows. - if (stride_a == width && stride_b == width) { - width *= height; - height = 1; - stride_a = stride_b = 0; - } - for (h = 0; h < height; ++h) { - sse += ComputeSumSquareError(src_a, src_b, width); - src_a += stride_a; - src_b += stride_b; - } - return sse; -} - -LIBYUV_API -double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) { - double psnr; - if (sse > 0) { - double mse = (double)count / (double)sse; - psnr = 10.0 * log10(255.0 * 255.0 * mse); - } else { - psnr = kMaxPsnr; // Limit to prevent divide by 0 - } - - if (psnr > kMaxPsnr) { - psnr = kMaxPsnr; - } - - return psnr; -} - -LIBYUV_API -double CalcFramePsnr(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height) { - const uint64_t samples = (uint64_t)width * (uint64_t)height; - const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, - stride_b, width, height); - return SumSquareErrorToPsnr(sse, samples); -} - -LIBYUV_API -double I420Psnr(const uint8_t* src_y_a, - int stride_y_a, - const uint8_t* src_u_a, - int stride_u_a, - const uint8_t* src_v_a, - int stride_v_a, - const uint8_t* src_y_b, - int stride_y_b, - const uint8_t* src_u_b, - int stride_u_b, - const uint8_t* src_v_b, - int stride_v_b, - int width, - int height) { - const uint64_t sse_y = ComputeSumSquareErrorPlane( - src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); - const int width_uv = (width + 1) >> 1; - const int height_uv = (height + 1) >> 1; - const uint64_t sse_u = ComputeSumSquareErrorPlane( - src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); - const uint64_t sse_v = ComputeSumSquareErrorPlane( - src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); - const uint64_t samples = (uint64_t)width * (uint64_t)height + - 2 * ((uint64_t)width_uv * (uint64_t)height_uv); - const uint64_t sse = sse_y + sse_u + sse_v; - return SumSquareErrorToPsnr(sse, samples); -} - -static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 -static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 - -static double Ssim8x8_C(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b) { - int64_t sum_a = 0; - int64_t sum_b = 0; - int64_t sum_sq_a = 0; - int64_t sum_sq_b = 0; - int64_t sum_axb = 0; - - int i; - for (i = 0; i < 8; ++i) { - int j; - for (j = 0; j < 8; ++j) { - sum_a += src_a[j]; - sum_b += src_b[j]; - sum_sq_a += src_a[j] * src_a[j]; - sum_sq_b += src_b[j] * src_b[j]; - sum_axb += src_a[j] * src_b[j]; - } - - src_a += stride_a; - src_b += stride_b; - } - - { - const int64_t 
count = 64; - // scale the constants by number of pixels - const int64_t c1 = (cc1 * count * count) >> 12; - const int64_t c2 = (cc2 * count * count) >> 12; - - const int64_t sum_a_x_sum_b = sum_a * sum_b; - - const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) * - (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); - - const int64_t sum_a_sq = sum_a * sum_a; - const int64_t sum_b_sq = sum_b * sum_b; - - const int64_t ssim_d = - (sum_a_sq + sum_b_sq + c1) * - (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); - - if (ssim_d == 0.0) { - return DBL_MAX; - } - return ssim_n * 1.0 / ssim_d; - } -} - -// We are using a 8x8 moving window with starting location of each 8x8 window -// on the 4x4 pixel grid. Such arrangement allows the windows to overlap -// block boundaries to penalize blocking artifacts. -LIBYUV_API -double CalcFrameSsim(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height) { - int samples = 0; - double ssim_total = 0; - double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b, - int stride_b) = Ssim8x8_C; - - // sample point start with each 4x4 location - int i; - for (i = 0; i < height - 8; i += 4) { - int j; - for (j = 0; j < width - 8; j += 4) { - ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); - samples++; - } - - src_a += stride_a * 4; - src_b += stride_b * 4; - } - - ssim_total /= samples; - return ssim_total; -} - -LIBYUV_API -double I420Ssim(const uint8_t* src_y_a, - int stride_y_a, - const uint8_t* src_u_a, - int stride_u_a, - const uint8_t* src_v_a, - int stride_v_a, - const uint8_t* src_y_b, - int stride_y_b, - const uint8_t* src_u_b, - int stride_u_b, - const uint8_t* src_v_b, - int stride_v_b, - int width, - int height) { - const double ssim_y = - CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); - const int width_uv = (width + 1) >> 1; - const int height_uv = (height + 1) >> 1; - const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b, - width_uv, height_uv); - const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b, - width_uv, height_uv); - return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/compare_common.cc b/thirdparty/libyuv/source/compare_common.cc deleted file mode 100644 index d1cab8d..0000000 --- a/thirdparty/libyuv/source/compare_common.cc +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Hakmem method for hamming distance. 
-uint32_t HammingDistance_C(const uint8_t* src_a,
-                           const uint8_t* src_b,
-                           int count) {
-  uint32_t diff = 0u;
-
-  int i;
-  for (i = 0; i < count - 3; i += 4) {
-    uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b);
-    uint32_t u = x - ((x >> 1) & 0x55555555);
-    u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
-    diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
-    src_a += 4;
-    src_b += 4;
-  }
-
-  for (; i < count; ++i) {
-    uint32_t x = *src_a ^ *src_b;
-    uint32_t u = x - ((x >> 1) & 0x55);
-    u = ((u >> 2) & 0x33) + (u & 0x33);
-    diff += (u + (u >> 4)) & 0x0f;
-    src_a += 1;
-    src_b += 1;
-  }
-
-  return diff;
-}
-
-uint32_t SumSquareError_C(const uint8_t* src_a,
-                          const uint8_t* src_b,
-                          int count) {
-  uint32_t sse = 0u;
-  int i;
-  for (i = 0; i < count; ++i) {
-    int diff = src_a[i] - src_b[i];
-    sse += (uint32_t)(diff * diff);
-  }
-  return sse;
-}
-
-// hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) {
-  uint32_t hash = seed;
-  int i;
-  for (i = 0; i < count; ++i) {
-    hash += (hash << 5) + src[i];
-  }
-  return hash;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/thirdparty/libyuv/source/compare_gcc.cc b/thirdparty/libyuv/source/compare_gcc.cc
deleted file mode 100644
index 7dcbf7d..0000000
--- a/thirdparty/libyuv/source/compare_gcc.cc
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__))
-
-#if defined(__x86_64__)
-uint32_t HammingDistance_SSE42(const uint8_t* src_a,
-                               const uint8_t* src_b,
-                               int count) {
-  uint64_t diff = 0u;
-
-  asm volatile(
-      "xor %3,%3 \n"
-      "xor %%r8,%%r8 \n"
-      "xor %%r9,%%r9 \n"
-      "xor %%r10,%%r10 \n"
-
-      // Process 32 bytes per loop.
-      LABELALIGN
-      "1: \n"
-      "mov (%0),%%rcx \n"
-      "mov 0x8(%0),%%rdx \n"
-      "xor (%1),%%rcx \n"
-      "xor 0x8(%1),%%rdx \n"
-      "popcnt %%rcx,%%rcx \n"
-      "popcnt %%rdx,%%rdx \n"
-      "mov 0x10(%0),%%rsi \n"
-      "mov 0x18(%0),%%rdi \n"
-      "xor 0x10(%1),%%rsi \n"
-      "xor 0x18(%1),%%rdi \n"
-      "popcnt %%rsi,%%rsi \n"
-      "popcnt %%rdi,%%rdi \n"
-      "add $0x20,%0 \n"
-      "add $0x20,%1 \n"
-      "add %%rcx,%3 \n"
-      "add %%rdx,%%r8 \n"
-      "add %%rsi,%%r9 \n"
-      "add %%rdi,%%r10 \n"
-      "sub $0x20,%2 \n"
-      "jg 1b \n"
-
-      "add %%r8, %3 \n"
-      "add %%r9, %3 \n"
-      "add %%r10, %3 \n"
-      : "+r"(src_a),  // %0
-        "+r"(src_b),  // %1
-        "+r"(count),  // %2
-        "=r"(diff)    // %3
-      :
-      : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
-
-  return static_cast<uint32_t>(diff);
-}
-#else
-uint32_t HammingDistance_SSE42(const uint8_t* src_a,
-                               const uint8_t* src_b,
-                               int count) {
-  uint32_t diff = 0u;
-
-  asm volatile(
-      // Process 16 bytes per loop.
- LABELALIGN - "1: \n" - "mov (%0),%%ecx \n" - "mov 0x4(%0),%%edx \n" - "xor (%1),%%ecx \n" - "xor 0x4(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "mov 0x8(%0),%%ecx \n" - "mov 0xc(%0),%%edx \n" - "xor 0x8(%1),%%ecx \n" - "xor 0xc(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "add $0x10,%0 \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "+r"(diff) // %3 - : - : "memory", "cc", "ecx", "edx"); - - return diff; -} -#endif - -static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15}; -static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; - -uint32_t HammingDistance_SSSE3(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - asm volatile( - "movdqa %4,%%xmm2 \n" - "movdqa %5,%%xmm3 \n" - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqa (%0),%%xmm4 \n" - "movdqa 0x10(%0), %%xmm5 \n" - "pxor (%0,%1), %%xmm4 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pand %%xmm2,%%xmm6 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm6,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "paddb %%xmm7,%%xmm6 \n" - "pxor 0x10(%0,%1),%%xmm5 \n" - "add $0x20,%0 \n" - "movdqa %%xmm5,%%xmm4 \n" - "pand %%xmm2,%%xmm5 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm5,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufb %%xmm4,%%xmm5 \n" - "paddb %%xmm7,%%xmm5 \n" - "paddb %%xmm5,%%xmm6 \n" - "psadbw %%xmm1,%%xmm6 \n" - "paddd %%xmm6,%%xmm0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - "pshufd $0xaa,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0, %3 \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=r"(diff) // %3 - : "m"(kNibbleMask), // %4 - "m"(kBitCount) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); - - return diff; -} - -#ifdef HAS_HAMMINGDISTANCE_AVX2 -uint32_t HammingDistance_AVX2(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - asm volatile( - "vbroadcastf128 %4,%%ymm2 \n" - "vbroadcastf128 %5,%%ymm3 \n" - "vpxor %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm1,%%ymm1,%%ymm1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "vmovdqa (%0),%%ymm4 \n" - "vmovdqa 0x20(%0), %%ymm5 \n" - "vpxor (%0,%1), %%ymm4, %%ymm4 \n" - "vpand %%ymm2,%%ymm4,%%ymm6 \n" - "vpsrlw $0x4,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" - "vpand %%ymm2,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" - "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" - "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" - "add $0x40,%0 \n" - "vpand %%ymm2,%%ymm4,%%ymm5 \n" - "vpsrlw $0x4,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" - "vpand %%ymm2,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" - "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" - "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" - "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - - "vpermq $0xb1,%%ymm0,%%ymm1 \n" - "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xaa,%%ymm0,%%ymm1 \n" - "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vmovd %%xmm0, %3 \n" - "vzeroupper \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=r"(diff) // %3 - : "m"(kNibbleMask), // %4 - "m"(kBitCount) // %5 - : "memory", "cc", "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); - - return diff; -} -#endif // HAS_HAMMINGDISTANCE_AVX2 - -uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" - - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); - return sse; -} - -static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 -static const uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 -}; -static const uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 -}; -static const uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 -}; -static const uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 -}; - -uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { - uint32_t hash; - asm volatile( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" - : "+r"(src), // %0 - "+r"(count), // %1 - "+rm"(seed), // %2 - "=g"(hash) // %3 - : "m"(kHash16x33), // %4 - "m"(kHashMul0), // %5 - "m"(kHashMul1), // %6 - "m"(kHashMul2), // %7 - "m"(kHashMul3) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); - return hash; -} -#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/compare_mmi.cc b/thirdparty/libyuv/source/compare_mmi.cc deleted file mode 100644 index 7640d94..0000000 --- a/thirdparty/libyuv/source/compare_mmi.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// Hakmem method for hamming distance. -uint32_t HammingDistance_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0; - uint64_t c1 = 0x5555555555555555; - uint64_t c2 = 0x3333333333333333; - uint64_t c3 = 0x0f0f0f0f0f0f0f0f; - uint32_t c4 = 0x01010101; - uint64_t s1 = 1, s2 = 2, s3 = 4; - __asm__ volatile( - "1: \n\t" - "ldc1 %[ta], 0(%[src_a]) \n\t" - "ldc1 %[tb], 0(%[src_b]) \n\t" - "xor %[temp], %[ta], %[tb] \n\t" - "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1 - "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1 - "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1 - "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2) - "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2 - "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2 - "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t - "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4 - "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4) - "and %[temp1], %[temp1], %[c3] \n\t" //&c3 - "dmfc1 $t0, %[temp1] \n\t" - "dsrl32 $t0, $t0, 0 \n\t " - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "dmfc1 $t0, %[temp1] \n\t" - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "daddiu %[src_a], %[src_a], 8 \n\t" - "daddiu %[src_b], %[src_b], 8 \n\t" - "addiu %[count], %[count], -8 \n\t" - "bgtz %[count], 1b \n\t" - "nop \n\t" - : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b), - [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp), - [temp1] "+f"(temp1) - : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1), - [s2] "f"(s2), [s3] "f"(s3) - : "memory"); - return diff; -} - -uint32_t SumSquareError_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse = 0u; - uint32_t sse_hi = 0u, sse_lo = 0u; - - uint64_t src1, src2; - uint64_t diff, diff_hi, diff_lo; - uint64_t sse_sum, sse_tmp; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t" - - "1: \n\t" - "ldc1 %[src1], 0x00(%[src_a]) \n\t" - "ldc1 %[src2], 0x00(%[src_b]) \n\t" - "pasubub %[diff], %[src1], %[src2] \n\t" - "punpcklbh %[diff_lo], %[diff], %[mask] \n\t" - "punpckhbh %[diff_hi], %[diff], %[mask] \n\t" - "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - - "daddiu %[src_a], %[src_a], 0x08 \n\t" - "daddiu %[src_b], %[src_b], 0x08 \n\t" - "daddiu %[count], %[count], -0x08 \n\t" - "bnez %[count], 1b \n\t" - - "mfc1 %[sse_lo], %[sse_sum] \n\t" - "mfhc1 %[sse_hi], %[sse_sum] \n\t" - "daddu %[sse], %[sse_hi], %[sse_lo] \n\t" - : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1), - [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi), - [sse_sum] "=&f"(sse_sum), [sse_tmp] 
"=&f"(sse_tmp), - [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo) - : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count), - [mask] "f"(mask) - : "memory"); - - return sse; -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/compare_msa.cc b/thirdparty/libyuv/source/compare_msa.cc deleted file mode 100644 index 0b807d3..0000000 --- a/thirdparty/libyuv/source/compare_msa.cc +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2017 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" -#include "libyuv/row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "libyuv/macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -uint32_t HammingDistance_MSA(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - int i; - v16u8 src0, src1, src2, src3; - v2i64 vec0 = {0}, vec1 = {0}; - - for (i = 0; i < count; i += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); - src0 ^= src2; - src1 ^= src3; - vec0 += __msa_pcnt_d((v2i64)src0); - vec1 += __msa_pcnt_d((v2i64)src1); - src_a += 32; - src_b += 32; - } - - vec0 += vec1; - diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0); - diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2); - return diff; -} - -uint32_t SumSquareError_MSA(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse = 0u; - int i; - v16u8 src0, src1, src2, src3; - v8i16 vec0, vec1, vec2, vec3; - v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0}; - v2i64 tmp0; - - for (i = 0; i < count; i += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); - vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); - reg0 = __msa_dpadd_s_w(reg0, vec0, vec0); - reg1 = __msa_dpadd_s_w(reg1, vec1, vec1); - reg2 = __msa_dpadd_s_w(reg2, vec2, vec2); - reg3 = __msa_dpadd_s_w(reg3, vec3, vec3); - src_a += 32; - src_b += 32; - } - - reg0 += reg1; - reg2 += reg3; - reg0 += reg2; - tmp0 = __msa_hadd_s_d(reg0, reg0); - sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0); - sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2); - return sse; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/thirdparty/libyuv/source/compare_neon.cc b/thirdparty/libyuv/source/compare_neon.cc deleted file mode 100644 index afdd601..0000000 --- 
a/thirdparty/libyuv/source/compare_neon.cc +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -// 256 bits at a time -// uses short accumulator which restricts count to 131 KB -uint32_t HammingDistance_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff; - - asm volatile( - "vmov.u16 q4, #0 \n" // accumulator - - "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" - "vld1.8 {q2, q3}, [%1]! \n" - "veor.32 q0, q0, q2 \n" - "veor.32 q1, q1, q3 \n" - "vcnt.i8 q0, q0 \n" - "vcnt.i8 q1, q1 \n" - "subs %2, %2, #32 \n" - "vadd.u8 q0, q0, q1 \n" // 16 byte counts - "vpadal.u8 q4, q0 \n" // 8 shorts - "bgt 1b \n" - - "vpaddl.u16 q0, q4 \n" // 4 ints - "vpadd.u32 d0, d0, d1 \n" - "vpadd.u32 d0, d0, d0 \n" - "vmov.32 %3, d0[0] \n" - - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) - : - : "cc", "q0", "q1", "q2", "q3", "q4"); - return diff; -} - -uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q11, #0 \n" - - "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q1}, [%1]! \n" - "subs %2, %2, #16 \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q8, d4, d4 \n" - "vmlal.s16 q9, d6, d6 \n" - "vmlal.s16 q10, d5, d5 \n" - "vmlal.s16 q11, d7, d7 \n" - "bgt 1b \n" - - "vadd.u32 q8, q8, q9 \n" - "vadd.u32 q10, q10, q11 \n" - "vadd.u32 q11, q8, q10 \n" - "vpaddl.u32 q1, q11 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); - return sse; -} - -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/compare_neon64.cc b/thirdparty/libyuv/source/compare_neon64.cc deleted file mode 100644 index 70fb9b9..0000000 --- a/thirdparty/libyuv/source/compare_neon64.cc +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -// 256 bits at a time -// uses short accumulator which restricts count to 131 KB -uint32_t HammingDistance_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff; - asm volatile( - "movi v4.8h, #0 \n" - - "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" - "eor v0.16b, v0.16b, v2.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "eor v1.16b, v1.16b, v3.16b \n" - "cnt v0.16b, v0.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "cnt v1.16b, v1.16b \n" - "subs %w2, %w2, #32 \n" - "add v0.16b, v0.16b, v1.16b \n" - "uadalp v4.8h, v0.16b \n" - "b.gt 1b \n" - - "uaddlv s4, v4.8h \n" - "fmov %w3, s4 \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) - : - : "cc", "v0", "v1", "v2", "v3", "v4"); - return diff; -} - -uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" - - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "prfm pldl1keep, [%1, 448] \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" - - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); - return sse; -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/compare_win.cc b/thirdparty/libyuv/source/compare_win.cc deleted file mode 100644 index 9bb27f1..0000000 --- a/thirdparty/libyuv/source/compare_win.cc +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#if defined(_MSC_VER)
-#include <intrin.h>  // For __popcnt
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for 32 bit Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
-    !defined(__clang__) && defined(_M_IX86)
-
-uint32_t HammingDistance_SSE42(const uint8_t* src_a,
-                               const uint8_t* src_b,
-                               int count) {
-  uint32_t diff = 0u;
-
-  int i;
-  for (i = 0; i < count - 3; i += 4) {
-    uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b);  // NOLINT
-    src_a += 4;
-    src_b += 4;
-    diff += __popcnt(x);
-  }
-  return diff;
-}
-
-__declspec(naked) uint32_t
-    SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
-  __asm {
-    mov eax, [esp + 4]  // src_a
-    mov edx, [esp + 8]  // src_b
-    mov ecx, [esp + 12]  // count
-    pxor xmm0, xmm0
-    pxor xmm5, xmm5
-
-  wloop:
-    movdqu xmm1, [eax]
-    lea eax, [eax + 16]
-    movdqu xmm2, [edx]
-    lea edx, [edx + 16]
-    movdqa xmm3, xmm1  // abs trick
-    psubusb xmm1, xmm2
-    psubusb xmm2, xmm3
-    por xmm1, xmm2
-    movdqa xmm2, xmm1
-    punpcklbw xmm1, xmm5
-    punpckhbw xmm2, xmm5
-    pmaddwd xmm1, xmm1
-    pmaddwd xmm2, xmm2
-    paddd xmm0, xmm1
-    paddd xmm0, xmm2
-    sub ecx, 16
-    jg wloop
-
-    pshufd xmm1, xmm0, 0xee
-    paddd xmm0, xmm1
-    pshufd xmm1, xmm0, 0x01
-    paddd xmm0, xmm1
-    movd eax, xmm0
-    ret
-  }
-}
-
-#ifdef HAS_SUMSQUAREERROR_AVX2
-// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable : 4752)
-__declspec(naked) uint32_t
-    SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
-  __asm {
-    mov eax, [esp + 4]  // src_a
-    mov edx, [esp + 8]  // src_b
-    mov ecx, [esp + 12]  // count
-    vpxor ymm0, ymm0, ymm0  // sum
-    vpxor ymm5, ymm5, ymm5  // constant 0 for unpck
-    sub edx, eax
-
-  wloop:
-    vmovdqu ymm1, [eax]
-    vmovdqu ymm2, [eax + edx]
-    lea eax, [eax + 32]
-    vpsubusb ymm3, ymm1, ymm2  // abs difference trick
-    vpsubusb ymm2, ymm2, ymm1
-    vpor ymm1, ymm2, ymm3
-    vpunpcklbw ymm2, ymm1, ymm5  // u16. mutates order.
-    vpunpckhbw ymm1, ymm1, ymm5
-    vpmaddwd ymm2, ymm2, ymm2  // square + hadd to u32.
-    vpmaddwd ymm1, ymm1, ymm1
-    vpaddd ymm0, ymm0, ymm1
-    vpaddd ymm0, ymm0, ymm2
-    sub ecx, 32
-    jg wloop
-
-    vpshufd ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
-    vpaddd ymm0, ymm0, ymm1
-    vpshufd ymm1, ymm0, 0x01  // 1 + 0 both lanes.
-    vpaddd ymm0, ymm0, ymm1
-    vpermq ymm1, ymm0, 0x02  // high + low lane.
- vpaddd ymm0, ymm0, ymm1 - vmovd eax, xmm0 - vzeroupper - ret - } -} -#endif // HAS_SUMSQUAREERROR_AVX2 - -uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 -uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 -}; -uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 -}; -uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 -}; -uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 -}; - -__declspec(naked) uint32_t - HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - movd xmm0, [esp + 12] // seed - - pxor xmm7, xmm7 // constant 0 for unpck - movdqa xmm6, xmmword ptr kHash16x33 - - wloop: - movdqu xmm1, [eax] // src[0-15] - lea eax, [eax + 16] - pmulld xmm0, xmm6 // hash *= 33 ^ 16 - movdqa xmm5, xmmword ptr kHashMul0 - movdqa xmm2, xmm1 - punpcklbw xmm2, xmm7 // src[0-7] - movdqa xmm3, xmm2 - punpcklwd xmm3, xmm7 // src[0-3] - pmulld xmm3, xmm5 - movdqa xmm5, xmmword ptr kHashMul1 - movdqa xmm4, xmm2 - punpckhwd xmm4, xmm7 // src[4-7] - pmulld xmm4, xmm5 - movdqa xmm5, xmmword ptr kHashMul2 - punpckhbw xmm1, xmm7 // src[8-15] - movdqa xmm2, xmm1 - punpcklwd xmm2, xmm7 // src[8-11] - pmulld xmm2, xmm5 - movdqa xmm5, xmmword ptr kHashMul3 - punpckhwd xmm1, xmm7 // src[12-15] - pmulld xmm1, xmm5 - paddd xmm3, xmm4 // add 16 results - paddd xmm1, xmm2 - paddd xmm1, xmm3 - - pshufd xmm2, xmm1, 0x0e // upper 2 dwords - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0x01 - paddd xmm1, xmm2 - paddd xmm0, xmm1 - sub ecx, 16 - jg wloop - - movd eax, xmm0 // return hash - ret - } -} - -// Visual C 2012 required for AVX2. -#ifdef HAS_HASHDJB2_AVX2 -__declspec(naked) uint32_t - HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - vmovd xmm0, [esp + 12] // seed - - wloop: - vpmovzxbd xmm3, [eax] // src[0-3] - vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 - vpmovzxbd xmm4, [eax + 4] // src[4-7] - vpmulld xmm3, xmm3, xmmword ptr kHashMul0 - vpmovzxbd xmm2, [eax + 8] // src[8-11] - vpmulld xmm4, xmm4, xmmword ptr kHashMul1 - vpmovzxbd xmm1, [eax + 12] // src[12-15] - vpmulld xmm2, xmm2, xmmword ptr kHashMul2 - lea eax, [eax + 16] - vpmulld xmm1, xmm1, xmmword ptr kHashMul3 - vpaddd xmm3, xmm3, xmm4 // add 16 results - vpaddd xmm1, xmm1, xmm2 - vpaddd xmm1, xmm1, xmm3 - vpshufd xmm2, xmm1, 0x0e // upper 2 dwords - vpaddd xmm1, xmm1,xmm2 - vpshufd xmm2, xmm1, 0x01 - vpaddd xmm1, xmm1, xmm2 - vpaddd xmm0, xmm0, xmm1 - sub ecx, 16 - jg wloop - - vmovd eax, xmm0 // return hash - vzeroupper - ret - } -} -#endif // HAS_HASHDJB2_AVX2 - -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert.cc b/thirdparty/libyuv/source/convert.cc deleted file mode 100644 index 69f7fb6..0000000 --- a/thirdparty/libyuv/source/convert.cc +++ /dev/null @@ -1,3148 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/convert.h" - -#include "libyuv/basic_types.h" -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" -#include "libyuv/rotate.h" -#include "libyuv/row.h" -#include "libyuv/scale.h" // For ScalePlane() -#include "libyuv/scale_uv.h" // For UVScale() - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -// Any I4xx To I420 format with mirroring. -static int I4xxToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int src_y_width, - int src_y_height, - int src_uv_width, - int src_uv_height) { - const int dst_y_width = Abs(src_y_width); - const int dst_y_height = Abs(src_y_height); - const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); - const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); - if (src_uv_width <= 0 || src_uv_height == 0) { - return -1; - } - if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, - dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); - } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, - dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, - dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); - return 0; -} - -// Copy I420 with optional flipping. -// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure -// is does row coalescing. -LIBYUV_API -int I420Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - // Copy UV planes. - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); - return 0; -} - -// Copy I010 with optional flipping. -LIBYUV_API -int I010Copy(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - // Copy UV planes. - CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); - return 0; -} - -static int Planar16bitTo8bit(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - int subsample_x, - int subsample_y, - int depth) { - int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); - int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); - int scale = 1 << (24 - depth); - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - uv_height = -uv_height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (uv_height - 1) * src_stride_u; - src_v = src_v + (uv_height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - // Convert Y plane. - Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, - height); - // Convert UV planes. - Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale, uv_width, - uv_height); - Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale, uv_width, - uv_height); - return 0; -} - -// Convert 10 bit YUV to 8 bit. 
-LIBYUV_API -int I010ToI420(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 1, - 1, 10); -} - -LIBYUV_API -int I210ToI422(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 1, - 0, 10); -} - -LIBYUV_API -int I410ToI444(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 0, - 0, 10); -} - -LIBYUV_API -int I012ToI420(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 1, - 1, 12); -} - -LIBYUV_API -int I212ToI422(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 1, - 0, 12); -} - -LIBYUV_API -int I412ToI444(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 0, - 0, 12); -} - -// Any Ix10 To I010 format with mirroring. 
-static int Ix10ToI010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height, - int subsample_x, - int subsample_y) { - const int dst_y_width = Abs(width); - const int dst_y_height = Abs(height); - const int src_uv_width = SUBSAMPLE(width, subsample_x, subsample_x); - const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y); - const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); - const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); - if (width <= 0 || height == 0) { - return -1; - } - if (dst_y) { - ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - dst_y_width, dst_y_height, kFilterBilinear); - } - ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, - dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); - ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, - dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); - return 0; -} - -LIBYUV_API -int I410ToI010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, 0, 0); -} - -LIBYUV_API -int I210ToI010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, 1, 0); -} - -// Any I[420]1[02] to P[420]1[02] format with mirroring. 
-static int IxxxToPxxx(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height, - int subsample_x, - int subsample_y, - int depth) { - const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); - const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); - if (width <= 0 || height == 0) { - return -1; - } - - ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, - depth); - MergeUVPlane_16(src_u, src_stride_u, src_v, src_stride_v, dst_uv, - dst_stride_uv, uv_width, uv_height, depth); - return 0; -} - -LIBYUV_API -int I010ToP010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, - width, height, 1, 1, 10); -} - -LIBYUV_API -int I210ToP210(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, - width, height, 1, 0, 10); -} - -LIBYUV_API -int I012ToP012(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, - width, height, 1, 1, 12); -} - -LIBYUV_API -int I212ToP212(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, - width, height, 1, 0, 12); -} - -// 422 chroma is 1/2 width, 1x height -// 420 chroma is 1/2 width, 1/2 height -LIBYUV_API -int I422ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - const int src_uv_width = SUBSAMPLE(width, 1, 1); - return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, src_uv_width, height); -} - -// TODO(fbarchard): Implement row conversion. -LIBYUV_API -int I422ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - // Allocate u and v buffers - align_buffer_64(plane_u, halfwidth * halfheight * 2); - uint8_t* plane_v = plane_u + halfwidth * halfheight; - - I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, - dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width, - height); - MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu, - halfwidth, halfheight); - free_aligned_buffer_64(plane_u); - return 0; -} - -#ifdef I422TONV21_ROW_VERSION -// Unittest fails for this version. -// 422 chroma is 1/2 width, 1x height -// 420 chroma is 1/2 width, 1/2 height -// Swap src_u and src_v to implement I422ToNV12 -LIBYUV_API -int I422ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int y; - void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_uv, int width) = MergeUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow = MergeUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_MSA; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; 
- } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height); - } - { - // Allocate 2 rows of vu. - int awidth = halfwidth * 2; - align_buffer_64(row_vu_0, awidth * 2); - uint8_t* row_vu_1 = row_vu_0 + awidth; - - for (y = 0; y < height - 1; y += 2) { - MergeUVRow(src_v, src_u, row_vu_0, halfwidth); - MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1, - halfwidth); - InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128); - src_u += src_stride_u * 2; - src_v += src_stride_v * 2; - dst_vu += dst_stride_vu; - } - if (height & 1) { - MergeUVRow(src_v, src_u, dst_vu, halfwidth); - } - free_aligned_buffer_64(row_vu_0); - } - return 0; -} -#endif // I422TONV21_ROW_VERSION - -// 444 chroma is 1x width, 1x height -// 420 chroma is 1/2 width, 1/2 height -LIBYUV_API -int I444ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, width, height); -} - -LIBYUV_API -int I444ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, - dst_stride_uv, width, height); - return 0; -} - -LIBYUV_API -int I444ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, - width, height); -} - -// I400 is greyscale typically used in MJPG -LIBYUV_API -int I400ToI420(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); - SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); - return 0; -} - -// I400 is greyscale typically used in MJPG -LIBYUV_API -int I400ToNV21(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!dst_vu || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128); - return 0; -} - -// Convert NV12 to I420. -// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm. -LIBYUV_API -int NV12ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_uv = src_uv + (halfheight - 1) * src_stride_uv; - src_stride_y = -src_stride_y; - src_stride_uv = -src_stride_uv; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } - // Coalesce rows. - if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && - dst_stride_v == halfwidth) { - halfwidth *= halfheight; - halfheight = 1; - src_stride_uv = dst_stride_u = dst_stride_v = 0; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // Split UV plane - NV12 / NV21 - SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, - halfwidth, halfheight); - - return 0; -} - -// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. 
-LIBYUV_API -int NV21ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, - dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u, - width, height); -} - -LIBYUV_API -int NV12ToNV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (width <= 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), - Abs(height), kFilterBilinear); - return 0; -} - -LIBYUV_API -int NV16ToNV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (width <= 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, - dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); - return 0; -} - -LIBYUV_API -int P010ToP410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (width <= 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), - Abs(height), kFilterBilinear); - return 0; -} - -LIBYUV_API -int P210ToP410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (width <= 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, - dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); - return 0; -} - -// Convert YUY2 to I420. -LIBYUV_API -int YUY2ToI420(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, - uint8_t* dst_u, uint8_t* dst_v, int width) = - YUY2ToUVRow_C; - void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = - YUY2ToYRow_C; - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; - src_stride_yuy2 = -src_stride_yuy2; - } -#if defined(HAS_YUY2TOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; - YUY2ToYRow = YUY2ToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - YUY2ToUVRow = YUY2ToUVRow_SSE2; - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; - YUY2ToYRow = YUY2ToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - YUY2ToUVRow = YUY2ToUVRow_AVX2; - YUY2ToYRow = YUY2ToYRow_AVX2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - YUY2ToYRow = YUY2ToYRow_Any_NEON; - YUY2ToUVRow = YUY2ToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_NEON; - YUY2ToUVRow = YUY2ToUVRow_NEON; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToYRow = YUY2ToYRow_Any_MMI; - YUY2ToUVRow = YUY2ToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - YUY2ToYRow = YUY2ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - YUY2ToUVRow = YUY2ToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - YUY2ToYRow = YUY2ToYRow_Any_MSA; - YUY2ToUVRow = YUY2ToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - YUY2ToYRow = YUY2ToYRow_MSA; - YUY2ToUVRow = YUY2ToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); - YUY2ToYRow(src_yuy2, dst_y, width); - YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); - src_yuy2 += src_stride_yuy2 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); - YUY2ToYRow(src_yuy2, dst_y, width); - } - return 0; -} - -// Convert UYVY to I420. -LIBYUV_API -int UYVYToI420(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy, - uint8_t* dst_u, uint8_t* dst_v, int width) = - UYVYToUVRow_C; - void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = - UYVYToYRow_C; - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; - src_stride_uyvy = -src_stride_uyvy; - } -#if defined(HAS_UYVYTOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - UYVYToUVRow = UYVYToUVRow_Any_SSE2; - UYVYToYRow = UYVYToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - UYVYToUVRow = UYVYToUVRow_SSE2; - UYVYToYRow = UYVYToYRow_SSE2; - } - } -#endif -#if defined(HAS_UYVYTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - UYVYToUVRow = UYVYToUVRow_Any_AVX2; - UYVYToYRow = UYVYToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - UYVYToUVRow = UYVYToUVRow_AVX2; - UYVYToYRow = UYVYToYRow_AVX2; - } - } -#endif -#if defined(HAS_UYVYTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - UYVYToYRow = UYVYToYRow_Any_NEON; - UYVYToUVRow = UYVYToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_NEON; - UYVYToUVRow = UYVYToUVRow_NEON; - } - } -#endif -#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - UYVYToYRow = UYVYToYRow_Any_MMI; - UYVYToUVRow = UYVYToUVRow_Any_MMI; - if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_MMI; - UYVYToUVRow = UYVYToUVRow_MMI; - } - } -#endif -#if defined(HAS_UYVYTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - UYVYToYRow = UYVYToYRow_Any_MSA; - UYVYToUVRow = UYVYToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - UYVYToYRow = UYVYToYRow_MSA; - UYVYToUVRow = UYVYToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); - UYVYToYRow(src_uyvy, dst_y, width); - UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); - src_uyvy += src_stride_uyvy * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); - UYVYToYRow(src_uyvy, dst_y, width); - } - return 0; -} - -// Convert AYUV to NV12. -LIBYUV_API -int AYUVToNV12(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv, - uint8_t* dst_uv, int width) = AYUVToUVRow_C; - void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = - AYUVToYRow_C; - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; - src_stride_ayuv = -src_stride_ayuv; - } -// place holders for future intel code -#if defined(HAS_AYUVTOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - AYUVToUVRow = AYUVToUVRow_Any_SSE2; - AYUVToYRow = AYUVToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - AYUVToUVRow = AYUVToUVRow_SSE2; - AYUVToYRow = AYUVToYRow_SSE2; - } - } -#endif -#if defined(HAS_AYUVTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - AYUVToUVRow = AYUVToUVRow_Any_AVX2; - AYUVToYRow = AYUVToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - AYUVToUVRow = AYUVToUVRow_AVX2; - AYUVToYRow = AYUVToYRow_AVX2; - } - } -#endif - -#if defined(HAS_AYUVTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - AYUVToYRow = AYUVToYRow_Any_NEON; - AYUVToUVRow = AYUVToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - AYUVToYRow = AYUVToYRow_NEON; - AYUVToUVRow = AYUVToUVRow_NEON; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width); - AYUVToYRow(src_ayuv, dst_y, width); - AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); - src_ayuv += src_stride_ayuv * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - AYUVToUVRow(src_ayuv, 0, dst_uv, width); - AYUVToYRow(src_ayuv, dst_y, width); - } - return 0; -} - -// Convert AYUV to NV21. -LIBYUV_API -int AYUVToNV21(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int y; - void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv, - uint8_t* dst_vu, int width) = AYUVToVURow_C; - void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = - AYUVToYRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; - src_stride_ayuv = -src_stride_ayuv; - } -// place holders for future intel code -#if defined(HAS_AYUVTOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - AYUVToVURow = AYUVToVURow_Any_SSE2; - AYUVToYRow = AYUVToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - AYUVToVURow = AYUVToVURow_SSE2; - AYUVToYRow = AYUVToYRow_SSE2; - } - } -#endif -#if defined(HAS_AYUVTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - AYUVToVURow = AYUVToVURow_Any_AVX2; - AYUVToYRow = AYUVToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - AYUVToVURow = AYUVToVURow_AVX2; - AYUVToYRow = AYUVToYRow_AVX2; - } - } -#endif - -#if defined(HAS_AYUVTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - AYUVToYRow = AYUVToYRow_Any_NEON; - AYUVToVURow = AYUVToVURow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - AYUVToYRow = AYUVToYRow_NEON; - AYUVToVURow = AYUVToVURow_NEON; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width); - AYUVToYRow(src_ayuv, dst_y, width); - AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); - src_ayuv += src_stride_ayuv * 2; - dst_y += dst_stride_y * 2; - dst_vu += dst_stride_vu; - } - if (height & 1) { - AYUVToVURow(src_ayuv, 0, dst_vu, width); - AYUVToYRow(src_ayuv, dst_y, width); - } - return 0; -} - -// Convert ARGB to I420. 
-LIBYUV_API -int ARGBToI420(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - } - return 0; -} - -// Convert BGRA to I420. -LIBYUV_API -int BGRAToI420(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, - uint8_t* dst_u, uint8_t* dst_v, int width) = - BGRAToUVRow_C; - void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) = - BGRAToYRow_C; - if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_bgra = src_bgra + (height - 1) * src_stride_bgra; - src_stride_bgra = -src_stride_bgra; - } -#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - BGRAToUVRow = BGRAToUVRow_Any_SSSE3; - BGRAToYRow = BGRAToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_SSSE3; - BGRAToYRow = BGRAToYRow_SSSE3; - } - } -#endif -#if defined(HAS_BGRATOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - BGRAToYRow = BGRAToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - BGRAToYRow = BGRAToYRow_NEON; - } - } -#endif -#if defined(HAS_BGRATOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - BGRAToUVRow = BGRAToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_NEON; - } - } -#endif -#if defined(HAS_BGRATOYROW_MMI) && defined(HAS_BGRATOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - BGRAToYRow = BGRAToYRow_Any_MMI; - BGRAToUVRow = BGRAToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - BGRAToYRow = BGRAToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_MMI; - } - } -#endif -#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - BGRAToYRow = BGRAToYRow_Any_MSA; - BGRAToUVRow = BGRAToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - BGRAToYRow = BGRAToYRow_MSA; - BGRAToUVRow = BGRAToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); - BGRAToYRow(src_bgra, dst_y, width); - BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); - src_bgra += src_stride_bgra * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); - BGRAToYRow(src_bgra, dst_y, width); - } - return 0; -} - -// Convert ABGR to I420. -LIBYUV_API -int ABGRToI420(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = - ABGRToYRow_C; - if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } -#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; - ABGRToYRow = ABGRToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; - ABGRToYRow = ABGRToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToUVRow = ABGRToUVRow_Any_AVX2; - ABGRToYRow = ABGRToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_AVX2; - ABGRToYRow = ABGRToYRow_AVX2; - } - } -#endif -#if defined(HAS_ABGRTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToYRow = ABGRToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToUVRow = ABGRToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToYRow = ABGRToYRow_Any_MMI; - ABGRToUVRow = ABGRToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MMI; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToYRow = ABGRToYRow_Any_MSA; - ABGRToUVRow = ABGRToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_MSA; - ABGRToUVRow = ABGRToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); - ABGRToYRow(src_abgr, dst_y, width); - ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); - src_abgr += src_stride_abgr * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); - ABGRToYRow(src_abgr, dst_y, width); - } - return 0; -} - -// Convert RGBA to I420. -LIBYUV_API -int RGBAToI420(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, - uint8_t* dst_u, uint8_t* dst_v, int width) = - RGBAToUVRow_C; - void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) = - RGBAToYRow_C; - if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_rgba = src_rgba + (height - 1) * src_stride_rgba; - src_stride_rgba = -src_stride_rgba; - } -#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToUVRow = RGBAToUVRow_Any_SSSE3; - RGBAToYRow = RGBAToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_SSSE3; - RGBAToYRow = RGBAToYRow_SSSE3; - } - } -#endif -#if defined(HAS_RGBATOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGBAToYRow = RGBAToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGBAToYRow = RGBAToYRow_NEON; - } - } -#endif -#if defined(HAS_RGBATOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGBAToUVRow = RGBAToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_NEON; - } - } -#endif -#if defined(HAS_RGBATOYROW_MMI) && defined(HAS_RGBATOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGBAToYRow = RGBAToYRow_Any_MMI; - RGBAToUVRow = RGBAToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGBAToYRow = RGBAToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_MMI; - } - } -#endif -#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGBAToYRow = RGBAToYRow_Any_MSA; - RGBAToUVRow = RGBAToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGBAToYRow = RGBAToYRow_MSA; - RGBAToUVRow = RGBAToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); - RGBAToYRow(src_rgba, dst_y, width); - RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width); - src_rgba += src_stride_rgba * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width); - RGBAToYRow(src_rgba, dst_y, width); - } - return 0; -} - -// Convert RGB24 to I420. -LIBYUV_API -int RGB24ToI420(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) - void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, - uint8_t* dst_u, uint8_t* dst_v, int width) = - RGB24ToUVRow_C; - void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = - RGB24ToYRow_C; -#else - void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RGB24ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; -#endif - if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; - src_stride_rgb24 = -src_stride_rgb24; - } - -// Neon version does direct RGB24 to YUV. -#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToUVRow = RGB24ToUVRow_Any_NEON; - RGB24ToYRow = RGB24ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToYRow = RGB24ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVRow = RGB24ToUVRow_NEON; - } - } - } -// MMI and MSA version does direct RGB24 to YUV. 
-#elif (defined(HAS_RGB24TOYROW_MMI) || defined(HAS_RGB24TOYROW_MSA)) -#if defined(HAS_RGB24TOYROW_MMI) && defined(HAS_RGB24TOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToUVRow = RGB24ToUVRow_Any_MMI; - RGB24ToYRow = RGB24ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB24ToYRow = RGB24ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVRow = RGB24ToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToUVRow = RGB24ToUVRow_Any_MSA; - RGB24ToYRow = RGB24ToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToYRow = RGB24ToYRow_MSA; - RGB24ToUVRow = RGB24ToUVRow_MSA; - } - } -#endif -// Other platforms do intermediate conversion from RGB24 to ARGB. -#else -#if defined(HAS_RGB24TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToARGBRow = RGB24ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#endif - - { -#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) - // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) - RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); - RGB24ToYRow(src_rgb24, dst_y, width); - RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_rgb24 += src_stride_rgb24 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) - RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); - RGB24ToYRow(src_rgb24, dst_y, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// TODO(fbarchard): Use Matrix version to implement I420 and J420. -// Convert RGB24 to J420. -LIBYUV_API -int RGB24ToJ420(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI) - void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24, - uint8_t* dst_u, uint8_t* dst_v, int width) = - RGB24ToUVJRow_C; - void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = - RGB24ToYJRow_C; -#else - void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RGB24ToARGBRow_C; - void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYJRow_C; -#endif - if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; - src_stride_rgb24 = -src_stride_rgb24; - } - -// Neon version does direct RGB24 to YUV. -#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON; - RGB24ToYJRow = RGB24ToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToYJRow = RGB24ToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVJRow = RGB24ToUVJRow_NEON; - } - } - } -// MMI and MSA version does direct RGB24 to YUV. 
-#elif (defined(HAS_RGB24TOYJROW_MMI) || defined(HAS_RGB24TOYJROW_MSA)) -#if defined(HAS_RGB24TOYJROW_MMI) && defined(HAS_RGB24TOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI; - RGB24ToYJRow = RGB24ToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB24ToYJRow = RGB24ToYJRow_MMI; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVJRow = RGB24ToUVJRow_MMI; - } - } - } -#endif -#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA; - RGB24ToYJRow = RGB24ToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToYJRow = RGB24ToYJRow_MSA; - RGB24ToUVJRow = RGB24ToUVJRow_MSA; - } - } -#endif -#else -#if defined(HAS_RGB24TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToARGBRow = RGB24ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; - } - } - } -#endif -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVJRow = ARGBToUVJRow_AVX2; - ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#endif - - { -#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) - // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) - RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); - RGB24ToYJRow(src_rgb24, dst_y, width); - RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_rgb24 += src_stride_rgb24 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) - RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width); - RGB24ToYJRow(src_rgb24, dst_y, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - ARGBToUVJRow(row, 0, dst_u, dst_v, width); - ARGBToYJRow(row, dst_y, width); -#endif - } -#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// Convert RAW to I420. -LIBYUV_API -int RAWToI420(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \ - defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI) - void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, - uint8_t* dst_v, int width) = RAWToUVRow_C; - void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = - RAWToYRow_C; -#else - void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RAWToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; -#endif - if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - -// Neon version does direct RAW to YUV. -#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToUVRow = RAWToUVRow_Any_NEON; - RAWToYRow = RAWToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToYRow = RAWToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - RAWToUVRow = RAWToUVRow_NEON; - } - } - } -// MMI and MSA version does direct RAW to YUV. 
-#elif (defined(HAS_RAWTOYROW_MMI) || defined(HAS_RAWTOYROW_MSA)) -#if defined(HAS_RAWTOYROW_MMI) && defined(HAS_RAWTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToUVRow = RAWToUVRow_Any_MMI; - RAWToYRow = RAWToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RAWToYRow = RAWToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - RAWToUVRow = RAWToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToUVRow = RAWToUVRow_Any_MSA; - RAWToYRow = RAWToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToYRow = RAWToYRow_MSA; - RAWToUVRow = RAWToUVRow_MSA; - } - } -#endif -// Other platforms do intermediate conversion from RAW to ARGB. -#else -#if defined(HAS_RAWTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToARGBRow = RAWToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } - } -#endif -#if defined(HAS_RAWTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#endif - - { -#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) - RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); - RAWToYRow(src_raw, dst_y, width); - RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); -#else - RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_raw += src_stride_raw * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) - RAWToUVRow(src_raw, 0, dst_u, dst_v, width); - RAWToYRow(src_raw, dst_y, width); -#else - RAWToARGBRow(src_raw, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// TODO(fbarchard): Use Matrix version to implement I420 and J420. -// Convert RAW to J420. 
-LIBYUV_API -int RAWToJ420(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ - defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI) - void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, - uint8_t* dst_u, uint8_t* dst_v, int width) = - RAWToUVJRow_C; - void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = - RAWToYJRow_C; -#else - void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RAWToARGBRow_C; - void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYJRow_C; -#endif - if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - -// Neon version does direct RAW to YUV. -#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToUVJRow = RAWToUVJRow_Any_NEON; - RAWToYJRow = RAWToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToYJRow = RAWToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - RAWToUVJRow = RAWToUVJRow_NEON; - } - } - } -// MMI and MSA version does direct RAW to YUV. -#elif (defined(HAS_RAWTOYJROW_MMI) || defined(HAS_RAWTOYJROW_MSA)) -#if defined(HAS_RAWTOYJROW_MMI) && defined(HAS_RAWTOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToUVJRow = RAWToUVJRow_Any_MMI; - RAWToYJRow = RAWToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RAWToYJRow = RAWToYJRow_MMI; - if (IS_ALIGNED(width, 16)) { - RAWToUVJRow = RAWToUVJRow_MMI; - } - } - } -#endif -#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToUVJRow = RAWToUVJRow_Any_MSA; - RAWToYJRow = RAWToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToYJRow = RAWToYJRow_MSA; - RAWToUVJRow = RAWToUVJRow_MSA; - } - } -#endif -#else -#if defined(HAS_RAWTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToARGBRow = RAWToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; - } - } - } -#endif -#if defined(HAS_RAWTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVJRow = ARGBToUVJRow_AVX2; - 
ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#endif - - { -#if !((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ - defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ - defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) - RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width); - RAWToYJRow(src_raw, dst_y, width); - RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); -#else - RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_raw += src_stride_raw * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ - defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) - RAWToUVJRow(src_raw, 0, dst_u, dst_v, width); - RAWToYJRow(src_raw, dst_y, width); -#else - RAWToARGBRow(src_raw, row, width); - ARGBToUVJRow(row, 0, dst_u, dst_v, width); - ARGBToYJRow(row, dst_y, width); -#endif - } -#if !((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ - defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// Convert RGB565 to I420. -LIBYUV_API -int RGB565ToI420(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) - void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, - uint8_t* dst_u, uint8_t* dst_v, int width) = - RGB565ToUVRow_C; - void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) = - RGB565ToYRow_C; -#else - void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, - int width) = RGB565ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; -#endif - if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; - src_stride_rgb565 = -src_stride_rgb565; - } - -// Neon version does direct RGB565 to YUV. -#if defined(HAS_RGB565TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB565ToUVRow = RGB565ToUVRow_Any_NEON; - RGB565ToYRow = RGB565ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB565ToYRow = RGB565ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - RGB565ToUVRow = RGB565ToUVRow_NEON; - } - } - } -// MMI and MSA version does direct RGB565 to YUV. 
-#elif (defined(HAS_RGB565TOYROW_MMI) || defined(HAS_RGB565TOYROW_MSA)) -#if defined(HAS_RGB565TOYROW_MMI) && defined(HAS_RGB565TOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB565ToUVRow = RGB565ToUVRow_Any_MMI; - RGB565ToYRow = RGB565ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB565ToYRow = RGB565ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - RGB565ToUVRow = RGB565ToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB565ToUVRow = RGB565ToUVRow_Any_MSA; - RGB565ToYRow = RGB565ToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB565ToYRow = RGB565ToYRow_MSA; - RGB565ToUVRow = RGB565ToUVRow_MSA; - } - } -#endif -// Other platforms do intermediate conversion from RGB565 to ARGB. -#else -#if defined(HAS_RGB565TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - RGB565ToARGBRow = RGB565ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#endif - { -#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) - RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); - RGB565ToYRow(src_rgb565, dst_y, width); - RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); -#else - RGB565ToARGBRow(src_rgb565, row, width); - RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_rgb565 += src_stride_rgb565 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) - RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); - RGB565ToYRow(src_rgb565, dst_y, width); -#else - RGB565ToARGBRow(src_rgb565, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// Convert ARGB1555 to I420. 
-LIBYUV_API -int ARGB1555ToI420(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) - void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGB1555ToUVRow_C; - void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, - int width) = ARGB1555ToYRow_C; -#else - void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, - int width) = ARGB1555ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; -#endif - if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; - src_stride_argb1555 = -src_stride_argb1555; - } - -// Neon version does direct ARGB1555 to YUV. -#if defined(HAS_ARGB1555TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; - ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToYRow = ARGB1555ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; - } - } - } -// MMI and MSA version does direct ARGB1555 to YUV. -#elif (defined(HAS_ARGB1555TOYROW_MMI) || defined(HAS_ARGB1555TOYROW_MSA)) -#if defined(HAS_ARGB1555TOYROW_MMI) && defined(HAS_ARGB1555TOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI; - ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToYRow = ARGB1555ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; - ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToYRow = ARGB1555ToYRow_MSA; - ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; - } - } -#endif -// Other platforms do intermediate conversion from ARGB1555 to ARGB. 
-#else -#if defined(HAS_ARGB1555TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_ARGB1555TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#endif - { -#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) - ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); - ARGB1555ToYRow(src_argb1555, dst_y, width); - ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, - width); -#else - ARGB1555ToARGBRow(src_argb1555, row, width); - ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, - width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_argb1555 += src_stride_argb1555 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) - ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); - ARGB1555ToYRow(src_argb1555, dst_y, width); -#else - ARGB1555ToARGBRow(src_argb1555, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// Convert ARGB4444 to I420. 
-LIBYUV_API -int ARGB4444ToI420(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) - void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGB4444ToUVRow_C; - void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, - int width) = ARGB4444ToYRow_C; -#else - void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, - int width) = ARGB4444ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; -#endif - if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; - src_stride_argb4444 = -src_stride_argb4444; - } - -// Neon version does direct ARGB4444 to YUV. -#if defined(HAS_ARGB4444TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; - ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToYRow = ARGB4444ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; - } - } - } -#elif defined(HAS_ARGB4444TOYROW_MMI) && defined(HAS_ARGB4444TOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI; - ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToYRow = ARGB4444ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_MMI; - } - } - } -// Other platforms do intermediate conversion from ARGB4444 to ARGB. 
-#else -#if defined(HAS_ARGB4444TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } - } -#endif -#endif - - { -#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) - ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); - ARGB4444ToYRow(src_argb4444, dst_y, width); - ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, - width); -#else - ARGB4444ToARGBRow(src_argb4444, row, width); - ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, - width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_argb4444 += src_stride_argb4444 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) - ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); - ARGB4444ToYRow(src_argb4444, dst_y, width); -#else - ARGB4444ToARGBRow(src_argb4444, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// Convert RGB24 to J400. 
-LIBYUV_API -int RGB24ToJ400(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height) { - int y; - void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) = - RGB24ToYJRow_C; - if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; - src_stride_rgb24 = -src_stride_rgb24; - } - // Coalesce rows. - if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) { - width *= height; - height = 1; - src_stride_rgb24 = dst_stride_yj = 0; - } -#if defined(HAS_RGB24TOYJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24ToYJRow = RGB24ToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_RGB24TOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB24ToYJRow = RGB24ToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - RGB24ToYJRow = RGB24ToYJRow_AVX2; - } - } -#endif -#if defined(HAS_RGB24TOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToYJRow = RGB24ToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToYJRow = RGB24ToYJRow_NEON; - } - } -#endif -#if defined(HAS_RGB24TOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToYJRow = RGB24ToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB24ToYJRow = RGB24ToYJRow_MMI; - } - } -#endif -#if defined(HAS_RGB24TOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToYJRow = RGB24ToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToYJRow = RGB24ToYJRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RGB24ToYJRow(src_rgb24, dst_yj, width); - src_rgb24 += src_stride_rgb24; - dst_yj += dst_stride_yj; - } - return 0; -} - -// Convert RAW to J400. -LIBYUV_API -int RAWToJ400(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height) { - int y; - void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) = - RAWToYJRow_C; - if (!src_raw || !dst_yj || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - // Coalesce rows. 
- if (src_stride_raw == width * 3 && dst_stride_yj == width) { - width *= height; - height = 1; - src_stride_raw = dst_stride_yj = 0; - } -#if defined(HAS_RAWTOYJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToYJRow = RAWToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RAWToYJRow = RAWToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_RAWTOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RAWToYJRow = RAWToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - RAWToYJRow = RAWToYJRow_AVX2; - } - } -#endif -#if defined(HAS_RAWTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToYJRow = RAWToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToYJRow = RAWToYJRow_NEON; - } - } -#endif -#if defined(HAS_RAWTOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToYJRow = RAWToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RAWToYJRow = RAWToYJRow_MMI; - } - } -#endif -#if defined(HAS_RAWTOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToYJRow = RAWToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToYJRow = RAWToYJRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RAWToYJRow(src_raw, dst_yj, width); - src_raw += src_stride_raw; - dst_yj += dst_stride_yj; - } - return 0; -} - -static void SplitPixels(const uint8_t* src_u, - int src_pixel_stride_uv, - uint8_t* dst_u, - int width) { - int i; - for (i = 0; i < width; ++i) { - *dst_u = *src_u; - ++dst_u; - src_u += src_pixel_stride_uv; - } -} - -// Convert Android420 to I420. -LIBYUV_API -int Android420ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - const ptrdiff_t vu_off = src_v - src_u; - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // Copy UV planes as is - I420 - if (src_pixel_stride_uv == 1) { - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); - return 0; - // Split UV planes - NV21 - } - if (src_pixel_stride_uv == 2 && vu_off == -1 && - src_stride_u == src_stride_v) { - SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, - halfwidth, halfheight); - return 0; - // Split UV planes - NV12 - } - if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { - SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, - halfwidth, halfheight); - return 0; - } - - for (y = 0; y < halfheight; ++y) { - SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth); - SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth); - src_u += src_stride_u; - src_v += src_stride_v; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_argb.cc b/thirdparty/libyuv/source/convert_argb.cc deleted file mode 100644 index d8f7b27..0000000 --- a/thirdparty/libyuv/source/convert_argb.cc +++ /dev/null @@ -1,5350 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/convert_argb.h" - -#include "libyuv/cpu_id.h" -#ifdef HAVE_JPEG -#include "libyuv/mjpeg_decoder.h" -#endif -#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle. -#include "libyuv/rotate_argb.h" -#include "libyuv/row.h" -#include "libyuv/video_common.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Copy ARGB with optional flipping -LIBYUV_API -int ARGBCopy(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - - CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4, - height); - return 0; -} - -// Convert I420 to ARGB with matrix. 
-LIBYUV_API -int I420ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGBRow = I422ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGBRow = I422ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to ARGB. -LIBYUV_API -int I420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert I420 to ABGR. -LIBYUV_API -int I420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert J420 to ARGB. -LIBYUV_API -int J420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvJPEGConstants, width, height); -} - -// Convert J420 to ABGR. 
-LIBYUV_API -int J420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuJPEGConstants, // Use Yvu matrix - width, height); -} - -// Convert H420 to ARGB. -LIBYUV_API -int H420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvH709Constants, width, height); -} - -// Convert H420 to ABGR. -LIBYUV_API -int H420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert U420 to ARGB. -LIBYUV_API -int U420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuv2020Constants, width, height); -} - -// Convert U420 to ABGR. -LIBYUV_API -int U420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvu2020Constants, // Use Yvu matrix - width, height); -} - -// Convert I422 to ARGB with matrix. -LIBYUV_API -int I422ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
- if (src_stride_y == width && src_stride_u * 2 == width && - src_stride_v * 2 == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; - } -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGBRow = I422ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGBRow = I422ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I422 to ARGB. -LIBYUV_API -int I422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert I422 to ABGR. -LIBYUV_API -int I422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert J422 to ARGB. -LIBYUV_API -int J422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvJPEGConstants, width, height); -} - -// Convert J422 to ABGR. -LIBYUV_API -int J422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuJPEGConstants, // Use Yvu matrix - width, height); -} - -// Convert H422 to ARGB. 
-LIBYUV_API -int H422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvH709Constants, width, height); -} - -// Convert H422 to ABGR. -LIBYUV_API -int H422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert U422 to ARGB. -LIBYUV_API -int U422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuv2020Constants, width, height); -} - -// Convert U422 to ABGR. -LIBYUV_API -int U422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvu2020Constants, // Use Yvu matrix - width, height); -} - -// Convert I444 to ARGB with matrix. -LIBYUV_API -int I444ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I444ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
- if (src_stride_y == width && src_stride_u == width && src_stride_v == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; - } -#if defined(HAS_I444TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToARGBRow = I444ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I444TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I444ToARGBRow = I444ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I444ToARGBRow = I444ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I444TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I444ToARGBRow = I444ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I444TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I444ToARGBRow = I444ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I444ToARGBRow = I444ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I444TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I444ToARGBRow = I444ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I444 to ARGB. -LIBYUV_API -int I444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert I444 to ABGR. -LIBYUV_API -int I444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert J444 to ARGB. -LIBYUV_API -int J444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvJPEGConstants, width, height); -} - -// Convert J444 to ABGR. -LIBYUV_API -int J444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuJPEGConstants, // Use Yvu matrix - width, height); -} - -// Convert H444 to ARGB. 
-LIBYUV_API -int H444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvH709Constants, width, height); -} - -// Convert H444 to ABGR. -LIBYUV_API -int H444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert U444 to ARGB. -LIBYUV_API -int U444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuv2020Constants, width, height); -} - -// Convert U444 to ABGR. -LIBYUV_API -int U444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvu2020Constants, // Use Yvu matrix - width, height); -} - -// Convert 10 bit YUV to ARGB with matrix. -// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to -// multiply 10 bit yuv into high bits to allow any number of bits. -LIBYUV_API -int I010ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I210ToAR30Row_C; - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_I210TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToAR30Row = I210ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210ToAR30Row = I210ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I210TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210ToAR30Row = I210ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210ToAR30Row = I210ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I010 to AR30. 
-LIBYUV_API -int I010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvI601Constants, width, height); -} - -// Convert H010 to AR30. -LIBYUV_API -int H010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvH709Constants, width, height); -} - -// Convert U010 to AR30. -LIBYUV_API -int U010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuv2020Constants, width, height); -} - -// Convert I010 to AB30. -LIBYUV_API -int I010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuI601Constants, width, height); -} - -// Convert H010 to AB30. -LIBYUV_API -int H010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuH709Constants, width, height); -} - -// Convert U010 to AB30. -LIBYUV_API -int U010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYuv2020Constants, width, height); -} - -// Convert 12 bit YUV to ARGB with matrix. -// TODO(fbarchard): Consider passing scale multiplier to I212ToARGB to -// multiply 12 bit yuv into high bits to allow any number of bits. -LIBYUV_API -int I012ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I212ToAR30Row_C; - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_I212TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I212ToAR30Row = I212ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I212ToAR30Row = I212ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I212TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I212ToAR30Row = I212ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I212ToAR30Row = I212ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert 10 bit YUV to ARGB with matrix. -// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to -// multiply 10 bit yuv into high bits to allow any number of bits. -LIBYUV_API -int I210ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I210ToAR30Row_C; - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_I210TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToAR30Row = I210ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210ToAR30Row = I210ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I210TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210ToAR30Row = I210ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210ToAR30Row = I210ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I210 to AR30. -LIBYUV_API -int I210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvI601Constants, width, height); -} - -// Convert H210 to AR30. -LIBYUV_API -int H210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvH709Constants, width, height); -} - -// Convert U210 to AR30. 
-LIBYUV_API -int U210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuv2020Constants, width, height); -} - -// Convert I210 to AB30. -LIBYUV_API -int I210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuI601Constants, width, height); -} - -// Convert H210 to AB30. -LIBYUV_API -int H210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuH709Constants, width, height); -} - -// Convert U210 to AB30. -LIBYUV_API -int U210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYuv2020Constants, width, height); -} - -LIBYUV_API -int I410ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I410ToAR30Row_C; - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_I410TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToAR30Row = I410ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I410ToAR30Row = I410ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I410TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I410ToAR30Row = I410ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I410ToAR30Row = I410ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I410ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert 10 bit YUV to ARGB with matrix. 
-LIBYUV_API -int I010ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I210ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I210TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToARGBRow = I210ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210ToARGBRow = I210ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I210TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210ToARGBRow = I210ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210ToARGBRow = I210ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I010 to ARGB. -LIBYUV_API -int I010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert I010 to ABGR. -LIBYUV_API -int I010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert H010 to ARGB. -LIBYUV_API -int H010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvH709Constants, width, height); -} - -// Convert H010 to ABGR. -LIBYUV_API -int H010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert U010 to ARGB. 
-LIBYUV_API -int U010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuv2020Constants, width, height); -} - -// Convert U010 to ABGR. -LIBYUV_API -int U010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvu2020Constants, // Use Yvu matrix - width, height); -} - -// Convert 12 bit YUV to ARGB with matrix. -LIBYUV_API -int I012ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I212ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I212TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I212ToARGBRow = I212ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I212ToARGBRow = I212ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I212TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I212ToARGBRow = I212ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I212ToARGBRow = I212ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert 10 bit 422 YUV to ARGB with matrix. -LIBYUV_API -int I210ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I210ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I210TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToARGBRow = I210ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210ToARGBRow = I210ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I210TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210ToARGBRow = I210ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210ToARGBRow = I210ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I210 to ARGB. -LIBYUV_API -int I210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert I210 to ABGR. -LIBYUV_API -int I210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert H210 to ARGB. -LIBYUV_API -int H210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvH709Constants, width, height); -} - -// Convert H210 to ABGR. -LIBYUV_API -int H210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert U210 to ARGB. -LIBYUV_API -int U210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuv2020Constants, width, height); -} - -// Convert U210 to ABGR. 
-LIBYUV_API -int U210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvu2020Constants, // Use Yvu matrix - width, height); -} - -LIBYUV_API -int I410ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I410ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I410TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToARGBRow = I410ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I410ToARGBRow = I410ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I410TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I410ToARGBRow = I410ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I410ToARGBRow = I410ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I410ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -LIBYUV_API -int P010ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*P210ToARGBRow)( - const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_P210TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToARGBRow = P210ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - P210ToARGBRow = P210ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_P210TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - P210ToARGBRow = P210ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - P210ToARGBRow = P210ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } - } - return 0; -} - -LIBYUV_API -int P210ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*P210ToARGBRow)( - const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_P210TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToARGBRow = P210ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - P210ToARGBRow = P210ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_P210TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - P210ToARGBRow = P210ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - P210ToARGBRow = P210ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_uv += src_stride_uv; - } - return 0; -} - -LIBYUV_API -int P010ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*P210ToAR30Row)( - const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; - if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_P210TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToAR30Row = P210ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - P210ToAR30Row = P210ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_P210TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - P210ToAR30Row = P210ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - P210ToAR30Row = P210ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } - } - return 0; -} - -LIBYUV_API -int P210ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*P210ToAR30Row)( - const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; - if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_P210TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToAR30Row = P210ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - P210ToAR30Row = P210ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_P210TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - P210ToAR30Row = P210ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - P210ToAR30Row = P210ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - src_uv += src_stride_uv; - } - return 0; -} - -// Convert I420 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I420AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I422AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I422 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I422AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I422AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I444 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I444AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I444AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I444ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I444ALPHATOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I444ALPHATOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I444ALPHATOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_MSA; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I420 with Alpha to ARGB. -LIBYUV_API -int I420AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate) { - return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, src_a, src_stride_a, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, - height, attenuate); -} - -// Convert I420 with Alpha to ABGR. 
-LIBYUV_API -int I420AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate) { - return I420AlphaToARGBMatrix( - src_y, src_stride_y, src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height, attenuate); -} - -// Convert I422 with Alpha to ARGB. -LIBYUV_API -int I422AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate) { - return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, src_a, src_stride_a, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, - height, attenuate); -} - -// Convert I422 with Alpha to ABGR. -LIBYUV_API -int I422AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate) { - return I422AlphaToARGBMatrix( - src_y, src_stride_y, src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height, attenuate); -} - -// Convert I444 with Alpha to ARGB. -LIBYUV_API -int I444AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate) { - return I444AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, src_a, src_stride_a, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, - height, attenuate); -} - -// Convert I444 with Alpha to ABGR. -LIBYUV_API -int I444AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate) { - return I444AlphaToARGBMatrix( - src_y, src_stride_y, src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height, attenuate); -} - -// Convert I010 with Alpha to preattenuated ARGB with matrix. 
-LIBYUV_API -int I010AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I210AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I210ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I210ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I210 with Alpha to preattenuated ARGB with matrix. 
-LIBYUV_API -int I210AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I210AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I210ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I210ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I410 with Alpha to preattenuated ARGB with matrix. 
-LIBYUV_API -int I410AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I410AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I410ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I410ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I400 to ARGB with matrix. -LIBYUV_API -int I400ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I400ToARGBRow_C; - if (!src_y || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
- if (src_stride_y == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = dst_stride_argb = 0; - } -#if defined(HAS_I400TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I400ToARGBRow = I400ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_I400TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I400ToARGBRow = I400ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I400ToARGBRow = I400ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I400TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I400ToARGBRow = I400ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I400TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I400ToARGBRow = I400ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I400TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I400ToARGBRow = I400ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - I400ToARGBRow = I400ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I400ToARGBRow(src_y, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - } - return 0; -} - -// Convert I400 to ARGB. -LIBYUV_API -int I400ToARGB(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert J400 to ARGB. -LIBYUV_API -int J400ToARGB(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) = - J400ToARGBRow_C; - if (!src_y || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = dst_stride_argb = 0; - } -#if defined(HAS_J400TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - J400ToARGBRow = J400ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - J400ToARGBRow = J400ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_J400TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - J400ToARGBRow = J400ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - J400ToARGBRow = J400ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_J400TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - J400ToARGBRow = J400ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - J400ToARGBRow = J400ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_J400TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - J400ToARGBRow = J400ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - J400ToARGBRow = J400ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_J400TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - J400ToARGBRow = J400ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - J400ToARGBRow = J400ToARGBRow_MSA; - } - } -#endif - for (y = 0; y < height; ++y) { - J400ToARGBRow(src_y, dst_argb, width); - src_y += src_stride_y; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Shuffle table for converting BGRA to ARGB. 
-static const uvec8 kShuffleMaskBGRAToARGB = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; - -// Shuffle table for converting ABGR to ARGB. -static const uvec8 kShuffleMaskABGRToARGB = { - 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u}; - -// Shuffle table for converting RGBA to ARGB. -static const uvec8 kShuffleMaskRGBAToARGB = { - 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; - -// Shuffle table for converting AR64 to AB64. -static const uvec8 kShuffleMaskAR64ToAB64 = { - 4u, 5u, 2u, 3u, 0u, 1u, 6u, 7u, 12u, 13u, 10u, 11u, 8u, 9u, 14u, 15u}; - -// Convert BGRA to ARGB. -LIBYUV_API -int BGRAToARGB(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); -} - -// Convert ARGB to BGRA (same as BGRAToARGB). -LIBYUV_API -int ARGBToBGRA(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); -} - -// Convert ABGR to ARGB. -LIBYUV_API -int ABGRToARGB(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); -} - -// Convert ARGB to ABGR to (same as ABGRToARGB). -LIBYUV_API -int ARGBToABGR(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); -} - -// Convert RGBA to ARGB. -LIBYUV_API -int RGBAToARGB(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, - (const uint8_t*)&kShuffleMaskRGBAToARGB, width, height); -} - -// Convert AR64 To AB64. -LIBYUV_API -int AR64ToAB64(const uint16_t* src_ar64, - int src_stride_ar64, - uint16_t* dst_ab64, - int dst_stride_ab64, - int width, - int height) { - return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64, - (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height); -} - -// Convert RGB24 to ARGB. -LIBYUV_API -int RGB24ToARGB(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RGB24ToARGBRow_C; - if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; - src_stride_rgb24 = -src_stride_rgb24; - } - // Coalesce rows. 
- if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_rgb24 = dst_stride_argb = 0; - } -#if defined(HAS_RGB24TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToARGBRow = RGB24ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RGB24ToARGBRow = RGB24ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RGB24ToARGBRow(src_rgb24, dst_argb, width); - src_rgb24 += src_stride_rgb24; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert RAW to ARGB. -LIBYUV_API -int RAWToARGB(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RAWToARGBRow_C; - if (!src_raw || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - // Coalesce rows. - if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_raw = dst_stride_argb = 0; - } -#if defined(HAS_RAWTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToARGBRow = RAWToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToARGBRow = RAWToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RAWToARGBRow = RAWToARGBRow_MMI; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToARGBRow = RAWToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RAWToARGBRow(src_raw, dst_argb, width); - src_raw += src_stride_raw; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert RAW to RGBA. -LIBYUV_API -int RAWToRGBA(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - int y; - void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) = - RAWToRGBARow_C; - if (!src_raw || !dst_rgba || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - // Coalesce rows. 
- if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) { - width *= height; - height = 1; - src_stride_raw = dst_stride_rgba = 0; - } -#if defined(HAS_RAWTORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToRGBARow = RAWToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RAWToRGBARow = RAWToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_RAWTORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToRGBARow = RAWToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToRGBARow = RAWToRGBARow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - RAWToRGBARow(src_raw, dst_rgba, width); - src_raw += src_stride_raw; - dst_rgba += dst_stride_rgba; - } - return 0; -} - -// Convert RGB565 to ARGB. -LIBYUV_API -int RGB565ToARGB(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, - int width) = RGB565ToARGBRow_C; - if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; - src_stride_rgb565 = -src_stride_rgb565; - } - // Coalesce rows. - if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_rgb565 = dst_stride_argb = 0; - } -#if defined(HAS_RGB565TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - RGB565ToARGBRow = RGB565ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RGB565ToARGBRow = RGB565ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB565ToARGBRow = RGB565ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RGB565ToARGBRow(src_rgb565, dst_argb, width); - src_rgb565 += src_stride_rgb565; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert ARGB1555 to ARGB. -LIBYUV_API -int ARGB1555ToARGB(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb, - int width) = ARGB1555ToARGBRow_C; - if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; - src_stride_argb1555 = -src_stride_argb1555; - } - // Coalesce rows. 
- if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb1555 = dst_stride_argb = 0; - } -#if defined(HAS_ARGB1555TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_ARGB1555TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGB1555TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_ARGB1555TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_ARGB1555TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGB1555ToARGBRow(src_argb1555, dst_argb, width); - src_argb1555 += src_stride_argb1555; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert ARGB4444 to ARGB. -LIBYUV_API -int ARGB4444ToARGB(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb, - int width) = ARGB4444ToARGBRow_C; - if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; - src_stride_argb4444 = -src_stride_argb4444; - } - // Coalesce rows. - if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb4444 = dst_stride_argb = 0; - } -#if defined(HAS_ARGB4444TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGB4444ToARGBRow(src_argb4444, dst_argb, width); - src_argb4444 += src_stride_argb4444; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert AR30 to ARGB. 
-LIBYUV_API -int AR30ToARGB(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - if (!src_ar30 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; - src_stride_ar30 = -src_stride_ar30; - } - // Coalesce rows. - if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_ar30 = dst_stride_argb = 0; - } - for (y = 0; y < height; ++y) { - AR30ToARGBRow_C(src_ar30, dst_argb, width); - src_ar30 += src_stride_ar30; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert AR30 to ABGR. -LIBYUV_API -int AR30ToABGR(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - int y; - if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; - src_stride_ar30 = -src_stride_ar30; - } - // Coalesce rows. - if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) { - width *= height; - height = 1; - src_stride_ar30 = dst_stride_abgr = 0; - } - for (y = 0; y < height; ++y) { - AR30ToABGRRow_C(src_ar30, dst_abgr, width); - src_ar30 += src_stride_ar30; - dst_abgr += dst_stride_abgr; - } - return 0; -} - -// Convert AR30 to AB30. -LIBYUV_API -int AR30ToAB30(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - int y; - if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; - src_stride_ar30 = -src_stride_ar30; - } - // Coalesce rows. - if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) { - width *= height; - height = 1; - src_stride_ar30 = dst_stride_ab30 = 0; - } - for (y = 0; y < height; ++y) { - AR30ToAB30Row_C(src_ar30, dst_ab30, width); - src_ar30 += src_stride_ar30; - dst_ab30 += dst_stride_ab30; - } - return 0; -} - -// Convert AR64 to ARGB. -LIBYUV_API -int AR64ToARGB(const uint16_t* src_ar64, - int src_stride_ar64, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, - int width) = AR64ToARGBRow_C; - if (!src_ar64 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; - src_stride_ar64 = -src_stride_ar64; - } - // Coalesce rows. 
- if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_ar64 = dst_stride_argb = 0; - } -#if defined(HAS_AR64TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - AR64ToARGBRow = AR64ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_AR64TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - AR64ToARGBRow = AR64ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - AR64ToARGBRow = AR64ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_AR64TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - AR64ToARGBRow = AR64ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - AR64ToARGBRow = AR64ToARGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - AR64ToARGBRow(src_ar64, dst_argb, width); - src_ar64 += src_stride_ar64; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert AB64 to ARGB. -LIBYUV_API -int AB64ToARGB(const uint16_t* src_ab64, - int src_stride_ab64, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, - int width) = AB64ToARGBRow_C; - if (!src_ab64 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ab64 = src_ab64 + (height - 1) * src_stride_ab64; - src_stride_ab64 = -src_stride_ab64; - } - // Coalesce rows. - if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_ab64 = dst_stride_argb = 0; - } -#if defined(HAS_AB64TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - AB64ToARGBRow = AB64ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_AB64TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - AB64ToARGBRow = AB64ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - AB64ToARGBRow = AB64ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_AB64TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - AB64ToARGBRow = AB64ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - AB64ToARGBRow = AB64ToARGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - AB64ToARGBRow(src_ab64, dst_argb, width); - src_ab64 += src_stride_ab64; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert NV12 to ARGB with matrix. -LIBYUV_API -int NV12ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*NV12ToARGBRow)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_NV12TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - NV12ToARGBRow = NV12ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV12ToARGBRow = NV12ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV12ToARGBRow = NV12ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - NV12ToARGBRow = NV12ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - NV12ToARGBRow = NV12ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } - } - return 0; -} - -// Convert NV21 to ARGB with matrix. -LIBYUV_API -int NV21ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*NV21ToARGBRow)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; - if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_NV21TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_NV21TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV21ToARGBRow = NV21ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - NV21ToARGBRow = NV21ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_NV21TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV21ToARGBRow = NV21ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_NV21TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV21ToARGBRow = NV21ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - NV21ToARGBRow = NV21ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_NV21TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - NV21ToARGBRow = NV21ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_vu += src_stride_vu; - } - } - return 0; -} - -// Convert NV12 to ARGB. 
-LIBYUV_API -int NV12ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, height); -} - -// Convert NV21 to ARGB. -LIBYUV_API -int NV21ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, height); -} - -// Convert NV12 to ABGR. -// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix. -// To swap the UV use NV12 instead of NV21. -LIBYUV_API -int NV12ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr, - dst_stride_abgr, &kYvuI601Constants, width, height); -} - -// Convert NV21 to ABGR. -LIBYUV_API -int NV21ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr, - dst_stride_abgr, &kYvuI601Constants, width, height); -} - -// TODO(fbarchard): Consider SSSE3 2 step conversion. -// Convert NV12 to RGB24 with matrix. -LIBYUV_API -int NV12ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*NV12ToRGB24Row)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; - if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; - dst_stride_rgb24 = -dst_stride_rgb24; - } -#if defined(HAS_NV12TORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB24Row = NV12ToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_NV12TORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - NV12ToRGB24Row = NV12ToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_NV12TORGB24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - NV12ToRGB24Row = NV12ToRGB24Row_AVX2; - } - } -#endif -#if defined(HAS_NV12TORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV12ToRGB24Row = NV12ToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB24Row = NV12ToRGB24Row_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); - dst_rgb24 += dst_stride_rgb24; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } - } - return 0; -} - -// Convert NV21 to RGB24 with matrix.
-LIBYUV_API -int NV21ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*NV21ToRGB24Row)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; - if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; - dst_stride_rgb24 = -dst_stride_rgb24; - } -#if defined(HAS_NV21TORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV21ToRGB24Row = NV21ToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_NV21TORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - NV21ToRGB24Row = NV21ToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_NV21TORGB24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - NV21ToRGB24Row = NV21ToRGB24Row_AVX2; - } - } -#endif -#if defined(HAS_NV21TORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV21ToRGB24Row = NV21ToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - NV21ToRGB24Row = NV21ToRGB24Row_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width); - dst_rgb24 += dst_stride_rgb24; - src_y += src_stride_y; - if (y & 1) { - src_vu += src_stride_vu; - } - } - return 0; -} - -// Convert NV12 to RGB24. -LIBYUV_API -int NV12ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, - dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, - width, height); -} - -// Convert NV21 to RGB24. -LIBYUV_API -int NV21ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, - dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, - width, height); -} - -// Convert NV12 to RAW. -LIBYUV_API -int NV12ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw, - dst_stride_raw, &kYvuI601Constants, width, height); -} - -// Convert NV21 to RAW. 
-LIBYUV_API -int NV21ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw, - dst_stride_raw, &kYvuI601Constants, width, height); -} - -// Convert NV21 to YUV24 -int NV21ToYUV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_yuv24, - int dst_stride_yuv24, - int width, - int height) { - int y; - void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu, - uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C; - if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24; - dst_stride_yuv24 = -dst_stride_yuv24; - } -#if defined(HAS_NV21TOYUV24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - NV21ToYUV24Row = NV21ToYUV24Row_NEON; - } - } -#endif -#if defined(HAS_NV21TOYUV24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - NV21ToYUV24Row = NV21ToYUV24Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width); - dst_yuv24 += dst_stride_yuv24; - src_y += src_stride_y; - if (y & 1) { - src_vu += src_stride_vu; - } - } - return 0; -} - -// Convert YUY2 to ARGB. -LIBYUV_API -int YUY2ToARGB(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, int width) = - YUY2ToARGBRow_C; - if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; - src_stride_yuy2 = -src_stride_yuy2; - } - // Coalesce rows. - if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_yuy2 = dst_stride_argb = 0; - } -#if defined(HAS_YUY2TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_YUY2TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - YUY2ToARGBRow = YUY2ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_YUY2TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - YUY2ToARGBRow = YUY2ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_YUY2TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - YUY2ToARGBRow = YUY2ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_YUY2TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - YUY2ToARGBRow = YUY2ToARGBRow_MSA; - } - } -#endif - for (y = 0; y < height; ++y) { - YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); - src_yuy2 += src_stride_yuy2; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert UYVY to ARGB. 
-LIBYUV_API -int UYVYToARGB(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, int width) = - UYVYToARGBRow_C; - if (!src_uyvy || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; - src_stride_uyvy = -src_stride_uyvy; - } - // Coalesce rows. - if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_uyvy = dst_stride_argb = 0; - } -#if defined(HAS_UYVYTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - UYVYToARGBRow = UYVYToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_UYVYTOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - UYVYToARGBRow = UYVYToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - UYVYToARGBRow = UYVYToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_UYVYTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - UYVYToARGBRow = UYVYToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - UYVYToARGBRow = UYVYToARGBRow_NEON; - } - } -#endif -#if defined(HAS_UYVYTOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - UYVYToARGBRow = UYVYToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - UYVYToARGBRow = UYVYToARGBRow_MMI; - } - } -#endif -#if defined(HAS_UYVYTOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - UYVYToARGBRow = UYVYToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - UYVYToARGBRow = UYVYToARGBRow_MSA; - } - } -#endif - for (y = 0; y < height; ++y) { - UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); - src_uyvy += src_stride_uyvy; - dst_argb += dst_stride_argb; - } - return 0; -} -static void WeavePixels(const uint8_t* src_u, - const uint8_t* src_v, - int src_pixel_stride_uv, - uint8_t* dst_uv, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst_uv[0] = *src_u; - dst_uv[1] = *src_v; - dst_uv += 2; - src_u += src_pixel_stride_uv; - src_v += src_pixel_stride_uv; - } -} - -// Convert Android420 to ARGB with matrix. -LIBYUV_API -int Android420ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - uint8_t* dst_uv; - const ptrdiff_t vu_off = src_v - src_u; - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - - // I420 - if (src_pixel_stride_uv == 1) { - return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - yuvconstants, width, height); - // NV21 - } - if (src_pixel_stride_uv == 2 && vu_off == -1 && - src_stride_u == src_stride_v) { - return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb, - dst_stride_argb, yuvconstants, width, height); - // NV12 - } - if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { - return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb, - dst_stride_argb, yuvconstants, width, height); - } - - // General case fallback creates NV12 - align_buffer_64(plane_uv, halfwidth * 2 * halfheight); - dst_uv = plane_uv; - for (y = 0; y < halfheight; ++y) { - WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth); - src_u += src_stride_u; - src_v += src_stride_v; - dst_uv += halfwidth * 2; - } - NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb, - dst_stride_argb, yuvconstants, width, height); - free_aligned_buffer_64(plane_uv); - return 0; -} - -// Convert Android420 to ARGB. -LIBYUV_API -int Android420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, src_pixel_stride_uv, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, - height); -} - -// Convert Android420 to ABGR. -LIBYUV_API -int Android420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, src_pixel_stride_uv, dst_abgr, - dst_stride_abgr, &kYvuI601Constants, width, - height); -} - -// Convert I422 to RGBA with matrix. -LIBYUV_API -int I422ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; - dst_stride_rgba = -dst_stride_rgba; - } -#if defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGBAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGBARow = I422ToRGBARow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGBARow = I422ToRGBARow_AVX2; - } - } -#endif -#if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_NEON; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGBARow = I422ToRGBARow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGBARow = I422ToRGBARow_MMI; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGBARow = I422ToRGBARow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); - dst_rgba += dst_stride_rgba; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I422 to RGBA. -LIBYUV_API -int I422ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgba, dst_stride_rgba, - &kYuvI601Constants, width, height); -} - -// Convert I422 to BGRA. -LIBYUV_API -int I422ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_bgra, dst_stride_bgra, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert NV12 to RGB565 with matrix. -LIBYUV_API -int NV12ToRGB565Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*NV12ToRGB565Row)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; - if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_NV12TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - NV12ToRGB565Row = NV12ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - NV12ToRGB565Row = NV12ToRGB565Row_MMI; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } - } - return 0; -} - -// Convert NV12 to RGB565. -LIBYUV_API -int NV12ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv, - dst_rgb565, dst_stride_rgb565, &kYuvI601Constants, - width, height); -} - -// Convert I422 to RGBA with matrix. -LIBYUV_API -int I420ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; - dst_stride_rgba = -dst_stride_rgba; - } -#if defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGBAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGBARow = I422ToRGBARow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGBARow = I422ToRGBARow_AVX2; - } - } -#endif -#if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_NEON; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGBARow = I422ToRGBARow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGBARow = I422ToRGBARow_MMI; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGBARow = I422ToRGBARow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); - dst_rgba += dst_stride_rgba; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGBA. -LIBYUV_API -int I420ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgba, dst_stride_rgba, - &kYuvI601Constants, width, height); -} - -// Convert I420 to BGRA. -LIBYUV_API -int I420ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_bgra, dst_stride_bgra, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert I420 to RGB24 with matrix. -LIBYUV_API -int I420ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB24Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; - dst_stride_rgb24 = -dst_stride_rgb24; - } -#if defined(HAS_I422TORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToRGB24Row = I422ToRGB24Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB24Row = I422ToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB24Row = I422ToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGB24Row = I422ToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGB24Row = I422ToRGB24Row_MMI; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB24Row = I422ToRGB24Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); - dst_rgb24 += dst_stride_rgb24; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB24. -LIBYUV_API -int I420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb24, dst_stride_rgb24, - &kYuvI601Constants, width, height); -} - -// Convert I420 to RAW. -LIBYUV_API -int I420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_raw, dst_stride_raw, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert J420 to RGB24. -LIBYUV_API -int J420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb24, dst_stride_rgb24, - &kYuvJPEGConstants, width, height); -} - -// Convert J420 to RAW. -LIBYUV_API -int J420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_raw, dst_stride_raw, - &kYvuJPEGConstants, // Use Yvu matrix - width, height); -} - -// Convert H420 to RGB24. 
-LIBYUV_API -int H420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb24, dst_stride_rgb24, - &kYuvH709Constants, width, height); -} - -// Convert H420 to RAW. -LIBYUV_API -int H420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_raw, dst_stride_raw, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert I420 to ARGB1555. -LIBYUV_API -int I420ToARGB1555(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height) { - int y; - void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGB1555Row_C; - if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; - dst_stride_argb1555 = -dst_stride_argb1555; - } -#if defined(HAS_I422TOARGB1555ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGB1555Row = I422ToARGB1555Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_NEON; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGB1555Row = I422ToARGB1555Row_MMI; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, - width); - dst_argb1555 += dst_stride_argb1555; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to ARGB4444. 
-LIBYUV_API -int I420ToARGB4444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb4444, - int dst_stride_argb4444, - int width, - int height) { - int y; - void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGB4444Row_C; - if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; - dst_stride_argb4444 = -dst_stride_argb4444; - } -#if defined(HAS_I422TOARGB4444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGB4444Row = I422ToARGB4444Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_NEON; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGB4444Row = I422ToARGB4444Row_MMI; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, - width); - dst_argb4444 += dst_stride_argb4444; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB565 with specified color matrix. -LIBYUV_API -int I420ToRGB565Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_I422TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGB565Row = I422ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB565Row = I422ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGB565Row = I422ToRGB565Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGB565Row = I422ToRGB565Row_MMI; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB565Row = I422ToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB565. -LIBYUV_API -int I420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvI601Constants, width, height); -} - -// Convert J420 to RGB565. -LIBYUV_API -int J420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvJPEGConstants, width, height); -} - -// Convert H420 to RGB565. -LIBYUV_API -int H420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvH709Constants, width, height); -} - -// Convert I422 to RGB565. -LIBYUV_API -int I422ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - int y; - void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_I422TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGB565Row = I422ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB565Row = I422ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB565Row = I422ToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8_t kDither565_4x4[16] = { - 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, -}; - -// Convert I420 to RGB565 with dithering. -LIBYUV_API -int I420ToRGB565Dither(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const uint8_t* dither4x4, - int width, - int height) { - int y; - void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToARGBRow_C; - void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = - ARGBToRGB565DitherRow_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } - if (!dither4x4) { - dither4x4 = kDither565_4x4; - } -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGBRow = I422ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGBRow = I422ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; - } - } -#endif - { - // Allocate a row of argb. - align_buffer_64(row_argb, width * 4); - for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); - ARGBToRGB565DitherRow(row_argb, dst_rgb565, - *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), - width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - free_aligned_buffer_64(row_argb); - } - return 0; -} - -// Convert I420 to AR30 with matrix. -LIBYUV_API -int I420ToAR30Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToAR30Row_C; - - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } - -#if defined(HAS_I422TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToAR30Row = I422ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToAR30Row = I422ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToAR30Row = I422ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToAR30Row = I422ToAR30Row_AVX2; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to AR30. -LIBYUV_API -int I420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvI601Constants, width, height); -} - -// Convert H420 to AR30. -LIBYUV_API -int H420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYvuH709Constants, width, height); -} - -// Convert I420 to AB30. -LIBYUV_API -int I420ToAB30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuI601Constants, width, height); -} - -// Convert H420 to AB30. -LIBYUV_API -int H420ToAB30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuH709Constants, width, height); -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_from.cc b/thirdparty/libyuv/source/convert_from.cc deleted file mode 100644 index 687f0a7..0000000 --- a/thirdparty/libyuv/source/convert_from.cc +++ /dev/null @@ -1,855 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/convert_from.h" - -#include "libyuv/basic_types.h" -#include "libyuv/convert.h" // For I420Copy -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" -#include "libyuv/rotate.h" -#include "libyuv/row.h" -#include "libyuv/scale.h" // For ScalePlane() -#include "libyuv/video_common.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -// I420 To any I4xx YUV format with mirroring. -// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane - -static int I420ToI4xx(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int src_y_width, - int src_y_height, - int dst_uv_width, - int dst_uv_height) { - const int dst_y_width = Abs(src_y_width); - const int dst_y_height = Abs(src_y_height); - const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); - const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); - if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 || - dst_uv_height <= 0) { - return -1; - } - if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, - dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); - } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, - dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, - dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); - return 0; -} - -// Convert 8 bit YUV to 10 bit. -LIBYUV_API -int I420ToI010(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - // Convert Y plane. - Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, - height); - // Convert UV planes. - Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, - halfheight); - Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, - halfheight); - return 0; -} - -// Convert 8 bit YUV to 12 bit. -LIBYUV_API -int I420ToI012(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - // Convert Y plane. - Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 4096, width, - height); - // Convert UV planes. - Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 4096, halfwidth, - halfheight); - Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 4096, halfwidth, - halfheight); - return 0; -} - -// 420 chroma is 1/2 width, 1/2 height -// 422 chroma is 1/2 width, 1x height -LIBYUV_API -int I420ToI422(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - const int dst_uv_width = (Abs(width) + 1) >> 1; - const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, dst_uv_width, - dst_uv_height); -} - -// 420 chroma is 1/2 width, 1/2 height -// 444 chroma is 1x width, 1x height -LIBYUV_API -int I420ToI444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - const int dst_uv_width = Abs(width); - const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, dst_uv_width, - dst_uv_height); -} - -// 420 chroma to 444 chroma, 10/12 bit version -LIBYUV_API -int I010ToI410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - if (width == 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width), - Abs(height), kFilterBilinear); - ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width), - Abs(height), kFilterBilinear); - return 0; -} - -// 422 chroma to 444 chroma, 10/12 bit version -LIBYUV_API -int I210ToI410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - if (width == 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, - dst_stride_u, Abs(width), Abs(height), kFilterBilinear); - 
ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, - dst_stride_v, Abs(width), Abs(height), kFilterBilinear); - return 0; -} - -// 422 chroma is 1/2 width, 1x height -// 444 chroma is 1x width, 1x height -LIBYUV_API -int I422ToI444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - if (width == 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, - dst_stride_u, Abs(width), Abs(height), kFilterBilinear); - ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, - dst_stride_v, Abs(width), Abs(height), kFilterBilinear); - return 0; -} - -// Copy to I400. Source can be I420,422,444,400,NV12,NV21 -LIBYUV_API -int I400Copy(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - if (!src_y || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - return 0; -} - -LIBYUV_API -int I422ToYUY2(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height) { - int y; - void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_yuy2, int width) = - I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; - dst_stride_yuy2 = -dst_stride_yuy2; - } - // Coalesce rows. 
- if (src_stride_y == width && src_stride_u * 2 == width && - src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; - } -#if defined(HAS_I422TOYUY2ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToYUY2Row = I422ToYUY2Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - dst_yuy2 += dst_stride_yuy2; - } - return 0; -} - -LIBYUV_API -int I420ToYUY2(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height) { - int y; - void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_yuy2, int width) = - I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; - dst_stride_yuy2 = -dst_stride_yuy2; - } -#if defined(HAS_I422TOYUY2ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToYUY2Row = I422ToYUY2Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_NEON; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToYUY2Row = I422ToYUY2Row_MMI; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); - I422ToYUY2Row(src_y + src_stride_y, src_u, src_v, - dst_yuy2 + dst_stride_yuy2, width); - src_y += src_stride_y * 2; - src_u += src_stride_u; - src_v += src_stride_v; - dst_yuy2 += dst_stride_yuy2 * 2; - } - if (height & 1) { - I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); - } - return 0; -} - -LIBYUV_API -int I422ToUYVY(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height) { - int y; - void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_uyvy, int width) = - I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { - return 
-1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; - dst_stride_uyvy = -dst_stride_uyvy; - } - // Coalesce rows. - if (src_stride_y == width && src_stride_u * 2 == width && - src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; - } -#if defined(HAS_I422TOUYVYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToUYVYRow = I422ToUYVYRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_NEON; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToUYVYRow = I422ToUYVYRow_MMI; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - dst_uyvy += dst_stride_uyvy; - } - return 0; -} - -LIBYUV_API -int I420ToUYVY(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height) { - int y; - void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_uyvy, int width) = - I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; - dst_stride_uyvy = -dst_stride_uyvy; - } -#if defined(HAS_I422TOUYVYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToUYVYRow = I422ToUYVYRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_NEON; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToUYVYRow = I422ToUYVYRow_MMI; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); - I422ToUYVYRow(src_y + src_stride_y, src_u, src_v, - dst_uyvy + dst_stride_uyvy, width); - src_y += src_stride_y * 2; - src_u += src_stride_u; - src_v += src_stride_v; - dst_uyvy += dst_stride_uyvy * 2; - } - if (height & 1) { - I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); - } - return 0; -} - -LIBYUV_API -int I420ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int halfwidth = (width + 1) / 2; - int halfheight = (height + 1) / 2; - if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, - halfwidth, halfheight); - return 0; -} - -LIBYUV_API -int I420ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, - width, height); -} - -// Convert I420 to specified format -LIBYUV_API -int ConvertFromI420(const uint8_t* y, - int y_stride, - const uint8_t* u, - int u_stride, - const uint8_t* v, - int v_stride, - uint8_t* dst_sample, - int dst_sample_stride, - int width, - int height, - uint32_t fourcc) { - uint32_t format = CanonicalFourCC(fourcc); - int r = 0; - if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) { - return -1; - } - switch (format) { - // Single plane formats - case FOURCC_YUY2: - r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? 
dst_sample_stride : width * 2, width, - height); - break; - case FOURCC_UYVY: - r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, width, - height); - break; - case FOURCC_RGBP: - r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, width, - height); - break; - case FOURCC_RGBO: - r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); - break; - case FOURCC_R444: - r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); - break; - case FOURCC_24BG: - r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, width, - height); - break; - case FOURCC_RAW: - r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, width, - height); - break; - case FOURCC_ARGB: - r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, width, - height); - break; - case FOURCC_BGRA: - r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, width, - height); - break; - case FOURCC_ABGR: - r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, width, - height); - break; - case FOURCC_RGBA: - r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, width, - height); - break; - case FOURCC_AR30: - r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, width, - height); - break; - case FOURCC_I400: - r = I400Copy(y, y_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width, width, - height); - break; - case FOURCC_NV12: { - uint8_t* dst_uv = dst_sample + width * height; - r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width, dst_uv, - dst_sample_stride ? dst_sample_stride : width, width, - height); - break; - } - case FOURCC_NV21: { - uint8_t* dst_vu = dst_sample + width * height; - r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width, dst_vu, - dst_sample_stride ? dst_sample_stride : width, width, - height); - break; - } - // Triplanar formats - case FOURCC_I420: - case FOURCC_YV12: { - dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; - int halfstride = (dst_sample_stride + 1) / 2; - int halfheight = (height + 1) / 2; - uint8_t* dst_u; - uint8_t* dst_v; - if (format == FOURCC_YV12) { - dst_v = dst_sample + dst_sample_stride * height; - dst_u = dst_v + halfstride * halfheight; - } else { - dst_u = dst_sample + dst_sample_stride * height; - dst_v = dst_u + halfstride * halfheight; - } - r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride, dst_u, halfstride, dst_v, halfstride, - width, height); - break; - } - case FOURCC_I422: - case FOURCC_YV16: { - dst_sample_stride = dst_sample_stride ? 
dst_sample_stride : width; - int halfstride = (dst_sample_stride + 1) / 2; - uint8_t* dst_u; - uint8_t* dst_v; - if (format == FOURCC_YV16) { - dst_v = dst_sample + dst_sample_stride * height; - dst_u = dst_v + halfstride * height; - } else { - dst_u = dst_sample + dst_sample_stride * height; - dst_v = dst_u + halfstride * height; - } - r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride, dst_u, halfstride, dst_v, halfstride, - width, height); - break; - } - case FOURCC_I444: - case FOURCC_YV24: { - dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; - uint8_t* dst_u; - uint8_t* dst_v; - if (format == FOURCC_YV24) { - dst_v = dst_sample + dst_sample_stride * height; - dst_u = dst_v + dst_sample_stride * height; - } else { - dst_u = dst_sample + dst_sample_stride * height; - dst_v = dst_u + dst_sample_stride * height; - } - r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride, dst_u, dst_sample_stride, dst_v, - dst_sample_stride, width, height); - break; - } - // Formats not supported - MJPG, biplanar, some rgb formats. - default: - return -1; // unknown fourcc - return failure code. - } - return r; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_from_argb.cc b/thirdparty/libyuv/source/convert_from_argb.cc deleted file mode 100644 index e146158..0000000 --- a/thirdparty/libyuv/source/convert_from_argb.cc +++ /dev/null @@ -1,2281 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/convert_from_argb.h" - -#include "libyuv/basic_types.h" -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// ARGB little endian (bgra in memory) to I444 -LIBYUV_API -int ARGBToI444(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUV444Row_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_y == width && - dst_stride_u == width && dst_stride_v == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_ARGBTOUV444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOUV444ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUV444Row = ARGBToUV444Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToUV444Row = ARGBToUV444Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUV444ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUV444Row = ARGBToUV444Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToUV444Row = ARGBToUV444Row_MMI; - } - } -#endif -#if defined(HAS_ARGBTOUV444ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUV444Row = ARGBToUV444Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_MSA; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToUV444Row(src_argb, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - src_argb += src_stride_argb; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -// ARGB little endian (bgra in memory) to I422 -LIBYUV_API -int ARGBToI422(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_y == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif - -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif - -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - src_argb += src_stride_argb; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -LIBYUV_API -int ARGBToNV12(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_uv, int width) = MergeUVRow_C; - if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow_ = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow_ = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow_ = MergeUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow_ = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow_ = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif - { - // Allocate a rows of uv. - align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); - - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - } - free_aligned_buffer_64(row_u); - } - return 0; -} - -// Same as NV12 but U and V swapped. 
-LIBYUV_API -int ARGBToNV21(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_vu, int width) = MergeUVRow_C; - if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow_ = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow_ = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow_ = MergeUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow_ = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow_ = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif - { - // Allocate a rows of uv. 
- align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); - - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_vu += dst_stride_vu; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - } - free_aligned_buffer_64(row_u); - } - return 0; -} - -LIBYUV_API -int ABGRToNV12(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = - ABGRToYRow_C; - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_uv, int width) = MergeUVRow_C; - if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } -#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; - ABGRToYRow = ABGRToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; - ABGRToYRow = ABGRToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToUVRow = ABGRToUVRow_Any_AVX2; - ABGRToYRow = ABGRToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_AVX2; - ABGRToYRow = ABGRToYRow_AVX2; - } - } -#endif -#if defined(HAS_ABGRTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToYRow = ABGRToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToUVRow = ABGRToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToYRow = ABGRToYRow_Any_MMI; - ABGRToUVRow = ABGRToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MMI; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToYRow = ABGRToYRow_Any_MSA; - ABGRToUVRow = ABGRToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_MSA; - } - } -#endif -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow_ = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow_ = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - 
MergeUVRow_ = MergeUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow_ = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow_ = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif - { - // Allocate a rows of uv. - align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); - - for (y = 0; y < height - 1; y += 2) { - ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ABGRToYRow(src_abgr, dst_y, width); - ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); - src_abgr += src_stride_abgr * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - ABGRToUVRow(src_abgr, 0, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ABGRToYRow(src_abgr, dst_y, width); - } - free_aligned_buffer_64(row_u); - } - return 0; -} - -// Same as NV12 but U and V swapped. -LIBYUV_API -int ABGRToNV21(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = - ABGRToYRow_C; - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_vu, int width) = MergeUVRow_C; - if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } -#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; - ABGRToYRow = ABGRToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; - ABGRToYRow = ABGRToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToUVRow = ABGRToUVRow_Any_AVX2; - ABGRToYRow = ABGRToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_AVX2; - ABGRToYRow = ABGRToYRow_AVX2; - } - } -#endif -#if defined(HAS_ABGRTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToYRow = ABGRToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToUVRow = ABGRToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToYRow = ABGRToYRow_Any_MMI; - ABGRToUVRow = ABGRToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MMI; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToYRow = ABGRToYRow_Any_MSA; - ABGRToUVRow = ABGRToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_MSA; - } - } -#endif -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow_ = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow_ = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow_ = MergeUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow_ = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow_ = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif - { - // Allocate a rows of uv. - align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); - - for (y = 0; y < height - 1; y += 2) { - ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); - ABGRToYRow(src_abgr, dst_y, width); - ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); - src_abgr += src_stride_abgr * 2; - dst_y += dst_stride_y * 2; - dst_vu += dst_stride_vu; - } - if (height & 1) { - ABGRToUVRow(src_abgr, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); - ABGRToYRow(src_abgr, dst_y, width); - } - free_aligned_buffer_64(row_u); - } - return 0; -} - -// Convert ARGB to YUY2. 
-LIBYUV_API -int ARGBToYUY2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height) { - int y; - void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_yuy2, int width) = - I422ToYUY2Row_C; - - if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; - dst_stride_yuy2 = -dst_stride_yuy2; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { - width *= height; - height = 1; - src_stride_argb = dst_stride_yuy2 = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToYUY2Row = I422ToYUY2Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_NEON; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToYUY2Row = I422ToYUY2Row_MMI; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_MSA; - } - } -#endif - - { - // Allocate a rows of yuv. 
- align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8_t* row_u = row_y + ((width + 63) & ~63); - uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; - - for (y = 0; y < height; ++y) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - ARGBToYRow(src_argb, row_y, width); - I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width); - src_argb += src_stride_argb; - dst_yuy2 += dst_stride_yuy2; - } - - free_aligned_buffer_64(row_y); - } - return 0; -} - -// Convert ARGB to UYVY. -LIBYUV_API -int ARGBToUYVY(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height) { - int y; - void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_uyvy, int width) = - I422ToUYVYRow_C; - - if (!src_argb || !dst_uyvy || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; - dst_stride_uyvy = -dst_stride_uyvy; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { - width *= height; - height = 1; - src_stride_argb = dst_stride_uyvy = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToUYVYRow = I422ToUYVYRow_Any_NEON; - if (IS_ALIGNED(width, 
16)) { - I422ToUYVYRow = I422ToUYVYRow_NEON; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToUYVYRow = I422ToUYVYRow_MMI; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_MSA; - } - } -#endif - - { - // Allocate a rows of yuv. - align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8_t* row_u = row_y + ((width + 63) & ~63); - uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; - - for (y = 0; y < height; ++y) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - ARGBToYRow(src_argb, row_y, width); - I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width); - src_argb += src_stride_argb; - dst_uyvy += dst_stride_uyvy; - } - - free_aligned_buffer_64(row_y); - } - return 0; -} - -// Convert ARGB to I400. -LIBYUV_API -int ARGBToI400(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_y = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToYRow(src_argb, dst_y, width); - src_argb += src_stride_argb; - dst_y += dst_stride_y; - } - return 0; -} - -// Shuffle table for converting ARGB to RGBA. -static const uvec8 kShuffleMaskARGBToRGBA = { - 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; - -// Convert ARGB to RGBA. -LIBYUV_API -int ARGBToRGBA(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba, - (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height); -} - -// Convert ARGB To RGB24. 
-LIBYUV_API -int ARGBToRGB24(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - int y; - void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = - ARGBToRGB24Row_C; - if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { - width *= height; - height = 1; - src_stride_argb = dst_stride_rgb24 = 0; - } -#if defined(HAS_ARGBTORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTORGB24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToRGB24Row = ARGBToRGB24Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) - if (TestCpuFlag(kCpuHasAVX512VBMI)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI; - if (IS_ALIGNED(width, 32)) { - ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI; - } - } -#endif -#if defined(HAS_ARGBTORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB24Row = ARGBToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB24Row = ARGBToRGB24Row_MMI; - } - } -#endif -#if defined(HAS_ARGBTORGB24ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToRGB24Row = ARGBToRGB24Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToRGB24Row(src_argb, dst_rgb24, width); - src_argb += src_stride_argb; - dst_rgb24 += dst_stride_rgb24; - } - return 0; -} - -// Convert ARGB To RAW. -LIBYUV_API -int ARGBToRAW(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - int y; - void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = - ARGBToRAWRow_C; - if (!src_argb || !dst_raw || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { - width *= height; - height = 1; - src_stride_argb = dst_stride_raw = 0; - } -#if defined(HAS_ARGBTORAWROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToRAWRow = ARGBToRAWRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTORAWROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRAWRow = ARGBToRAWRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToRAWRow = ARGBToRAWRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORAWROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRAWRow = ARGBToRAWRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRAWRow = ARGBToRAWRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTORAWROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRAWRow = ARGBToRAWRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRAWRow = ARGBToRAWRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTORAWROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRAWRow = ARGBToRAWRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToRAWRow = ARGBToRAWRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToRAWRow(src_argb, dst_raw, width); - src_argb += src_stride_argb; - dst_raw += dst_stride_raw; - } - return 0; -} - -// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8_t kDither565_4x4[16] = { - 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, -}; - -// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). -LIBYUV_API -int ARGBToRGB565Dither(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const uint8_t* dither4x4, - int width, - int height) { - int y; - void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = - ARGBToRGB565DitherRow_C; - if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - if (!dither4x4) { - dither4x4 = kDither565_4x4; - } -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToRGB565DitherRow(src_argb, dst_rgb565, - *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), - width); - src_argb += src_stride_argb; - dst_rgb565 += dst_stride_rgb565; - } - return 0; -} - -// Convert ARGB To RGB565. 
-// TODO(fbarchard): Consider using dither function low level with zeros. -LIBYUV_API -int ARGBToRGB565(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - int y; - void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb, - int width) = ARGBToRGB565Row_C; - if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { - width *= height; - height = 1; - src_stride_argb = dst_stride_rgb565 = 0; - } -#if defined(HAS_ARGBTORGB565ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_SSE2; - } - } -#endif -#if defined(HAS_ARGBTORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565Row = ARGBToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565Row = ARGBToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTORGB565ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_MMI; - } - } -#endif -#if defined(HAS_ARGBTORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565Row = ARGBToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToRGB565Row(src_argb, dst_rgb565, width); - src_argb += src_stride_argb; - dst_rgb565 += dst_stride_rgb565; - } - return 0; -} - -// Convert ARGB To ARGB1555. -LIBYUV_API -int ARGBToARGB1555(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height) { - int y; - void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb, - int width) = ARGBToARGB1555Row_C; - if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb1555 = 0; - } -#if defined(HAS_ARGBTOARGB1555ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; - } - } -#endif -#if defined(HAS_ARGBTOARGB1555ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOARGB1555ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTOARGB1555ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_MMI; - } - } -#endif -#if defined(HAS_ARGBTOARGB1555ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToARGB1555Row(src_argb, dst_argb1555, width); - src_argb += src_stride_argb; - dst_argb1555 += dst_stride_argb1555; - } - return 0; -} - -// Convert ARGB To ARGB4444. -LIBYUV_API -int ARGBToARGB4444(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb4444, - int dst_stride_argb4444, - int width, - int height) { - int y; - void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb, - int width) = ARGBToARGB4444Row_C; - if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb4444 = 0; - } -#if defined(HAS_ARGBTOARGB4444ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; - } - } -#endif -#if defined(HAS_ARGBTOARGB4444ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOARGB4444ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTOARGB4444ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_MMI; - } - } -#endif -#if defined(HAS_ARGBTOARGB4444ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToARGB4444Row(src_argb, dst_argb4444, width); - src_argb += src_stride_argb; - dst_argb4444 += dst_stride_argb4444; - } - return 0; -} - -// Convert ABGR To AR30. 
-LIBYUV_API -int ABGRToAR30(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - int y; - void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) = - ABGRToAR30Row_C; - if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } - // Coalesce rows. - if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) { - width *= height; - height = 1; - src_stride_abgr = dst_stride_ar30 = 0; - } -#if defined(HAS_ABGRTOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ABGRToAR30Row = ABGRToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToAR30Row = ABGRToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ABGRToAR30Row = ABGRToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - ABGRToAR30Row(src_abgr, dst_ar30, width); - src_abgr += src_stride_abgr; - dst_ar30 += dst_stride_ar30; - } - return 0; -} - -// Convert ARGB To AR30. -LIBYUV_API -int ARGBToAR30(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - int y; - void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = - ARGBToAR30Row_C; - if (!src_argb || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_ar30 = 0; - } -#if defined(HAS_ARGBTOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAR30Row = ARGBToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAR30Row = ARGBToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - ARGBToAR30Row(src_argb, dst_ar30, width); - src_argb += src_stride_argb; - dst_ar30 += dst_stride_ar30; - } - return 0; -} - -// Convert ARGB to J420. (JPeg full range I420). -LIBYUV_API -int ARGBToJ420(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = - ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - ARGBToUVJRow = ARGBToUVJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - ARGBToUVJRow = ARGBToUVJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVJRow = ARGBToUVJRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); - ARGBToYJRow(src_argb, dst_yj, width); - ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); - src_argb += src_stride_argb * 2; - dst_yj += dst_stride_yj * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYJRow(src_argb, dst_yj, width); - } - return 0; -} - -// Convert ARGB to J422. (JPeg full range I422). -LIBYUV_API -int ARGBToJ422(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = - ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_yj == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - ARGBToUVJRow = ARGBToUVJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - ARGBToUVJRow = ARGBToUVJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVJRow = ARGBToUVJRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYJRow(src_argb, dst_yj, width); - src_argb += src_stride_argb; - dst_yj += dst_stride_yj; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -// Convert ARGB to AR64. -LIBYUV_API -int ARGBToAR64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height) { - int y; - void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAR64Row_C; - if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_ar64 = 0; - } -#if defined(HAS_ARGBTOAR64ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAR64Row = ARGBToAR64Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOAR64ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOAR64ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAR64Row = ARGBToAR64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToAR64Row(src_argb, dst_ar64, width); - src_argb += src_stride_argb; - dst_ar64 += dst_stride_ar64; - } - return 0; -} - -// Convert ARGB to AB64. 
-LIBYUV_API -int ARGBToAB64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ab64, - int dst_stride_ab64, - int width, - int height) { - int y; - void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAB64Row_C; - if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_ab64 = 0; - } -#if defined(HAS_ARGBTOAB64ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAB64Row = ARGBToAB64Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOAB64ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOAB64ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAB64Row = ARGBToAB64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToAB64Row(src_argb, dst_ab64, width); - src_argb += src_stride_argb; - dst_ab64 += dst_stride_ab64; - } - return 0; -} - -// Convert ARGB to J400. -LIBYUV_API -int ARGBToJ400(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height) { - int y; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = - ARGBToYJRow_C; - if (!src_argb || !dst_yj || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_yj == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_yj = 0; - } -#if defined(HAS_ARGBTOYJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToYJRow(src_argb, dst_yj, width); - src_argb += src_stride_argb; - dst_yj += dst_stride_yj; - } - return 0; -} - -// Convert RGBA to J400. 
-LIBYUV_API -int RGBAToJ400(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height) { - int y; - void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) = - RGBAToYJRow_C; - if (!src_rgba || !dst_yj || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_rgba = src_rgba + (height - 1) * src_stride_rgba; - src_stride_rgba = -src_stride_rgba; - } - // Coalesce rows. - if (src_stride_rgba == width * 4 && dst_stride_yj == width) { - width *= height; - height = 1; - src_stride_rgba = dst_stride_yj = 0; - } -#if defined(HAS_RGBATOYJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToYJRow = RGBAToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_RGBATOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGBAToYJRow = RGBAToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - RGBAToYJRow = RGBAToYJRow_AVX2; - } - } -#endif -#if defined(HAS_RGBATOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGBAToYJRow = RGBAToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGBAToYJRow = RGBAToYJRow_NEON; - } - } -#endif -#if defined(HAS_RGBATOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGBAToYJRow = RGBAToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGBAToYJRow = RGBAToYJRow_MMI; - } - } -#endif -#if defined(HAS_RGBATOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGBAToYJRow = RGBAToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RGBAToYJRow(src_rgba, dst_yj, width); - src_rgba += src_stride_rgba; - dst_yj += dst_stride_yj; - } - return 0; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_jpeg.cc b/thirdparty/libyuv/source/convert_jpeg.cc deleted file mode 100644 index d7556ee..0000000 --- a/thirdparty/libyuv/source/convert_jpeg.cc +++ /dev/null @@ -1,602 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/convert.h" -#include "libyuv/convert_argb.h" - -#ifdef HAVE_JPEG -#include "libyuv/mjpeg_decoder.h" -#endif - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#ifdef HAVE_JPEG -struct I420Buffers { - uint8_t* y; - int y_stride; - uint8_t* u; - int u_stride; - uint8_t* v; - int v_stride; - int w; - int h; -}; - -static void JpegCopyI420(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, - dest->v_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -static void JpegI422ToI420(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, - dest->v_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -static void JpegI444ToI420(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, - dest->v_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -static void JpegI400ToI420(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u, - dest->u_stride, dest->v, dest->v_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -// Query size of MJPG in pixels. -LIBYUV_API -int MJPGSize(const uint8_t* src_mjpg, - size_t src_size_mjpg, - int* width, - int* height) { - MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); - if (ret) { - *width = mjpeg_decoder.GetWidth(); - *height = mjpeg_decoder.GetHeight(); - } - mjpeg_decoder.UnloadFrame(); - return ret ? 0 : -1; // -1 for runtime failure. -} - -// MJPG (Motion JPeg) to I420 -// TODO(fbarchard): review src_width and src_height requirement. dst_width and -// dst_height may be enough. -LIBYUV_API -int MJPGToI420(const uint8_t* src_mjpg, - size_t src_size_mjpg, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int src_width, - int src_height, - int dst_width, - int dst_height) { - if (src_size_mjpg == kUnknownDataSize) { - // ERROR: MJPEG frame size unknown - return -1; - } - - // TODO(fbarchard): Port MJpeg to C. 
- MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); - if (ret && (mjpeg_decoder.GetWidth() != src_width || - mjpeg_decoder.GetHeight() != src_height)) { - // ERROR: MJPEG frame has unexpected dimensions - mjpeg_decoder.UnloadFrame(); - return 1; // runtime failure - } - if (ret) { - I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, dst_width, dst_height}; - // YUV420 - if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 2 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width, - dst_height); - // YUV422 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width, - dst_height); - // YUV444 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width, - dst_height); - // YUV400 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceGrayscale && - mjpeg_decoder.GetNumComponents() == 1 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width, - dst_height); - } else { - // TODO(fbarchard): Implement conversion for any other - // colorspace/subsample factors that occur in practice. ERROR: Unable to - // convert MJPEG frame because format is not supported - mjpeg_decoder.UnloadFrame(); - return 1; - } - } - return ret ? 
0 : 1; -} - -struct NV21Buffers { - uint8_t* y; - int y_stride; - uint8_t* vu; - int vu_stride; - int w; - int h; -}; - -static void JpegI420ToNV21(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - I420ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI422ToNV21(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - I422ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI444ToNV21(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - I444ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI400ToNV21(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu, - dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -// MJPG (Motion JPeg) to NV21 -LIBYUV_API -int MJPGToNV21(const uint8_t* src_mjpg, - size_t src_size_mjpg, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int src_width, - int src_height, - int dst_width, - int dst_height) { - if (src_size_mjpg == kUnknownDataSize) { - // ERROR: MJPEG frame size unknown - return -1; - } - - // TODO(fbarchard): Port MJpeg to C. 
- MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); - if (ret && (mjpeg_decoder.GetWidth() != src_width || - mjpeg_decoder.GetHeight() != src_height)) { - // ERROR: MJPEG frame has unexpected dimensions - mjpeg_decoder.UnloadFrame(); - return 1; // runtime failure - } - if (ret) { - NV21Buffers bufs = {dst_y, dst_stride_y, dst_vu, - dst_stride_vu, dst_width, dst_height}; - // YUV420 - if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 2 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV21, &bufs, dst_width, - dst_height); - // YUV422 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV21, &bufs, dst_width, - dst_height); - // YUV444 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV21, &bufs, dst_width, - dst_height); - // YUV400 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceGrayscale && - mjpeg_decoder.GetNumComponents() == 1 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV21, &bufs, dst_width, - dst_height); - } else { - // Unknown colorspace. - mjpeg_decoder.UnloadFrame(); - return 1; - } - } - return ret ? 0 : 1; -} - -static void JpegI420ToNV12(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - // Use NV21 with VU swapped. - I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI422ToNV12(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - // Use NV21 with VU swapped. - I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI444ToNV12(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - // Use NV21 with VU swapped. 
- I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI400ToNV12(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - // Use NV21 since there is no UV plane. - I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu, - dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -// MJPG (Motion JPEG) to NV12. -LIBYUV_API -int MJPGToNV12(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int src_width, - int src_height, - int dst_width, - int dst_height) { - if (sample_size == kUnknownDataSize) { - // ERROR: MJPEG frame size unknown - return -1; - } - - // TODO(fbarchard): Port MJpeg to C. - MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && (mjpeg_decoder.GetWidth() != src_width || - mjpeg_decoder.GetHeight() != src_height)) { - // ERROR: MJPEG frame has unexpected dimensions - mjpeg_decoder.UnloadFrame(); - return 1; // runtime failure - } - if (ret) { - // Use NV21Buffers but with UV instead of VU. - NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv, - dst_stride_uv, dst_width, dst_height}; - // YUV420 - if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 2 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width, - dst_height); - // YUV422 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width, - dst_height); - // YUV444 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width, - dst_height); - // YUV400 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceGrayscale && - mjpeg_decoder.GetNumComponents() == 1 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width, - dst_height); - } else { - // Unknown colorspace. - mjpeg_decoder.UnloadFrame(); - return 1; - } - } - return ret ? 
0 : 1; -} - -struct ARGBBuffers { - uint8_t* argb; - int argb_stride; - int w; - int h; -}; - -static void JpegI420ToARGB(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->argb, dest->argb_stride, dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -static void JpegI422ToARGB(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->argb, dest->argb_stride, dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -static void JpegI444ToARGB(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->argb, dest->argb_stride, dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -static void JpegI400ToARGB(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -// MJPG (Motion JPeg) to ARGB -// TODO(fbarchard): review src_width and src_height requirement. dst_width and -// dst_height may be enough. -LIBYUV_API -int MJPGToARGB(const uint8_t* src_mjpg, - size_t src_size_mjpg, - uint8_t* dst_argb, - int dst_stride_argb, - int src_width, - int src_height, - int dst_width, - int dst_height) { - if (src_size_mjpg == kUnknownDataSize) { - // ERROR: MJPEG frame size unknown - return -1; - } - - // TODO(fbarchard): Port MJpeg to C. 
- MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); - if (ret && (mjpeg_decoder.GetWidth() != src_width || - mjpeg_decoder.GetHeight() != src_height)) { - // ERROR: MJPEG frame has unexpected dimensions - mjpeg_decoder.UnloadFrame(); - return 1; // runtime failure - } - if (ret) { - ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height}; - // YUV420 - if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 2 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width, - dst_height); - // YUV422 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width, - dst_height); - // YUV444 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width, - dst_height); - // YUV400 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceGrayscale && - mjpeg_decoder.GetNumComponents() == 1 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width, - dst_height); - } else { - // TODO(fbarchard): Implement conversion for any other - // colorspace/subsample factors that occur in practice. ERROR: Unable to - // convert MJPEG frame because format is not supported - mjpeg_decoder.UnloadFrame(); - return 1; - } - } - return ret ? 0 : 1; -} - -#endif // HAVE_JPEG - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_to_argb.cc b/thirdparty/libyuv/source/convert_to_argb.cc deleted file mode 100644 index 84df16c..0000000 --- a/thirdparty/libyuv/source/convert_to_argb.cc +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/convert_argb.h" - -#include "libyuv/cpu_id.h" -#ifdef HAVE_JPEG -#include "libyuv/mjpeg_decoder.h" -#endif -#include "libyuv/rotate_argb.h" -#include "libyuv/row.h" -#include "libyuv/video_common.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Convert camera sample to ARGB with cropping, rotation and vertical flip. -// src_width is used for source stride computation -// src_height is used to compute location of planes, and indicate inversion -// sample_size is measured in bytes and is the size of the frame. -// With MJPEG it is the compressed size of the frame. - -// TODO(fbarchard): Add the following: -// H010ToARGB -// I010ToARGB - -LIBYUV_API -int ConvertToARGB(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_argb, - int dst_stride_argb, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc) { - uint32_t format = CanonicalFourCC(fourcc); - int aligned_src_width = (src_width + 1) & ~1; - const uint8_t* src; - const uint8_t* src_uv; - int abs_src_height = (src_height < 0) ? -src_height : src_height; - int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; - int r = 0; - - // One pass rotation is available for some formats. For the rest, convert - // to ARGB (with optional vertical flipping) into a temporary ARGB buffer, - // and then rotate the ARGB to the final destination buffer. - // For in-place conversion, if destination dst_argb is same as source sample, - // also enable temporary buffer. - LIBYUV_BOOL need_buf = - (rotation && format != FOURCC_ARGB) || dst_argb == sample; - uint8_t* dest_argb = dst_argb; - int dest_dst_stride_argb = dst_stride_argb; - uint8_t* rotate_buffer = NULL; - int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - - if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 || - src_height == 0 || crop_height == 0) { - return -1; - } - if (src_height < 0) { - inv_crop_height = -inv_crop_height; - } - - if (need_buf) { - int argb_size = crop_width * 4 * abs_crop_height; - rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */ - if (!rotate_buffer) { - return 1; // Out of memory runtime error. 
- } - dst_argb = rotate_buffer; - dst_stride_argb = crop_width * 4; - } - - switch (format) { - // Single plane formats - case FOURCC_YUY2: - src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - break; - case FOURCC_UYVY: - src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - break; - case FOURCC_24BG: - src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_RAW: - src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_ARGB: - if (!need_buf && !rotation) { - src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - } - break; - case FOURCC_BGRA: - src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_ABGR: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_RGBA: - src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_AR30: - src = sample + (src_width * crop_y + crop_x) * 4; - r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_AB30: - src = sample + (src_width * crop_y + crop_x) * 4; - r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_RGBP: - src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - break; - case FOURCC_RGBO: - src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - break; - case FOURCC_R444: - src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - break; - case FOURCC_I400: - src = sample + src_width * crop_y + crop_x; - r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_J400: - src = sample + src_width * crop_y + crop_x; - r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - - // Biplanar formats - case FOURCC_NV12: - src = sample + (src_width * crop_y + crop_x); - src_uv = - sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; - r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, - dst_stride_argb, crop_width, inv_crop_height); - break; - case FOURCC_NV21: - src = sample + (src_width * crop_y + crop_x); - src_uv = - sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; - // Call NV12 but with u and v parameters swapped. 
- r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, - dst_stride_argb, crop_width, inv_crop_height); - break; - // Triplanar formats - case FOURCC_I420: - case FOURCC_YV12: { - const uint8_t* src_y = sample + (src_width * crop_y + crop_x); - const uint8_t* src_u; - const uint8_t* src_v; - int halfwidth = (src_width + 1) / 2; - int halfheight = (abs_src_height + 1) / 2; - if (format == FOURCC_YV12) { - src_v = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; - src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - } else { - src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; - src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - } - r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_J420: { - int halfwidth = (src_width + 1) / 2; - int halfheight = (abs_src_height + 1) / 2; - const uint8_t* src_y = sample + (src_width * crop_y + crop_x); - const uint8_t* src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; - const uint8_t* src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_H420: { - int halfwidth = (src_width + 1) / 2; - int halfheight = (abs_src_height + 1) / 2; - const uint8_t* src_y = sample + (src_width * crop_y + crop_x); - const uint8_t* src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; - const uint8_t* src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - r = H420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_U420: { - int halfwidth = (src_width + 1) / 2; - int halfheight = (abs_src_height + 1) / 2; - const uint8_t* src_y = sample + (src_width * crop_y + crop_x); - const uint8_t* src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; - const uint8_t* src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_I422: - case FOURCC_YV16: { - int halfwidth = (src_width + 1) / 2; - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + halfwidth * crop_y + - crop_x / 2; - src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; - } else { - src_u = sample + src_width * abs_src_height + halfwidth * crop_y + - crop_x / 2; - src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; - } - r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_J422: { - int halfwidth = (src_width + 1) / 2; - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u = - sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; - const uint8_t* src_v = 
sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; - r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_H422: { - int halfwidth = (src_width + 1) / 2; - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u = - sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; - const uint8_t* src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; - r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_U422: { - int halfwidth = (src_width + 1) / 2; - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u = - sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; - const uint8_t* src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; - r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_I444: - case FOURCC_YV24: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - if (format == FOURCC_YV24) { - src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - } else { - src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - } - r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_J444: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_H444: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_U444: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - -#ifdef HAVE_JPEG - case FOURCC_MJPG: - r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width, - abs_src_height, crop_width, inv_crop_height); - break; -#endif - default: - r = -1; // unknown fourcc - return failure code. 
- } - - if (need_buf) { - if (!r) { - r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb, - crop_width, abs_crop_height, rotation); - } - free(rotate_buffer); - } else if (rotation) { - src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height, rotation); - } - - return r; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_to_i420.cc b/thirdparty/libyuv/source/convert_to_i420.cc deleted file mode 100644 index ac6eeab..0000000 --- a/thirdparty/libyuv/source/convert_to_i420.cc +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "libyuv/convert.h" - -#include "libyuv/video_common.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Convert camera sample to I420 with cropping, rotation and vertical flip. -// src_width is used for source stride computation -// src_height is used to compute location of planes, and indicate inversion -// sample_size is measured in bytes and is the size of the frame. -// With MJPEG it is the compressed size of the frame. -LIBYUV_API -int ConvertToI420(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc) { - uint32_t format = CanonicalFourCC(fourcc); - int aligned_src_width = (src_width + 1) & ~1; - const uint8_t* src; - const uint8_t* src_uv; - const int abs_src_height = (src_height < 0) ? -src_height : src_height; - // TODO(nisse): Why allow crop_height < 0? - const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - int r = 0; - LIBYUV_BOOL need_buf = - (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && - format != FOURCC_NV21 && format != FOURCC_YV12) || - dst_y == sample; - uint8_t* tmp_y = dst_y; - uint8_t* tmp_u = dst_u; - uint8_t* tmp_v = dst_v; - int tmp_y_stride = dst_stride_y; - int tmp_u_stride = dst_stride_u; - int tmp_v_stride = dst_stride_v; - uint8_t* rotate_buffer = NULL; - const int inv_crop_height = - (src_height < 0) ? -abs_crop_height : abs_crop_height; - - if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || - crop_width <= 0 || src_height == 0 || crop_height == 0) { - return -1; - } - - // One pass rotation is available for some formats. For the rest, convert - // to I420 (with optional vertical flipping) into a temporary I420 buffer, - // and then rotate the I420 to the final destination buffer. - // For in-place conversion, if destination dst_y is same as source sample, - // also enable temporary buffer. - if (need_buf) { - int y_size = crop_width * abs_crop_height; - int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); - rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */ - if (!rotate_buffer) { - return 1; // Out of memory runtime error. 
- } - dst_y = rotate_buffer; - dst_u = dst_y + y_size; - dst_v = dst_u + uv_size; - dst_stride_y = crop_width; - dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); - } - - switch (format) { - // Single plane formats - case FOURCC_YUY2: - src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_UYVY: - src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_RGBP: - src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_RGBO: - src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_R444: - src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_24BG: - src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_RAW: - src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_ARGB: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_BGRA: - src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_ABGR: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_RGBA: - src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - // TODO(fbarchard): Add AR30 and AB30 - case FOURCC_I400: - src = sample + src_width * crop_y + crop_x; - r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, crop_width, inv_crop_height); - break; - // Biplanar formats - case FOURCC_NV12: - src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * abs_src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, - dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, crop_width, inv_crop_height, rotation); - break; - case FOURCC_NV21: - src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * abs_src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - // Call NV12 but with dst_u and dst_v parameters swapped. 
- r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, - dst_stride_y, dst_v, dst_stride_v, dst_u, - dst_stride_u, crop_width, inv_crop_height, rotation); - break; - // Triplanar formats - case FOURCC_I420: - case FOURCC_YV12: { - const uint8_t* src_y = sample + (src_width * crop_y + crop_x); - const uint8_t* src_u; - const uint8_t* src_v; - int halfwidth = (src_width + 1) / 2; - int halfheight = (abs_src_height + 1) / 2; - if (format == FOURCC_YV12) { - src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) + - (crop_x / 2); - src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2); - } else { - src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) + - (crop_x / 2); - src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2); - } - r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, crop_width, inv_crop_height, rotation); - break; - } - case FOURCC_I422: - case FOURCC_YV16: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - int halfwidth = (src_width + 1) / 2; - if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + halfwidth * crop_y + - (crop_x / 2); - src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + (crop_x / 2); - } else { - src_u = sample + src_width * abs_src_height + halfwidth * crop_y + - (crop_x / 2); - src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + (crop_x / 2); - } - r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, crop_width, inv_crop_height); - break; - } - case FOURCC_I444: - case FOURCC_YV24: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - if (format == FOURCC_YV24) { - src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - } else { - src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - } - r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, - dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, crop_width, inv_crop_height); - break; - } -#ifdef HAVE_JPEG - case FOURCC_MJPG: - r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, src_width, - abs_src_height, crop_width, inv_crop_height); - break; -#endif - default: - r = -1; // unknown fourcc - return failure code. - } - - if (need_buf) { - if (!r) { - r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, - tmp_v, tmp_v_stride, crop_width, abs_crop_height, - rotation); - } - free(rotate_buffer); - } - - return r; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/cpu_id.cc b/thirdparty/libyuv/source/cpu_id.cc deleted file mode 100644 index fe89452..0000000 --- a/thirdparty/libyuv/source/cpu_id.cc +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/cpu_id.h" - -#if defined(_MSC_VER) -#include // For __cpuidex() -#endif -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ - !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ - defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) -#include // For _xgetbv() -#endif - -// For ArmCpuCaps() but unittested on all platforms -#include -#include - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// For functions that use the stack and have runtime checks for overflow, -// use SAFEBUFFERS to avoid additional check. -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \ - !defined(__clang__) -#define SAFEBUFFERS __declspec(safebuffers) -#else -#define SAFEBUFFERS -#endif - -// cpu_info_ variable for SIMD instruction sets detected. -LIBYUV_API int cpu_info_ = 0; - -// TODO(fbarchard): Consider using int for cpuid so casting is not needed. -// Low level cpuid for X86. -#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ - defined(__x86_64__)) && \ - !defined(__pnacl__) && !defined(__CLR_VER) -LIBYUV_API -void CpuId(int info_eax, int info_ecx, int* cpu_info) { -#if defined(_MSC_VER) -// Visual C version uses intrinsic or inline x86 assembly. -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - __cpuidex(cpu_info, info_eax, info_ecx); -#elif defined(_M_IX86) - __asm { - mov eax, info_eax - mov ecx, info_ecx - mov edi, cpu_info - cpuid - mov [edi], eax - mov [edi + 4], ebx - mov [edi + 8], ecx - mov [edi + 12], edx - } -#else // Visual C but not x86 - if (info_ecx == 0) { - __cpuid(cpu_info, info_eax); - } else { - cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u; - } -#endif -// GCC version uses inline x86 assembly. -#else // defined(_MSC_VER) - int info_ebx, info_edx; - asm volatile( -#if defined(__i386__) && defined(__PIC__) - // Preserve ebx for fpic 32 bit. - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=D"(info_ebx), -#else - "cpuid \n" - : "=b"(info_ebx), -#endif // defined( __i386__) && defined(__PIC__) - "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); - cpu_info[0] = info_eax; - cpu_info[1] = info_ebx; - cpu_info[2] = info_ecx; - cpu_info[3] = info_edx; -#endif // defined(_MSC_VER) -} -#else // (defined(_M_IX86) || defined(_M_X64) ... -LIBYUV_API -void CpuId(int eax, int ecx, int* cpu_info) { - (void)eax; - (void)ecx; - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -} -#endif - -// For VS2010 and earlier emit can be used: -// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. -// __asm { -// xor ecx, ecx // xcr 0 -// xgetbv -// mov xcr0, eax -// } -// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. -// https://code.google.com/p/libyuv/issues/detail?id=529 -#if defined(_M_IX86) && (_MSC_VER < 1900) -#pragma optimize("g", off) -#endif -#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ - defined(__x86_64__)) && \ - !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) -// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. 
-int GetXCR0() { - int xcr0 = 0; -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT -#elif defined(__i386__) || defined(__x86_64__) - asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); -#endif // defined(__i386__) || defined(__x86_64__) - return xcr0; -} -#else -// xgetbv unavailable to query for OSSave support. Return 0. -#define GetXCR0() 0 -#endif // defined(_M_IX86) || defined(_M_X64) .. -// Return optimization to previous setting. -#if defined(_M_IX86) && (_MSC_VER < 1900) -#pragma optimize("g", on) -#endif - -// based on libvpx arm_cpudetect.c -// For Arm, but public to allow testing on any CPU -LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { - char cpuinfo_line[512]; - FILE* f = fopen(cpuinfo_name, "r"); - if (!f) { - // Assume Neon if /proc/cpuinfo is unavailable. - // This will occur for Chrome sandbox for Pepper or Render process. - return kCpuHasNEON; - } - while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { - if (memcmp(cpuinfo_line, "Features", 8) == 0) { - char* p = strstr(cpuinfo_line, " neon"); - if (p && (p[5] == ' ' || p[5] == '\n')) { - fclose(f); - return kCpuHasNEON; - } - // aarch64 uses asimd for Neon. - p = strstr(cpuinfo_line, " asimd"); - if (p) { - fclose(f); - return kCpuHasNEON; - } - } - } - fclose(f); - return 0; -} - -// TODO(fbarchard): Consider read_msa_ir(). -LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { - char cpuinfo_line[512]; - int flag = 0x0; - FILE* f = fopen(cpuinfo_name, "r"); - if (!f) { - // Assume nothing if /proc/cpuinfo is unavailable. - // This will occur for Chrome sandbox for Pepper or Render process. - return 0; - } - while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { - if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { - // Workaround early kernel without mmi in ASEs line. - if (strstr(cpuinfo_line, "Loongson-3")) { - flag |= kCpuHasMMI; - } else if (strstr(cpuinfo_line, "Loongson-2K")) { - flag |= kCpuHasMMI | kCpuHasMSA; - } - } - if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { - if (strstr(cpuinfo_line, "loongson-mmi") && - strstr(cpuinfo_line, "loongson-ext")) { - flag |= kCpuHasMMI; - } - if (strstr(cpuinfo_line, "msa")) { - flag |= kCpuHasMSA; - } - // ASEs is the last line, so we can break here. - break; - } - } - fclose(f); - return flag; -} - -static SAFEBUFFERS int GetCpuFlags(void) { - int cpu_info = 0; -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ - (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ - defined(_M_IX86)) - int cpu_info0[4] = {0, 0, 0, 0}; - int cpu_info1[4] = {0, 0, 0, 0}; - int cpu_info7[4] = {0, 0, 0, 0}; - CpuId(0, 0, cpu_info0); - CpuId(1, 0, cpu_info1); - if (cpu_info0[0] >= 7) { - CpuId(7, 0, cpu_info7); - } - cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | - ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | - ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | - ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | - ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); - - // AVX requires OS saves YMM registers. - if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave - ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers - cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | - ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | - ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); - - // Detect AVX512bw - if ((GetXCR0() & 0xe0) == 0xe0) { - cpu_info |= (cpu_info7[1] & 0x40000000) ? 
kCpuHasAVX512BW : 0; - cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; - cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; - cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; - cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; - cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; - cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; - } - } -#endif -#if defined(__mips__) && defined(__linux__) - cpu_info = MipsCpuCaps("/proc/cpuinfo"); - cpu_info |= kCpuHasMIPS; -#endif -#if defined(__arm__) || defined(__aarch64__) -// gcc -mfpu=neon defines __ARM_NEON__ -// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. -// For Linux, /proc/cpuinfo can be tested but without that assume Neon. -#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) - cpu_info = kCpuHasNEON; -// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon -// flag in it. -// So for aarch64, neon enabling is hard coded here. -#endif -#if defined(__aarch64__) - cpu_info = kCpuHasNEON; -#else - // Linux arm parse text file for neon detect. - cpu_info = ArmCpuCaps("/proc/cpuinfo"); -#endif - cpu_info |= kCpuHasARM; -#endif // __arm__ - cpu_info |= kCpuInitialized; - return cpu_info; -} - -// Note that use of this function is not thread safe. -LIBYUV_API -int MaskCpuFlags(int enable_flags) { - int cpu_info = GetCpuFlags() & enable_flags; - SetCpuFlags(cpu_info); - return cpu_info; -} - -LIBYUV_API -int InitCpuFlags(void) { - return MaskCpuFlags(-1); -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/mjpeg_decoder.cc b/thirdparty/libyuv/source/mjpeg_decoder.cc deleted file mode 100644 index adba832..0000000 --- a/thirdparty/libyuv/source/mjpeg_decoder.cc +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/mjpeg_decoder.h" - -#ifdef HAVE_JPEG -#include <assert.h> - -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) -// Must be included before jpeglib. -#include <setjmp.h> -#define HAVE_SETJMP - -#if defined(_MSC_VER) -// disable warning 4324: structure was padded due to __declspec(align()) -#pragma warning(disable : 4324) -#endif - -#endif - -#include <stdio.h> // For jpeglib.h. - -// C++ build requires extern C for jpeg internals. -#ifdef __cplusplus -extern "C" { -#endif - -#include <jpeglib.h> - -#ifdef __cplusplus -} // extern "C" -#endif - -#include "libyuv/planar_functions.h" // For CopyPlane(). - -namespace libyuv { - -#ifdef HAVE_SETJMP -struct SetJmpErrorMgr { - jpeg_error_mgr base; // Must be at the top - jmp_buf setjmp_buffer; -}; -#endif - -const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN; -const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE; -const int MJpegDecoder::kColorSpaceRgb = JCS_RGB; -const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr; -const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK; -const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK; - -// Methods that are passed to jpeglib.
-boolean fill_input_buffer(jpeg_decompress_struct* cinfo); -void init_source(jpeg_decompress_struct* cinfo); -void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT -void term_source(jpeg_decompress_struct* cinfo); -void ErrorHandler(jpeg_common_struct* cinfo); -void OutputHandler(jpeg_common_struct* cinfo); - -MJpegDecoder::MJpegDecoder() - : has_scanline_padding_(LIBYUV_FALSE), - num_outbufs_(0), - scanlines_(NULL), - scanlines_sizes_(NULL), - databuf_(NULL), - databuf_strides_(NULL) { - decompress_struct_ = new jpeg_decompress_struct; - source_mgr_ = new jpeg_source_mgr; -#ifdef HAVE_SETJMP - error_mgr_ = new SetJmpErrorMgr; - decompress_struct_->err = jpeg_std_error(&error_mgr_->base); - // Override standard exit()-based error handler. - error_mgr_->base.error_exit = &ErrorHandler; - error_mgr_->base.output_message = &OutputHandler; -#endif - decompress_struct_->client_data = NULL; - source_mgr_->init_source = &init_source; - source_mgr_->fill_input_buffer = &fill_input_buffer; - source_mgr_->skip_input_data = &skip_input_data; - source_mgr_->resync_to_restart = &jpeg_resync_to_restart; - source_mgr_->term_source = &term_source; - jpeg_create_decompress(decompress_struct_); - decompress_struct_->src = source_mgr_; - buf_vec_.buffers = &buf_; - buf_vec_.len = 1; -} - -MJpegDecoder::~MJpegDecoder() { - jpeg_destroy_decompress(decompress_struct_); - delete decompress_struct_; - delete source_mgr_; -#ifdef HAVE_SETJMP - delete error_mgr_; -#endif - DestroyOutputBuffers(); -} - -LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { - if (!ValidateJpeg(src, src_len)) { - return LIBYUV_FALSE; - } - - buf_.data = src; - buf_.len = static_cast<int>(src_len); - buf_vec_.pos = 0; - decompress_struct_->client_data = &buf_vec_; -#ifdef HAVE_SETJMP - if (setjmp(error_mgr_->setjmp_buffer)) { - // We called jpeg_read_header, it experienced an error, and we called - // longjmp() and rewound the stack to here. Return error. - return LIBYUV_FALSE; - } -#endif - if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) { - // ERROR: Bad MJPEG header - return LIBYUV_FALSE; - } - AllocOutputBuffers(GetNumComponents()); - for (int i = 0; i < num_outbufs_; ++i) { - int scanlines_size = GetComponentScanlinesPerImcuRow(i); - if (scanlines_sizes_[i] != scanlines_size) { - if (scanlines_[i]) { - delete scanlines_[i]; - } - scanlines_[i] = new uint8_t*[scanlines_size]; - scanlines_sizes_[i] = scanlines_size; - } - - // We allocate padding for the final scanline to pad it up to DCTSIZE bytes - // to avoid memory errors, since jpeglib only reads full MCUs blocks. For - // the preceding scanlines, the padding is not needed/wanted because the - // following addresses will already be valid (they are the initial bytes of - // the next scanline) and will be overwritten when jpeglib writes out that - // next scanline.
- int databuf_stride = GetComponentStride(i); - int databuf_size = scanlines_size * databuf_stride; - if (databuf_strides_[i] != databuf_stride) { - if (databuf_[i]) { - delete databuf_[i]; - } - databuf_[i] = new uint8_t[databuf_size]; - databuf_strides_[i] = databuf_stride; - } - - if (GetComponentStride(i) != GetComponentWidth(i)) { - has_scanline_padding_ = LIBYUV_TRUE; - } - } - return LIBYUV_TRUE; -} - -static int DivideAndRoundUp(int numerator, int denominator) { - return (numerator + denominator - 1) / denominator; -} - -static int DivideAndRoundDown(int numerator, int denominator) { - return numerator / denominator; -} - -// Returns width of the last loaded frame. -int MJpegDecoder::GetWidth() { - return decompress_struct_->image_width; -} - -// Returns height of the last loaded frame. -int MJpegDecoder::GetHeight() { - return decompress_struct_->image_height; -} - -// Returns format of the last loaded frame. The return value is one of the -// kColorSpace* constants. -int MJpegDecoder::GetColorSpace() { - return decompress_struct_->jpeg_color_space; -} - -// Number of color components in the color space. -int MJpegDecoder::GetNumComponents() { - return decompress_struct_->num_components; -} - -// Sample factors of the n-th component. -int MJpegDecoder::GetHorizSampFactor(int component) { - return decompress_struct_->comp_info[component].h_samp_factor; -} - -int MJpegDecoder::GetVertSampFactor(int component) { - return decompress_struct_->comp_info[component].v_samp_factor; -} - -int MJpegDecoder::GetHorizSubSampFactor(int component) { - return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component); -} - -int MJpegDecoder::GetVertSubSampFactor(int component) { - return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component); -} - -int MJpegDecoder::GetImageScanlinesPerImcuRow() { - return decompress_struct_->max_v_samp_factor * DCTSIZE; -} - -int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) { - int vs = GetVertSubSampFactor(component); - return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs); -} - -int MJpegDecoder::GetComponentWidth(int component) { - int hs = GetHorizSubSampFactor(component); - return DivideAndRoundUp(GetWidth(), hs); -} - -int MJpegDecoder::GetComponentHeight(int component) { - int vs = GetVertSubSampFactor(component); - return DivideAndRoundUp(GetHeight(), vs); -} - -// Get width in bytes padded out to a multiple of DCTSIZE -int MJpegDecoder::GetComponentStride(int component) { - return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1); -} - -int MJpegDecoder::GetComponentSize(int component) { - return GetComponentWidth(component) * GetComponentHeight(component); -} - -LIBYUV_BOOL MJpegDecoder::UnloadFrame() { -#ifdef HAVE_SETJMP - if (setjmp(error_mgr_->setjmp_buffer)) { - // We called jpeg_abort_decompress, it experienced an error, and we called - // longjmp() and rewound the stack to here. Return error. - return LIBYUV_FALSE; - } -#endif - jpeg_abort_decompress(decompress_struct_); - return LIBYUV_TRUE; -} - -// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. -LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes, - int dst_width, - int dst_height) { - if (dst_width != GetWidth() || dst_height > GetHeight()) { - // ERROR: Bad dimensions - return LIBYUV_FALSE; - } -#ifdef HAVE_SETJMP - if (setjmp(error_mgr_->setjmp_buffer)) { - // We called into jpeglib, it experienced an error sometime during this - // function call, and we called longjmp() and rewound the stack to here. 
- // Return error. - return LIBYUV_FALSE; - } -#endif - if (!StartDecode()) { - return LIBYUV_FALSE; - } - SetScanlinePointers(databuf_); - int lines_left = dst_height; - // Compute amount of lines to skip to implement vertical crop. - // TODO(fbarchard): Ensure skip is a multiple of maximum component - // subsample. ie 2 - int skip = (GetHeight() - dst_height) / 2; - if (skip > 0) { - // There is no API to skip lines in the output data, so we read them - // into the temp buffer. - while (skip >= GetImageScanlinesPerImcuRow()) { - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - skip -= GetImageScanlinesPerImcuRow(); - } - if (skip > 0) { - // Have a partial iMCU row left over to skip. Must read it and then - // copy the parts we want into the destination. - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - for (int i = 0; i < num_outbufs_; ++i) { - // TODO(fbarchard): Compute skip to avoid this - assert(skip % GetVertSubSampFactor(i) == 0); - int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); - int scanlines_to_copy = - GetComponentScanlinesPerImcuRow(i) - rows_to_skip; - int data_to_skip = rows_to_skip * GetComponentStride(i); - CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i], - GetComponentWidth(i), GetComponentWidth(i), - scanlines_to_copy); - planes[i] += scanlines_to_copy * GetComponentWidth(i); - } - lines_left -= (GetImageScanlinesPerImcuRow() - skip); - } - } - - // Read full MCUs but cropped horizontally - for (; lines_left > GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - for (int i = 0; i < num_outbufs_; ++i) { - int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); - CopyPlane(databuf_[i], GetComponentStride(i), planes[i], - GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); - planes[i] += scanlines_to_copy * GetComponentWidth(i); - } - } - - if (lines_left > 0) { - // Have a partial iMCU row left over to decode. - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - for (int i = 0; i < num_outbufs_; ++i) { - int scanlines_to_copy = - DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); - CopyPlane(databuf_[i], GetComponentStride(i), planes[i], - GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); - planes[i] += scanlines_to_copy * GetComponentWidth(i); - } - } - return FinishDecode(); -} - -LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, - void* opaque, - int dst_width, - int dst_height) { - if (dst_width != GetWidth() || dst_height > GetHeight()) { - // ERROR: Bad dimensions - return LIBYUV_FALSE; - } -#ifdef HAVE_SETJMP - if (setjmp(error_mgr_->setjmp_buffer)) { - // We called into jpeglib, it experienced an error sometime during this - // function call, and we called longjmp() and rewound the stack to here. - // Return error. - return LIBYUV_FALSE; - } -#endif - if (!StartDecode()) { - return LIBYUV_FALSE; - } - SetScanlinePointers(databuf_); - int lines_left = dst_height; - // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop - int skip = (GetHeight() - dst_height) / 2; - if (skip > 0) { - while (skip >= GetImageScanlinesPerImcuRow()) { - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - skip -= GetImageScanlinesPerImcuRow(); - } - if (skip > 0) { - // Have a partial iMCU row left over to skip. 
- if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - for (int i = 0; i < num_outbufs_; ++i) { - // TODO(fbarchard): Compute skip to avoid this - assert(skip % GetVertSubSampFactor(i) == 0); - int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); - int data_to_skip = rows_to_skip * GetComponentStride(i); - // Change our own data buffer pointers so we can pass them to the - // callback. - databuf_[i] += data_to_skip; - } - int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip; - (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy); - // Now change them back. - for (int i = 0; i < num_outbufs_; ++i) { - int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); - int data_to_skip = rows_to_skip * GetComponentStride(i); - databuf_[i] -= data_to_skip; - } - lines_left -= scanlines_to_copy; - } - } - // Read full MCUs until we get to the crop point. - for (; lines_left >= GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow()); - } - if (lines_left > 0) { - // Have a partial iMCU row left over to decode. - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - (*fn)(opaque, databuf_, databuf_strides_, lines_left); - } - return FinishDecode(); -} - -void init_source(j_decompress_ptr cinfo) { - fill_input_buffer(cinfo); -} - -boolean fill_input_buffer(j_decompress_ptr cinfo) { - BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data); - if (buf_vec->pos >= buf_vec->len) { - // Don't assert-fail when fuzzing. -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - assert(0 && "No more data"); -#endif - // ERROR: No more data - return FALSE; - } - cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data; - cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len; - ++buf_vec->pos; - return TRUE; -} - -void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT - jpeg_source_mgr* src = cinfo->src; - size_t bytes = static_cast<size_t>(num_bytes); - if (bytes > src->bytes_in_buffer) { - src->next_input_byte = nullptr; - src->bytes_in_buffer = 0; - } else { - src->next_input_byte += bytes; - src->bytes_in_buffer -= bytes; - } -} - -void term_source(j_decompress_ptr cinfo) { - (void)cinfo; // Nothing to do. -} - -#ifdef HAVE_SETJMP -void ErrorHandler(j_common_ptr cinfo) { -// This is called when a jpeglib command experiences an error. Unfortunately -// jpeglib's error handling model is not very flexible, because it expects the -// error handler to not return--i.e., it wants the program to terminate. To -// recover from errors we use setjmp() as shown in their example. setjmp() is -// C's implementation for the "call with current continuation" functionality -// seen in some functional programming languages. -// A formatted message can be output, but is unsafe for release. -#ifdef DEBUG - char buf[JMSG_LENGTH_MAX]; - (*cinfo->err->format_message)(cinfo, buf); -// ERROR: Error in jpeglib: buf -#endif - - SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err); - // This rewinds the call stack to the point of the corresponding setjmp() - // and causes it to return (for a second time) with value 1. - longjmp(mgr->setjmp_buffer, 1); -} - -// Suppress fprintf warnings.
-void OutputHandler(j_common_ptr cinfo) { - (void)cinfo; -} - -#endif // HAVE_SETJMP - -void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { - if (num_outbufs != num_outbufs_) { - // We could perhaps optimize this case to resize the output buffers without - // necessarily having to delete and recreate each one, but it's not worth - // it. - DestroyOutputBuffers(); - - scanlines_ = new uint8_t**[num_outbufs]; - scanlines_sizes_ = new int[num_outbufs]; - databuf_ = new uint8_t*[num_outbufs]; - databuf_strides_ = new int[num_outbufs]; - - for (int i = 0; i < num_outbufs; ++i) { - scanlines_[i] = NULL; - scanlines_sizes_[i] = 0; - databuf_[i] = NULL; - databuf_strides_[i] = 0; - } - - num_outbufs_ = num_outbufs; - } -} - -void MJpegDecoder::DestroyOutputBuffers() { - for (int i = 0; i < num_outbufs_; ++i) { - delete[] scanlines_[i]; - delete[] databuf_[i]; - } - delete[] scanlines_; - delete[] databuf_; - delete[] scanlines_sizes_; - delete[] databuf_strides_; - scanlines_ = NULL; - databuf_ = NULL; - scanlines_sizes_ = NULL; - databuf_strides_ = NULL; - num_outbufs_ = 0; -} - -// JDCT_IFAST and do_block_smoothing improve performance substantially. -LIBYUV_BOOL MJpegDecoder::StartDecode() { - decompress_struct_->raw_data_out = TRUE; - decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default - decompress_struct_->dither_mode = JDITHER_NONE; - // Not applicable to 'raw': - decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE); - // Only for buffered mode: - decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE); - // Blocky but fast: - decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE); - - if (!jpeg_start_decompress(decompress_struct_)) { - // ERROR: Couldn't start JPEG decompressor"; - return LIBYUV_FALSE; - } - return LIBYUV_TRUE; -} - -LIBYUV_BOOL MJpegDecoder::FinishDecode() { - // jpeglib considers it an error if we finish without decoding the whole - // image, so we call "abort" rather than "finish". - jpeg_abort_decompress(decompress_struct_); - return LIBYUV_TRUE; -} - -void MJpegDecoder::SetScanlinePointers(uint8_t** data) { - for (int i = 0; i < num_outbufs_; ++i) { - uint8_t* data_i = data[i]; - for (int j = 0; j < scanlines_sizes_[i]; ++j) { - scanlines_[i][j] = data_i; - data_i += GetComponentStride(i); - } - } -} - -inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() { - return (unsigned int)(GetImageScanlinesPerImcuRow()) == - jpeg_read_raw_data(decompress_struct_, scanlines_, - GetImageScanlinesPerImcuRow()); -} - -// The helper function which recognizes the jpeg sub-sampling type. -JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( - int* subsample_x, - int* subsample_y, - int number_of_components) { - if (number_of_components == 3) { // Color images. - if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && - subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) { - return kJpegYuv420; - } - if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && - subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) { - return kJpegYuv422; - } - if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 && - subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) { - return kJpegYuv444; - } - } else if (number_of_components == 1) { // Grey-scale images. 
- if (subsample_x[0] == 1 && subsample_y[0] == 1) { - return kJpegYuv400; - } - } - return kJpegUnknown; -} - -} // namespace libyuv -#endif // HAVE_JPEG diff --git a/thirdparty/libyuv/source/mjpeg_validate.cc b/thirdparty/libyuv/source/mjpeg_validate.cc deleted file mode 100644 index ba0a03a..0000000 --- a/thirdparty/libyuv/source/mjpeg_validate.cc +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/mjpeg_decoder.h" - -#include <string.h> // For memchr. - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Helper function to scan for EOI marker (0xff 0xd9). -static LIBYUV_BOOL ScanEOI(const uint8_t* src_mjpg, size_t src_size_mjpg) { - if (src_size_mjpg >= 2) { - const uint8_t* end = src_mjpg + src_size_mjpg - 1; - const uint8_t* it = src_mjpg; - while (it < end) { - // TODO(fbarchard): scan for 0xd9 instead. - it = (const uint8_t*)(memchr(it, 0xff, end - it)); - if (it == NULL) { - break; - } - if (it[1] == 0xd9) { - return LIBYUV_TRUE; // Success: Valid jpeg. - } - ++it; // Skip over current 0xff. - } - } - // ERROR: Invalid jpeg end code not found. Size src_size_mjpg - return LIBYUV_FALSE; -} - -// Helper function to validate the jpeg appears intact. -LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) { - // Maximum size that ValidateJpeg will consider valid. - const size_t kMaxJpegSize = 0x7fffffffull; - const size_t kBackSearchSize = 1024; - if (src_size_mjpg < 64 || src_size_mjpg > kMaxJpegSize || !src_mjpg) { - // ERROR: Invalid jpeg size: src_size_mjpg - return LIBYUV_FALSE; - } - // SOI marker - if (src_mjpg[0] != 0xff || src_mjpg[1] != 0xd8 || src_mjpg[2] != 0xff) { - // ERROR: Invalid jpeg initial start code - return LIBYUV_FALSE; - } - - // Look for the End Of Image (EOI) marker near the end of the buffer. - if (src_size_mjpg > kBackSearchSize) { - if (ScanEOI(src_mjpg + src_size_mjpg - kBackSearchSize, kBackSearchSize)) { - return LIBYUV_TRUE; // Success: Valid jpeg. - } - // Reduce search size for forward search. - src_size_mjpg = src_size_mjpg - kBackSearchSize + 1; - } - // Step over SOI marker and scan for EOI. - return ScanEOI(src_mjpg + 2, src_size_mjpg - 2); -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/planar_functions.cc b/thirdparty/libyuv/source/planar_functions.cc deleted file mode 100644 index 7cea06c..0000000 --- a/thirdparty/libyuv/source/planar_functions.cc +++ /dev/null @@ -1,5063 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree.
- */ - -#include "libyuv/planar_functions.h" - -#include <assert.h> -#include <string.h> // for memset() - -#include "libyuv/cpu_id.h" -#ifdef HAVE_JPEG -#include "libyuv/mjpeg_decoder.h" -#endif -#include "libyuv/row.h" -#include "libyuv/scale_row.h" // for ScaleRowDown2 - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Copy a plane of data -LIBYUV_API -void CopyPlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } - // Nothing to do. - if (src_y == dst_y && src_stride_y == dst_stride_y) { - return; - } - -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; - } -#endif - - // Copy plane - for (y = 0; y < height; ++y) { - CopyRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// TODO(fbarchard): Consider support for negative height. -// TODO(fbarchard): Consider stride measured in bytes. -LIBYUV_API -void CopyPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C; - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } -#if defined(HAS_COPYROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_16_SSE2; - } -#endif -#if defined(HAS_COPYROW_16_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_16_ERMS; - } -#endif -#if defined(HAS_COPYROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_16_NEON; - } -#endif - - // Copy plane - for (y = 0; y < height; ++y) { - CopyRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Convert a plane of 16 bit data to 8 bit -LIBYUV_API -void Convert16To8Plane(const uint16_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int scale, // 16384 for 10 bits - int width, - int height) { - int y; - void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, - int width) = Convert16To8Row_C; - - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows.
- if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } -#if defined(HAS_CONVERT16TO8ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - Convert16To8Row = Convert16To8Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - Convert16To8Row = Convert16To8Row_SSSE3; - } - } -#endif -#if defined(HAS_CONVERT16TO8ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - Convert16To8Row = Convert16To8Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - Convert16To8Row = Convert16To8Row_AVX2; - } - } -#endif - - // Convert plane - for (y = 0; y < height; ++y) { - Convert16To8Row(src_y, dst_y, scale, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Convert a plane of 8 bit data to 16 bit -LIBYUV_API -void Convert8To16Plane(const uint8_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int scale, // 16384 for 10 bits - int width, - int height) { - int y; - void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale, - int width) = Convert8To16Row_C; - - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } -#if defined(HAS_CONVERT8TO16ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - Convert8To16Row = Convert8To16Row_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - Convert8To16Row = Convert8To16Row_SSE2; - } - } -#endif -#if defined(HAS_CONVERT8TO16ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - Convert8To16Row = Convert8To16Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - Convert8To16Row = Convert8To16Row_AVX2; - } - } -#endif - - // Convert plane - for (y = 0; y < height; ++y) { - Convert8To16Row(src_y, dst_y, scale, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Copy I422. -LIBYUV_API -int I422Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); - return 0; -} - -// Copy I444. -LIBYUV_API -int I444Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); - return 0; -} - -// Copy I400. -LIBYUV_API -int I400ToI400(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - if (!src_y || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - return 0; -} - -// Convert I420 to I400. -LIBYUV_API -int I420ToI400(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - (void)src_u; - (void)src_stride_u; - (void)src_v; - (void)src_stride_v; - if (!src_y || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - return 0; -} - -// Copy NV12. Supports inverting. -int NV12Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) { - return -1; - } - - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_uv = src_uv + (halfheight - 1) * src_stride_uv; - src_stride_y = -src_stride_y; - src_stride_uv = -src_stride_uv; - } - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2, - halfheight); - return 0; -} - -// Copy NV21. Supports inverting. -int NV21Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, - dst_stride_y, dst_vu, dst_stride_vu, width, height); -} - -// Support function for NV12 etc UV channels. -// Width and height are plane sizes (typically half pixel width). -LIBYUV_API -void SplitUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, - int width) = SplitUVRow_C; - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_u = dst_u + (height - 1) * dst_stride_u; - dst_v = dst_v + (height - 1) * dst_stride_v; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - // Coalesce rows. - if (src_stride_uv == width * 2 && dst_stride_u == width && - dst_stride_v == width) { - width *= height; - height = 1; - src_stride_uv = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_SPLITUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_AVX2; - } - } -#endif -#if defined(HAS_SPLITUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_NEON; - } - } -#endif -#if defined(HAS_SPLITUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitUVRow = SplitUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_MMI; - } - } -#endif -#if defined(HAS_SPLITUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Copy a row of UV. - SplitUVRow(src_uv, dst_u, dst_v, width); - dst_u += dst_stride_u; - dst_v += dst_stride_v; - src_uv += src_stride_uv; - } -} - -LIBYUV_API -void MergeUVPlane(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_uv, int width) = MergeUVRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_uv = dst_uv + (height - 1) * dst_stride_uv; - dst_stride_uv = -dst_stride_uv; - } - // Coalesce rows. - if (src_stride_u == width && src_stride_v == width && - dst_stride_uv == width * 2) { - width *= height; - height = 1; - src_stride_u = src_stride_v = dst_stride_uv = 0; - } -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MergeUVRow = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow = MergeUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow = MergeUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - MergeUVRow = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow = MergeUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Merge a row of U and V into a row of UV. - MergeUVRow(src_u, src_v, dst_uv, width); - src_u += src_stride_u; - src_v += src_stride_v; - dst_uv += dst_stride_uv; - } -} - -// Support function for P010 etc UV channels. -// Width and height are plane sizes (typically half pixel width). 
-LIBYUV_API -void SplitUVPlane_16(const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height, - int depth) { - int y; - void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u, - uint16_t* dst_v, int depth, int width) = - SplitUVRow_16_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_u = dst_u + (height - 1) * dst_stride_u; - dst_v = dst_v + (height - 1) * dst_stride_v; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - // Coalesce rows. - if (src_stride_uv == width * 2 && dst_stride_u == width && - dst_stride_v == width) { - width *= height; - height = 1; - src_stride_uv = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_SPLITUVROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow_16 = SplitUVRow_16_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow_16 = SplitUVRow_16_AVX2; - } - } -#endif -#if defined(HAS_SPLITUVROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow_16 = SplitUVRow_16_Any_NEON; - if (IS_ALIGNED(width, 8)) { - SplitUVRow_16 = SplitUVRow_16_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Copy a row of UV. - SplitUVRow_16(src_uv, dst_u, dst_v, depth, width); - dst_u += dst_stride_u; - dst_v += dst_stride_v; - src_uv += src_stride_uv; - } -} - -LIBYUV_API -void MergeUVPlane_16(const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height, - int depth) { - int y; - void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v, - uint16_t* dst_uv, int depth, int width) = - MergeUVRow_16_C; - assert(depth >= 8); - assert(depth <= 16); - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_uv = dst_uv + (height - 1) * dst_stride_uv; - dst_stride_uv = -dst_stride_uv; - } - // Coalesce rows. - if (src_stride_u == width && src_stride_v == width && - dst_stride_uv == width * 2) { - width *= height; - height = 1; - src_stride_u = src_stride_v = dst_stride_uv = 0; - } -#if defined(HAS_MERGEUVROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow_16 = MergeUVRow_16_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeUVRow_16 = MergeUVRow_16_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow_16 = MergeUVRow_16_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeUVRow_16 = MergeUVRow_16_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Merge a row of U and V into a row of UV. - MergeUVRow_16(src_u, src_v, dst_uv, depth, width); - src_u += src_stride_u; - src_v += src_stride_v; - dst_uv += dst_stride_uv; - } -} - -// Convert plane from lsb to msb -LIBYUV_API -void ConvertToMSBPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height, - int depth) { - int y; - int scale = 1 << (16 - depth); - void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale, - int width) = MultiplyRow_16_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. 
- if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } - -#if defined(HAS_MULTIPLYROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MultiplyRow_16 = MultiplyRow_16_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MultiplyRow_16 = MultiplyRow_16_AVX2; - } - } -#endif -#if defined(HAS_MULTIPLYROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MultiplyRow_16 = MultiplyRow_16_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MultiplyRow_16 = MultiplyRow_16_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MultiplyRow_16(src_y, dst_y, scale, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Convert plane from msb to lsb -LIBYUV_API -void ConvertToLSBPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height, - int depth) { - int y; - int scale = 1 << depth; - void (*DivideRow)(const uint16_t* src_y, uint16_t* dst_y, int scale, - int width) = DivideRow_16_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } - -#if defined(HAS_DIVIDEROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - DivideRow = DivideRow_16_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - DivideRow = DivideRow_16_AVX2; - } - } -#endif -#if defined(HAS_DIVIDEROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - DivideRow = DivideRow_16_Any_NEON; - if (IS_ALIGNED(width, 16)) { - DivideRow = DivideRow_16_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - DivideRow(src_y, dst_y, scale, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Swap U and V channels in interleaved UV plane. -LIBYUV_API -void SwapUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int y; - void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) = - SwapUVRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_uv = src_uv + (height - 1) * src_stride_uv; - src_stride_uv = -src_stride_uv; - } - // Coalesce rows. - if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) { - width *= height; - height = 1; - src_stride_uv = dst_stride_vu = 0; - } - -#if defined(HAS_SWAPUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SwapUVRow = SwapUVRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - SwapUVRow = SwapUVRow_SSSE3; - } - } -#endif -#if defined(HAS_SWAPUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SwapUVRow = SwapUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - SwapUVRow = SwapUVRow_AVX2; - } - } -#endif -#if defined(HAS_SWAPUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SwapUVRow = SwapUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SwapUVRow = SwapUVRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - SwapUVRow(src_uv, dst_vu, width); - src_uv += src_stride_uv; - dst_vu += dst_stride_vu; - } -} - -// Convert NV21 to NV12. 
-LIBYUV_API -int NV21ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_vu || !dst_uv || width <= 0 || height == 0) { - return -1; - } - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_vu = src_vu + (halfheight - 1) * src_stride_vu; - src_stride_vu = -src_stride_vu; - } - - SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth, - halfheight); - return 0; -} - -// Support function for NV12 etc RGB channels. -// Width and height are plane sizes (typically half pixel width). -LIBYUV_API -void SplitRGBPlane(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int y; - void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, - uint8_t* dst_b, int width) = SplitRGBRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_r = dst_r + (height - 1) * dst_stride_r; - dst_g = dst_g + (height - 1) * dst_stride_g; - dst_b = dst_b + (height - 1) * dst_stride_b; - dst_stride_r = -dst_stride_r; - dst_stride_g = -dst_stride_g; - dst_stride_b = -dst_stride_b; - } - // Coalesce rows. - if (src_stride_rgb == width * 3 && dst_stride_r == width && - dst_stride_g == width && dst_stride_b == width) { - width *= height; - height = 1; - src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; - } -#if defined(HAS_SPLITRGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SplitRGBRow = SplitRGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - SplitRGBRow = SplitRGBRow_SSSE3; - } - } -#endif -#if defined(HAS_SPLITRGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitRGBRow = SplitRGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - SplitRGBRow = SplitRGBRow_MMI; - } - } -#endif -#if defined(HAS_SPLITRGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitRGBRow = SplitRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitRGBRow = SplitRGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Copy a row of RGB. - SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); - dst_r += dst_stride_r; - dst_g += dst_stride_g; - dst_b += dst_stride_b; - src_rgb += src_stride_rgb; - } -} - -LIBYUV_API -void MergeRGBPlane(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - uint8_t* dst_rgb, - int dst_stride_rgb, - int width, - int height) { - int y; - void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g, - const uint8_t* src_b, uint8_t* dst_rgb, int width) = - MergeRGBRow_C; - // Coalesce rows. - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; - dst_stride_rgb = -dst_stride_rgb; - } - // Coalesce rows. 
- if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - dst_stride_rgb == width * 3) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; - } -#if defined(HAS_MERGERGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MergeRGBRow = MergeRGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - MergeRGBRow = MergeRGBRow_SSSE3; - } - } -#endif -#if defined(HAS_MERGERGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeRGBRow = MergeRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeRGBRow = MergeRGBRow_NEON; - } - } -#endif -#if defined(HAS_MERGERGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeRGBRow = MergeRGBRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - MergeRGBRow = MergeRGBRow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Merge a row of U and V into a row of RGB. - MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_rgb += dst_stride_rgb; - } -} - -LIBYUV_NOINLINE -void SplitARGBPlaneAlpha(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - uint8_t* dst_a, - int dst_stride_a, - int width, - int height) { - int y; - void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, - uint8_t* dst_b, uint8_t* dst_a, int width) = - SplitARGBRow_C; - - assert(height > 0); - - if (src_stride_argb == width * 4 && dst_stride_r == width && - dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = - dst_stride_a = 0; - } - -#if defined(HAS_SPLITARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitARGBRow = SplitARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - SplitARGBRow = SplitARGBRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SplitARGBRow = SplitARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - SplitARGBRow = SplitARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_SPLITARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitARGBRow = SplitARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - SplitARGBRow = SplitARGBRow_AVX2; - } - } -#endif -#if defined(HAS_SPLITARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitARGBRow = SplitARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitARGBRow = SplitARGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); - dst_r += dst_stride_r; - dst_g += dst_stride_g; - dst_b += dst_stride_b; - dst_a += dst_stride_a; - src_argb += src_stride_argb; - } -} - -LIBYUV_NOINLINE -void SplitARGBPlaneOpaque(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int y; - void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, - uint8_t* dst_b, int width) = SplitXRGBRow_C; - assert(height > 0); - - if (src_stride_argb == width * 4 && dst_stride_r == width && - dst_stride_g == width && dst_stride_b == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0; - } - -#if defined(HAS_SPLITXRGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitXRGBRow = SplitXRGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSE2; - } - } 
-#endif -#if defined(HAS_SPLITXRGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SplitXRGBRow = SplitXRGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSSE3; - } - } -#endif -#if defined(HAS_SPLITXRGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitXRGBRow = SplitXRGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - SplitXRGBRow = SplitXRGBRow_AVX2; - } - } -#endif -#if defined(HAS_SPLITXRGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitXRGBRow = SplitXRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitXRGBRow = SplitXRGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); - dst_r += dst_stride_r; - dst_g += dst_stride_g; - dst_b += dst_stride_b; - src_argb += src_stride_argb; - } -} - -LIBYUV_API -void SplitARGBPlane(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - uint8_t* dst_a, - int dst_stride_a, - int width, - int height) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_r = dst_r + (height - 1) * dst_stride_r; - dst_g = dst_g + (height - 1) * dst_stride_g; - dst_b = dst_b + (height - 1) * dst_stride_b; - dst_a = dst_a + (height - 1) * dst_stride_a; - dst_stride_r = -dst_stride_r; - dst_stride_g = -dst_stride_g; - dst_stride_b = -dst_stride_b; - dst_stride_a = -dst_stride_a; - } - - if (dst_a == NULL) { - SplitARGBPlaneOpaque(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, - dst_stride_g, dst_b, dst_stride_b, width, height); - } else { - SplitARGBPlaneAlpha(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, - dst_stride_g, dst_b, dst_stride_b, dst_a, dst_stride_a, - width, height); - } -} - -LIBYUV_NOINLINE -void MergeARGBPlaneAlpha(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, - const uint8_t* src_b, const uint8_t* src_a, - uint8_t* dst_argb, int width) = MergeARGBRow_C; - - assert(height > 0); - - if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - src_stride_a == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = src_stride_a = - dst_stride_argb = 0; - } -#if defined(HAS_MERGEARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeARGBRow = MergeARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - MergeARGBRow = MergeARGBRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeARGBRow = MergeARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeARGBRow = MergeARGBRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeARGBRow = MergeARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeARGBRow = MergeARGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - src_a += src_stride_a; - dst_argb += dst_stride_argb; - } -} - -LIBYUV_NOINLINE -void MergeARGBPlaneOpaque(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - uint8_t* dst_argb, - int 
dst_stride_argb, - int width, - int height) { - int y; - void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, - const uint8_t* src_b, uint8_t* dst_argb, int width) = - MergeXRGBRow_C; - - assert(height > 0); - - if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; - } -#if defined(HAS_MERGEXRGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeXRGBRow = MergeXRGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - MergeXRGBRow = MergeXRGBRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEXRGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeXRGBRow = MergeXRGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeXRGBRow = MergeXRGBRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEXRGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeXRGBRow = MergeXRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeXRGBRow = MergeXRGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_argb += dst_stride_argb; - } -} - -LIBYUV_API -void MergeARGBPlane(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - - if (src_a == NULL) { - MergeARGBPlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, dst_argb, dst_stride_argb, width, - height); - } else { - MergeARGBPlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, src_a, src_stride_a, dst_argb, - dst_stride_argb, width, height); - } -} - -// TODO(yuan): Support 2 bit alpha channel. -LIBYUV_API -void MergeXR30Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height, - int depth) { - int y; - void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g, - const uint16_t* src_b, uint8_t* dst_ar30, int depth, - int width) = MergeXR30Row_C; - - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } - // Coalesce rows. 
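
Aside (not part of the diff): a minimal usage sketch for the SplitARGBPlane/MergeARGBPlane pair deleted above, splitting a packed ARGB buffer into per-channel planes and merging it back. The tightly packed strides and the assumption that both entry points are exposed via libyuv's public planar_functions.h are mine; passing NULL for the alpha plane would take the *Opaque paths instead, as the dispatch code above shows. The MergeXR30Plane body resumes in the diff below.

// Sketch only: round-trip a packed ARGB image through per-channel planes.
#include <stdint.h>
#include <vector>
#include "libyuv/planar_functions.h"

void RoundTripArgb(const uint8_t* src_argb, uint8_t* dst_argb,
                   int width, int height) {
  std::vector<uint8_t> r(width * height), g(width * height),
                       b(width * height), a(width * height);
  // Split: the packed image stride is width * 4 bytes, each plane is width.
  libyuv::SplitARGBPlane(src_argb, width * 4,
                         r.data(), width, g.data(), width,
                         b.data(), width, a.data(), width, width, height);
  // Merge back; passing NULL for src_a selects the opaque path instead.
  libyuv::MergeARGBPlane(r.data(), width, g.data(), width, b.data(), width,
                         a.data(), width, dst_argb, width * 4, width, height);
}
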
- if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - dst_stride_ar30 == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0; - } -#if defined(HAS_MERGEXR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeXR30Row = MergeXR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeXR30Row = MergeXR30Row_AVX2; - } - } -#endif -#if defined(HAS_MERGEXR30ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (depth == 10) { - MergeXR30Row = MergeXR30Row_10_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeXR30Row = MergeXR30Row_10_NEON; - } - } else { - MergeXR30Row = MergeXR30Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeXR30Row = MergeXR30Row_NEON; - } - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_ar30 += dst_stride_ar30; - } -} - -LIBYUV_NOINLINE -static void MergeAR64PlaneAlpha(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height, - int depth) { - int y; - void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g, - const uint16_t* src_b, const uint16_t* src_a, - uint16_t* dst_argb, int depth, int width) = - MergeAR64Row_C; - - if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - src_stride_a == width && dst_stride_ar64 == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = src_stride_a = - dst_stride_ar64 = 0; - } -#if defined(HAS_MERGEAR64ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeAR64Row = MergeAR64Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeAR64Row = MergeAR64Row_AVX2; - } - } -#endif -#if defined(HAS_MERGEAR64ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeAR64Row = MergeAR64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeAR64Row = MergeAR64Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - src_a += src_stride_a; - dst_ar64 += dst_stride_ar64; - } -} - -LIBYUV_NOINLINE -static void MergeAR64PlaneOpaque(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height, - int depth) { - int y; - void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g, - const uint16_t* src_b, uint16_t* dst_argb, int depth, - int width) = MergeXR64Row_C; - - // Coalesce rows. 
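
Aside on the "// Coalesce rows." blocks that recur in every function in this file: when every stride equals the visible row width (no padding between rows), the whole image is one contiguous run of pixels, so the function can treat it as a single row of width * height pixels and call the row kernel exactly once. The sketch below illustrates the idea with a made-up row kernel, independent of any libyuv function.

// Sketch of the row-coalescing trick; InvertRow stands in for any row kernel.
#include <stdint.h>

static void InvertRow(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) dst[x] = 255 - src[x];
}

void InvertPlane(const uint8_t* src, int src_stride,
                 uint8_t* dst, int dst_stride, int width, int height) {
  if (src_stride == width && dst_stride == width) {
    width *= height;              // whole image is contiguous: one long "row"
    height = 1;
    src_stride = dst_stride = 0;  // strides no longer need to advance anything
  }
  for (int y = 0; y < height; ++y) {
    InvertRow(src, dst, width);
    src += src_stride;
    dst += dst_stride;
  }
}
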
- if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - dst_stride_ar64 == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0; - } -#if defined(HAS_MERGEXR64ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeXR64Row = MergeXR64Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeXR64Row = MergeXR64Row_AVX2; - } - } -#endif -#if defined(HAS_MERGEXR64ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeXR64Row = MergeXR64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeXR64Row = MergeXR64Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_ar64 += dst_stride_ar64; - } -} - -LIBYUV_API -void MergeAR64Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height, - int depth) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64; - dst_stride_ar64 = -dst_stride_ar64; - } - - if (src_a == NULL) { - MergeAR64PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, dst_ar64, dst_stride_ar64, width, height, - depth); - } else { - MergeAR64PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, src_a, src_stride_a, dst_ar64, - dst_stride_ar64, width, height, depth); - } -} - -LIBYUV_NOINLINE -static void MergeARGB16To8PlaneAlpha(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int depth) { - int y; - void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, - const uint16_t* src_b, const uint16_t* src_a, - uint8_t* dst_argb, int depth, int width) = - MergeARGB16To8Row_C; - - if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - src_stride_a == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = src_stride_a = - dst_stride_argb = 0; - } -#if defined(HAS_MERGEARGB16TO8ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeARGB16To8Row = MergeARGB16To8Row_AVX2; - } - } -#endif -#if defined(HAS_MERGEARGB16TO8ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeARGB16To8Row = MergeARGB16To8Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - src_a += src_stride_a; - dst_argb += dst_stride_argb; - } -} - -LIBYUV_NOINLINE -static void MergeARGB16To8PlaneOpaque(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int depth) { - int y; - void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, - const uint16_t* src_b, uint8_t* dst_argb, int depth, - int width) = 
MergeXRGB16To8Row_C; - - // Coalesce rows. - if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; - } -#if defined(HAS_MERGEXRGB16TO8ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2; - } - } -#endif -#if defined(HAS_MERGEXRGB16TO8ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeXRGB16To8Row = MergeXRGB16To8Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_argb += dst_stride_argb; - } -} - -LIBYUV_API -void MergeARGB16To8Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int depth) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - - if (src_a == NULL) { - MergeARGB16To8PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, dst_argb, dst_stride_argb, width, - height, depth); - } else { - MergeARGB16To8PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, src_a, src_stride_a, dst_argb, - dst_stride_argb, width, height, depth); - } -} - -// Convert YUY2 to I422. -LIBYUV_API -int YUY2ToI422(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, - uint8_t* dst_v, int width) = YUY2ToUV422Row_C; - void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = - YUY2ToYRow_C; - if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; - src_stride_yuy2 = -src_stride_yuy2; - } - // Coalesce rows. 
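
Aside on the dispatch boilerplate repeated in each deleted function: the row-function pointer starts at the portable _C kernel, TestCpuFlag upgrades it to the widest SIMD variant the CPU reports, the _Any_ wrapper copes with widths that are not a multiple of the kernel's vector width, and the unsuffixed kernel is only chosen when IS_ALIGNED(width, N) holds. The sketch below is schematic; CopyRow_Scalar, CopyRow_Vector16 and CopyRow_Any are stand-ins, not libyuv kernels, and the real code tests TestCpuFlag(kCpuHasAVX2) and friends instead of a bool parameter.

// Schematic of the per-function kernel selection used throughout this file.
#include <stdint.h>

static void CopyRow_Scalar(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) dst[x] = src[x];
}
// Stand-in for a SIMD kernel that requires width to be a multiple of 16.
static void CopyRow_Vector16(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; x += 16)
    for (int i = 0; i < 16; ++i) dst[x + i] = src[x + i];
}
// Stand-in for an _Any_ wrapper: vector body plus a scalar tail.
static void CopyRow_Any(const uint8_t* src, uint8_t* dst, int width) {
  int vec = width & ~15;
  CopyRow_Vector16(src, dst, vec);
  CopyRow_Scalar(src + vec, dst + vec, width - vec);
}

void CopyPlaneSketch(const uint8_t* src, int src_stride,
                     uint8_t* dst, int dst_stride,
                     int width, int height, bool cpu_has_vector_unit) {
  void (*CopyRow)(const uint8_t*, uint8_t*, int) = CopyRow_Scalar;
  if (cpu_has_vector_unit) {          // real code: TestCpuFlag(kCpuHasAVX2)
    CopyRow = CopyRow_Any;            // handles any width
    if ((width & 15) == 0) CopyRow = CopyRow_Vector16;  // aligned fast path
  }
  for (int y = 0; y < height; ++y) {
    CopyRow(src, dst, width);
    src += src_stride;
    dst += dst_stride;
  }
}
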
- if (src_stride_yuy2 == width * 2 && dst_stride_y == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width && - width * height <= 32768) { - width *= height; - height = 1; - src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_YUY2TOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; - YUY2ToYRow = YUY2ToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_SSE2; - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; - YUY2ToYRow = YUY2ToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - YUY2ToUV422Row = YUY2ToUV422Row_AVX2; - YUY2ToYRow = YUY2ToYRow_AVX2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - YUY2ToYRow = YUY2ToYRow_Any_NEON; - YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_NEON; - YUY2ToUV422Row = YUY2ToUV422Row_NEON; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUV422ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToYRow = YUY2ToYRow_Any_MMI; - YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - YUY2ToYRow = YUY2ToYRow_MMI; - YUY2ToUV422Row = YUY2ToUV422Row_MMI; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - YUY2ToYRow = YUY2ToYRow_Any_MSA; - YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - YUY2ToYRow = YUY2ToYRow_MSA; - YUY2ToUV422Row = YUY2ToUV422Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); - YUY2ToYRow(src_yuy2, dst_y, width); - src_yuy2 += src_stride_yuy2; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -// Convert UYVY to I422. -LIBYUV_API -int UYVYToI422(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, - uint8_t* dst_v, int width) = UYVYToUV422Row_C; - void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = - UYVYToYRow_C; - if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; - src_stride_uyvy = -src_stride_uyvy; - } - // Coalesce rows. 
- if (src_stride_uyvy == width * 2 && dst_stride_y == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width && - width * height <= 32768) { - width *= height; - height = 1; - src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_UYVYTOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; - UYVYToYRow = UYVYToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - UYVYToUV422Row = UYVYToUV422Row_SSE2; - UYVYToYRow = UYVYToYRow_SSE2; - } - } -#endif -#if defined(HAS_UYVYTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; - UYVYToYRow = UYVYToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - UYVYToUV422Row = UYVYToUV422Row_AVX2; - UYVYToYRow = UYVYToYRow_AVX2; - } - } -#endif -#if defined(HAS_UYVYTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - UYVYToYRow = UYVYToYRow_Any_NEON; - UYVYToUV422Row = UYVYToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_NEON; - UYVYToUV422Row = UYVYToUV422Row_NEON; - } - } -#endif -#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUV422ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - UYVYToYRow = UYVYToYRow_Any_MMI; - UYVYToUV422Row = UYVYToUV422Row_Any_MMI; - if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_MMI; - UYVYToUV422Row = UYVYToUV422Row_MMI; - } - } -#endif -#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - UYVYToYRow = UYVYToYRow_Any_MSA; - UYVYToUV422Row = UYVYToUV422Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - UYVYToYRow = UYVYToYRow_MSA; - UYVYToUV422Row = UYVYToUV422Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); - UYVYToYRow(src_uyvy, dst_y, width); - src_uyvy += src_stride_uyvy; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -// Convert YUY2 to Y. -LIBYUV_API -int YUY2ToY(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = - YUY2ToYRow_C; - if (!src_yuy2 || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; - src_stride_yuy2 = -src_stride_yuy2; - } - // Coalesce rows. 
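
Aside: YUY2ToI422 and UYVYToI422 above undo packed 4:2:2. In YUY2 each 4-byte group is Y0 U Y1 V; in UYVY it is U Y0 V Y1. Both carry two luma samples per chroma pair, so the output U and V planes are half width but full height. The plain-C sketch below unpacks one YUY2 row; it matches the effect of the YUY2ToYRow_C / YUY2ToUV422Row_C pair but is not the library's code, and it assumes an even width for simplicity.

// Sketch: unpack one YUY2 row (byte layout per 2 pixels: Y0 U Y1 V).
#include <stdint.h>

void Yuy2RowToI422Row(const uint8_t* src_yuy2, uint8_t* dst_y,
                      uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_y[x + 0] = src_yuy2[0];  // Y0
    dst_y[x + 1] = src_yuy2[2];  // Y1
    dst_u[x / 2] = src_yuy2[1];  // U shared by both pixels
    dst_v[x / 2] = src_yuy2[3];  // V shared by both pixels
    src_yuy2 += 4;
  }
}
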
- if (src_stride_yuy2 == width * 2 && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_yuy2 = dst_stride_y = 0; - } -#if defined(HAS_YUY2TOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToYRow = YUY2ToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - YUY2ToYRow = YUY2ToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - YUY2ToYRow = YUY2ToYRow_AVX2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - YUY2ToYRow = YUY2ToYRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_NEON; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToYRow = YUY2ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - YUY2ToYRow = YUY2ToYRow_MMI; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - YUY2ToYRow = YUY2ToYRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - YUY2ToYRow = YUY2ToYRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - YUY2ToYRow(src_yuy2, dst_y, width); - src_yuy2 += src_stride_yuy2; - dst_y += dst_stride_y; - } - return 0; -} - -// Mirror a plane of data. -// See Also I400Mirror -LIBYUV_API -void MirrorPlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } -#if defined(HAS_MIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MirrorRow = MirrorRow_Any_NEON; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_NEON; - } - } -#endif -#if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; - } - } -#endif -#if defined(HAS_MIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MirrorRow = MirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_AVX2; - } - } -#endif -#if defined(HAS_MIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MirrorRow = MirrorRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - MirrorRow = MirrorRow_MMI; - } - } -#endif -#if defined(HAS_MIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MirrorRow = MirrorRow_Any_MSA; - if (IS_ALIGNED(width, 64)) { - MirrorRow = MirrorRow_MSA; - } - } -#endif - - // Mirror plane - for (y = 0; y < height; ++y) { - MirrorRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Mirror a plane of UV data. -LIBYUV_API -void MirrorUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) = - MirrorUVRow_C; - // Negative height means invert the image. 
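
Aside on the "negative height means invert the image" convention that every function here follows, including MirrorUVPlane whose body continues below: a negative height requests a vertically flipped result. The function takes |height|, points the source (or destination) base at its last row, and negates the stride, so the ordinary top-to-bottom loop walks that buffer bottom-up. A small sketch of the pointer arithmetic in isolation:

// Sketch: a plain row copy that honours the negative-height flip convention.
#include <stdint.h>

void CopyPlaneFlipAware(const uint8_t* src, int src_stride,
                        uint8_t* dst, int dst_stride,
                        int width, int height) {
  if (height < 0) {                          // caller asked for a vertical flip
    height = -height;
    src = src + (height - 1) * src_stride;   // start at the last source row
    src_stride = -src_stride;                // each step now moves up one row
  }
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) dst[x] = src[x];
    src += src_stride;
    dst += dst_stride;
  }
}
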
- if (height < 0) { - height = -height; - src_uv = src_uv + (height - 1) * src_stride_uv; - src_stride_uv = -src_stride_uv; - } -#if defined(HAS_MIRRORUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MirrorUVRow = MirrorUVRow_Any_NEON; - if (IS_ALIGNED(width, 32)) { - MirrorUVRow = MirrorUVRow_NEON; - } - } -#endif -#if defined(HAS_MIRRORUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorUVRow = MirrorUVRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - MirrorUVRow = MirrorUVRow_SSSE3; - } - } -#endif -#if defined(HAS_MIRRORUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MirrorUVRow = MirrorUVRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MirrorUVRow = MirrorUVRow_AVX2; - } - } -#endif -#if defined(HAS_MIRRORUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MirrorUVRow = MirrorUVRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - MirrorUVRow = MirrorUVRow_MSA; - } - } -#endif - - // MirrorUV plane - for (y = 0; y < height; ++y) { - MirrorUVRow(src_uv, dst_uv, width); - src_uv += src_stride_uv; - dst_uv += dst_stride_uv; - } -} - -// Mirror I400 with optional flipping -LIBYUV_API -int I400Mirror(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - if (!src_y || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - - MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - return 0; -} - -// Mirror I420 with optional flipping -LIBYUV_API -int I420Mirror(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); - return 0; -} - -// NV12 mirror. -LIBYUV_API -int NV12Mirror(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_uv = src_uv + (halfheight - 1) * src_stride_uv; - src_stride_y = -src_stride_y; - src_stride_uv = -src_stride_uv; - } - - if (dst_y) { - MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth, - halfheight); - return 0; -} - -// ARGB mirror. -LIBYUV_API -int ARGBMirror(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) = - ARGBMirrorRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBMIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBMirrorRow = ARGBMirrorRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBMirrorRow = ARGBMirrorRow_NEON; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBMirrorRow = ARGBMirrorRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBMirrorRow = ARGBMirrorRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBMirrorRow = ARGBMirrorRow_MMI; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBMirrorRow = ARGBMirrorRow_MSA; - } - } -#endif - - // Mirror plane - for (y = 0; y < height; ++y) { - ARGBMirrorRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// RGB24 mirror. -LIBYUV_API -int RGB24Mirror(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - int y; - void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = - RGB24MirrorRow_C; - if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; - src_stride_rgb24 = -src_stride_rgb24; - } -#if defined(HAS_RGB24MIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24MirrorRow = RGB24MirrorRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24MirrorRow = RGB24MirrorRow_NEON; - } - } -#endif -#if defined(HAS_RGB24MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24MirrorRow = RGB24MirrorRow_SSSE3; - } - } -#endif - - // Mirror plane - for (y = 0; y < height; ++y) { - RGB24MirrorRow(src_rgb24, dst_rgb24, width); - src_rgb24 += src_stride_rgb24; - dst_rgb24 += dst_stride_rgb24; - } - return 0; -} - -// Get a blender that optimized for the CPU and pixel count. -// As there are 6 blenders to choose from, the caller should try to use -// the same blend function for all pixels if possible. 
-LIBYUV_API -ARGBBlendRow GetARGBBlend() { - void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, - uint8_t* dst_argb, int width) = ARGBBlendRow_C; -#if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlendRow = ARGBBlendRow_SSSE3; - return ARGBBlendRow; - } -#endif -#if defined(HAS_ARGBBLENDROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBBlendRow = ARGBBlendRow_NEON; - } -#endif -#if defined(HAS_ARGBBLENDROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBBlendRow = ARGBBlendRow_MMI; - } -#endif -#if defined(HAS_ARGBBLENDROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBBlendRow = ARGBBlendRow_MSA; - } -#endif - return ARGBBlendRow; -} - -// Alpha Blend 2 ARGB images and store to destination. -LIBYUV_API -int ARGBBlend(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, - uint8_t* dst_argb, int width) = GetARGBBlend(); - if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. - if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; - } - - for (y = 0; y < height; ++y) { - ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); - src_argb0 += src_stride_argb0; - src_argb1 += src_stride_argb1; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Alpha Blend plane and store to destination. -LIBYUV_API -int BlendPlane(const uint8_t* src_y0, - int src_stride_y0, - const uint8_t* src_y1, - int src_stride_y1, - const uint8_t* alpha, - int alpha_stride, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, - const uint8_t* alpha, uint8_t* dst, int width) = - BlendPlaneRow_C; - if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - - // Coalesce rows for Y plane. 
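
Aside: a usage sketch for ARGBBlend above. The first source acts as the foreground whose alpha drives the blend, and the premultiplied formula quoted further down in this file (p = f + (1 - a) * b) suggests the foreground should be attenuated first, e.g. with ARGBAttenuate, which is also deleted later in this file. The tightly packed strides, the caller-provided scratch buffer, and the header path are assumptions of the example, not statements about the library's contract.

// Sketch: composite a foreground ARGB image over a background.
// fg_premul is caller-provided scratch of the same size as the inputs.
#include <stdint.h>
#include "libyuv/planar_functions.h"

void BlendOver(const uint8_t* fg_argb, const uint8_t* bg_argb,
               uint8_t* fg_premul, uint8_t* dst_argb, int width, int height) {
  // Premultiply the foreground by its alpha (the "attenuate" step).
  libyuv::ARGBAttenuate(fg_argb, width * 4, fg_premul, width * 4,
                        width, height);
  // First source = foreground (its alpha is used), second = background.
  libyuv::ARGBBlend(fg_premul, width * 4, bg_argb, width * 4,
                    dst_argb, width * 4, width, height);
}
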
- if (src_stride_y0 == width && src_stride_y1 == width && - alpha_stride == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; - } - -#if defined(HAS_BLENDPLANEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - BlendPlaneRow = BlendPlaneRow_SSSE3; - } - } -#endif -#if defined(HAS_BLENDPLANEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - BlendPlaneRow = BlendPlaneRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - BlendPlaneRow = BlendPlaneRow_AVX2; - } - } -#endif -#if defined(HAS_BLENDPLANEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - BlendPlaneRow = BlendPlaneRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - BlendPlaneRow = BlendPlaneRow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); - src_y0 += src_stride_y0; - src_y1 += src_stride_y1; - alpha += alpha_stride; - dst_y += dst_stride_y; - } - return 0; -} - -#define MAXTWIDTH 2048 -// Alpha Blend YUV images and store to destination. -LIBYUV_API -int I420Blend(const uint8_t* src_y0, - int src_stride_y0, - const uint8_t* src_u0, - int src_stride_u0, - const uint8_t* src_v0, - int src_stride_v0, - const uint8_t* src_y1, - int src_stride_y1, - const uint8_t* src_u1, - int src_stride_u1, - const uint8_t* src_v1, - int src_stride_v1, - const uint8_t* alpha, - int alpha_stride, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - // Half width/height for UV. - int halfwidth = (width + 1) >> 1; - void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, - const uint8_t* alpha, uint8_t* dst, int width) = - BlendPlaneRow_C; - void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C; - if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || - !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - - // Blend Y plane. 
- BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, - dst_y, dst_stride_y, width, height); - -#if defined(HAS_BLENDPLANEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; - if (IS_ALIGNED(halfwidth, 8)) { - BlendPlaneRow = BlendPlaneRow_SSSE3; - } - } -#endif -#if defined(HAS_BLENDPLANEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - BlendPlaneRow = BlendPlaneRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - BlendPlaneRow = BlendPlaneRow_AVX2; - } - } -#endif -#if defined(HAS_BLENDPLANEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - BlendPlaneRow = BlendPlaneRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - BlendPlaneRow = BlendPlaneRow_MMI; - } - } -#endif - if (!IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_C; - } -#if defined(HAS_SCALEROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON; - if (IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - ScaleRowDown2 = ScaleRowDown2Box_NEON; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3; - if (IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3; - if (IS_ALIGNED(halfwidth, 16)) { - ScaleRowDown2 = ScaleRowDown2Box_SSSE3; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2; - if (IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - ScaleRowDown2 = ScaleRowDown2Box_AVX2; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI; - if (IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - ScaleRowDown2 = ScaleRowDown2Box_MMI; - } - } - } -#endif - - // Row buffer for intermediate alpha pixels. - align_buffer_64(halfalpha, halfwidth); - for (y = 0; y < height; y += 2) { - // last row of odd height image use 1 row of alpha instead of 2. - if (y == (height - 1)) { - alpha_stride = 0; - } - // Subsample 2 rows of UV to half width and half height. - ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth); - alpha += alpha_stride * 2; - BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth); - BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth); - src_u0 += src_stride_u0; - src_u1 += src_stride_u1; - dst_u += dst_stride_u; - src_v0 += src_stride_v0; - src_v1 += src_stride_v1; - dst_v += dst_stride_v; - } - free_aligned_buffer_64(halfalpha); - return 0; -} - -// Multiply 2 ARGB images and store to destination. -LIBYUV_API -int ARGBMultiply(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, - uint8_t* dst, int width) = ARGBMultiplyRow_C; - if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
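
Aside on the chroma path of I420Blend above: U and V are half width and half height, so each loop iteration first reduces two rows of the full-resolution alpha plane to one half-width row (the last row of an odd-height image reuses a single alpha row, as the code notes), and that reduced row then drives BlendPlaneRow on the U and V rows. The sketch below is a plain rounded 2x2 average illustrating the reduction; the library uses its ScaleRowDown2Box kernels, whose exact rounding may differ.

// Sketch: reduce two full-resolution alpha rows to one half-width row by
// averaging each 2x2 block, as the UV pass of I420Blend needs.
#include <stdint.h>

void HalveAlphaRows(const uint8_t* alpha_row0, const uint8_t* alpha_row1,
                    uint8_t* half_alpha, int halfwidth) {
  for (int x = 0; x < halfwidth; ++x) {
    int sum = alpha_row0[2 * x] + alpha_row0[2 * x + 1] +
              alpha_row1[2 * x] + alpha_row1[2 * x + 1];
    half_alpha[x] = (uint8_t)((sum + 2) >> 2);  // rounded average of 4 samples
  }
}
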
- if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; - } -#if defined(HAS_ARGBMULTIPLYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBMultiplyRow = ARGBMultiplyRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBMULTIPLYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBMultiplyRow = ARGBMultiplyRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBMULTIPLYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBMultiplyRow = ARGBMultiplyRow_NEON; - } - } -#endif -#if defined(HAS_ARGBMULTIPLYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBMultiplyRow = ARGBMultiplyRow_MMI; - } - } -#endif -#if defined(HAS_ARGBMULTIPLYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; - if (IS_ALIGNED(width, 4)) { - ARGBMultiplyRow = ARGBMultiplyRow_MSA; - } - } -#endif - - // Multiply plane - for (y = 0; y < height; ++y) { - ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width); - src_argb0 += src_stride_argb0; - src_argb1 += src_stride_argb1; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Add 2 ARGB images and store to destination. -LIBYUV_API -int ARGBAdd(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, - int width) = ARGBAddRow_C; - if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; - } -#if defined(HAS_ARGBADDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAddRow = ARGBAddRow_SSE2; - } -#endif -#if defined(HAS_ARGBADDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAddRow = ARGBAddRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBAddRow = ARGBAddRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBADDROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAddRow = ARGBAddRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAddRow = ARGBAddRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBADDROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAddRow = ARGBAddRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAddRow = ARGBAddRow_NEON; - } - } -#endif -#if defined(HAS_ARGBADDROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAddRow = ARGBAddRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAddRow = ARGBAddRow_MMI; - } - } -#endif -#if defined(HAS_ARGBADDROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAddRow = ARGBAddRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAddRow = ARGBAddRow_MSA; - } - } -#endif - - // Add plane - for (y = 0; y < height; ++y) { - ARGBAddRow(src_argb0, src_argb1, dst_argb, width); - src_argb0 += src_stride_argb0; - src_argb1 += src_stride_argb1; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Subtract 2 ARGB images and store to destination. -LIBYUV_API -int ARGBSubtract(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1, - uint8_t* dst, int width) = ARGBSubtractRow_C; - if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; - } -#if defined(HAS_ARGBSUBTRACTROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBSubtractRow = ARGBSubtractRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBSUBTRACTROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBSubtractRow = ARGBSubtractRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBSUBTRACTROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBSubtractRow = ARGBSubtractRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBSubtractRow = ARGBSubtractRow_NEON; - } - } -#endif -#if defined(HAS_ARGBSUBTRACTROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBSubtractRow = ARGBSubtractRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBSubtractRow = ARGBSubtractRow_MMI; - } - } -#endif -#if defined(HAS_ARGBSUBTRACTROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBSubtractRow = ARGBSubtractRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBSubtractRow = ARGBSubtractRow_MSA; - } - } -#endif - - // Subtract plane - for (y = 0; y < height; ++y) { - ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); - src_argb0 += src_stride_argb0; - src_argb1 += src_stride_argb1; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert RAW to RGB24. -LIBYUV_API -int RAWToRGB24(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - int y; - void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) = - RAWToRGB24Row_C; - if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - // Coalesce rows. - if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) { - width *= height; - height = 1; - src_stride_raw = dst_stride_rgb24 = 0; - } -#if defined(HAS_RAWTORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - RAWToRGB24Row = RAWToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_RAWTORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToRGB24Row = RAWToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToRGB24Row = RAWToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_RAWTORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToRGB24Row = RAWToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RAWToRGB24Row = RAWToRGB24Row_MMI; - } - } -#endif -#if defined(HAS_RAWTORGB24ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToRGB24Row = RAWToRGB24Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToRGB24Row = RAWToRGB24Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RAWToRGB24Row(src_raw, dst_rgb24, width); - src_raw += src_stride_raw; - dst_rgb24 += dst_stride_rgb24; - } - return 0; -} - -LIBYUV_API -void SetPlane(uint8_t* dst_y, - int dst_stride_y, - int width, - int height, - uint32_t value) { - int y; - void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. 
- if (dst_stride_y == width) { - width *= height; - height = 1; - dst_stride_y = 0; - } -#if defined(HAS_SETROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SetRow = SetRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SetRow = SetRow_NEON; - } - } -#endif -#if defined(HAS_SETROW_X86) - if (TestCpuFlag(kCpuHasX86)) { - SetRow = SetRow_Any_X86; - if (IS_ALIGNED(width, 4)) { - SetRow = SetRow_X86; - } - } -#endif -#if defined(HAS_SETROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - SetRow = SetRow_ERMS; - } -#endif -#if defined(HAS_SETROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { - SetRow = SetRow_MSA; - } -#endif - - // Set plane - for (y = 0; y < height; ++y) { - SetRow(dst_y, value, width); - dst_y += dst_stride_y; - } -} - -// Draw a rectangle into I420 -LIBYUV_API -int I420Rect(uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int x, - int y, - int width, - int height, - int value_y, - int value_u, - int value_v) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - uint8_t* start_y = dst_y + y * dst_stride_y + x; - uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); - uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); - if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || - y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || - value_v < 0 || value_v > 255) { - return -1; - } - - SetPlane(start_y, dst_stride_y, width, height, value_y); - SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); - SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); - return 0; -} - -// Draw a rectangle into ARGB -LIBYUV_API -int ARGBRect(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height, - uint32_t value) { - int y; - void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = - ARGBSetRow_C; - if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { - return -1; - } - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - dst_argb += dst_y * dst_stride_argb + dst_x * 4; - // Coalesce rows. - if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } - -#if defined(HAS_ARGBSETROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBSetRow = ARGBSetRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { - ARGBSetRow = ARGBSetRow_NEON; - } - } -#endif -#if defined(HAS_ARGBSETROW_X86) - if (TestCpuFlag(kCpuHasX86)) { - ARGBSetRow = ARGBSetRow_X86; - } -#endif -#if defined(HAS_ARGBSETROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBSetRow = ARGBSetRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBSetRow = ARGBSetRow_MMI; - } - } -#endif -#if defined(HAS_ARGBSETROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBSetRow = ARGBSetRow_Any_MSA; - if (IS_ALIGNED(width, 4)) { - ARGBSetRow = ARGBSetRow_MSA; - } - } -#endif - - // Set plane - for (y = 0; y < height; ++y) { - ARGBSetRow(dst_argb, value, width); - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert unattentuated ARGB to preattenuated ARGB. 
-// An unattenutated ARGB alpha blend uses the formula -// p = a * f + (1 - a) * b -// where -// p is output pixel -// f is foreground pixel -// b is background pixel -// a is alpha value from foreground pixel -// An preattenutated ARGB alpha blend uses the formula -// p = f + (1 - a) * b -// where -// f is foreground pixel premultiplied by alpha - -LIBYUV_API -int ARGBAttenuate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBAttenuateRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert preattentuated ARGB to unattenuated ARGB. -LIBYUV_API -int ARGBUnattenuate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBUnattenuateRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBUNATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBUNATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; - } - } -#endif - // TODO(fbarchard): Neon version. 
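
Aside: a worked pixel for the two formulas quoted above. With alpha normalized to a = A / 255, a straight (unattenuated) blend computes p = a*f + (1 - a)*b per channel, while a premultiplied source stores f' = a*f so the blend reduces to p = f' + (1 - a)*b. The example below uses exact /255 integer arithmetic for illustration; the SIMD rows above use fixed-point approximations of the same thing, so results can differ by a unit or so.

// Worked example: premultiplying a channel and blending either way gives
// the same result up to rounding. Plain arithmetic, not libyuv row code.
#include <stdio.h>

int main(void) {
  int f = 200, b = 40, a = 128;                 // foreground, background, alpha
  int straight = (f * a + b * (255 - a)) / 255;           // p = a*f + (1-a)*b
  int f_premul = f * a / 255;                             // the attenuate step
  int premul = f_premul + b * (255 - a) / 255;            // p = f' + (1-a)*b
  printf("straight=%d premultiplied=%d\n", straight, premul);  // 120 vs 119
  return 0;
}
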
- - for (y = 0; y < height; ++y) { - ARGBUnattenuateRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert ARGB to Grayed ARGB. -LIBYUV_API -int ARGBGrayTo(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = - ARGBGrayRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBGRAYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_SSSE3; - } -#endif -#if defined(HAS_ARGBGRAYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_NEON; - } -#endif -#if defined(HAS_ARGBGRAYROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBGrayRow = ARGBGrayRow_MMI; - } -#endif -#if defined(HAS_ARGBGRAYROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_MSA; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBGrayRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Make a rectangle of ARGB gray scale. -LIBYUV_API -int ARGBGray(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height) { - int y; - void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = - ARGBGrayRow_C; - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { - return -1; - } - // Coalesce rows. - if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } -#if defined(HAS_ARGBGRAYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_SSSE3; - } -#endif -#if defined(HAS_ARGBGRAYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_NEON; - } -#endif -#if defined(HAS_ARGBGRAYROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBGrayRow = ARGBGrayRow_MMI; - } -#endif -#if defined(HAS_ARGBGRAYROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_MSA; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBGrayRow(dst, dst, width); - dst += dst_stride_argb; - } - return 0; -} - -// Make a rectangle of ARGB Sepia tone. -LIBYUV_API -int ARGBSepia(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height) { - int y; - void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { - return -1; - } - // Coalesce rows. 
- if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } -#if defined(HAS_ARGBSEPIAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBSepiaRow = ARGBSepiaRow_SSSE3; - } -#endif -#if defined(HAS_ARGBSEPIAROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBSepiaRow = ARGBSepiaRow_NEON; - } -#endif -#if defined(HAS_ARGBSEPIAROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBSepiaRow = ARGBSepiaRow_MMI; - } -#endif -#if defined(HAS_ARGBSEPIAROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBSepiaRow = ARGBSepiaRow_MSA; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBSepiaRow(dst, width); - dst += dst_stride_argb; - } - return 0; -} - -// Apply a 4x4 matrix to each ARGB pixel. -// Note: Normally for shading, but can be used to swizzle or invert. -LIBYUV_API -int ARGBColorMatrix(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - const int8_t* matrix_argb, - int width, - int height) { - int y; - void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb, - const int8_t* matrix_argb, int width) = - ARGBColorMatrixRow_C; - if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; - } -#endif -#if defined(HAS_ARGBCOLORMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; - } -#endif -#if defined(HAS_ARGBCOLORMATRIXROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_MMI; - } -#endif -#if defined(HAS_ARGBCOLORMATRIXROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; - } -#endif - for (y = 0; y < height; ++y) { - ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Apply a 4x3 matrix to each ARGB pixel. -// Deprecated. -LIBYUV_API -int RGBColorMatrix(uint8_t* dst_argb, - int dst_stride_argb, - const int8_t* matrix_rgb, - int dst_x, - int dst_y, - int width, - int height) { - SIMD_ALIGNED(int8_t matrix_argb[16]); - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 || - dst_y < 0) { - return -1; - } - - // Convert 4x3 7 bit matrix to 4x4 6 bit matrix. 
- matrix_argb[0] = matrix_rgb[0] / 2; - matrix_argb[1] = matrix_rgb[1] / 2; - matrix_argb[2] = matrix_rgb[2] / 2; - matrix_argb[3] = matrix_rgb[3] / 2; - matrix_argb[4] = matrix_rgb[4] / 2; - matrix_argb[5] = matrix_rgb[5] / 2; - matrix_argb[6] = matrix_rgb[6] / 2; - matrix_argb[7] = matrix_rgb[7] / 2; - matrix_argb[8] = matrix_rgb[8] / 2; - matrix_argb[9] = matrix_rgb[9] / 2; - matrix_argb[10] = matrix_rgb[10] / 2; - matrix_argb[11] = matrix_rgb[11] / 2; - matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; - matrix_argb[15] = 64; // 1.0 - - return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst, - dst_stride_argb, &matrix_argb[0], width, height); -} - -// Apply a color table each ARGB pixel. -// Table contains 256 ARGB values. -LIBYUV_API -int ARGBColorTable(uint8_t* dst_argb, - int dst_stride_argb, - const uint8_t* table_argb, - int dst_x, - int dst_y, - int width, - int height) { - int y; - void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, - int width) = ARGBColorTableRow_C; - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || - dst_y < 0) { - return -1; - } - // Coalesce rows. - if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } -#if defined(HAS_ARGBCOLORTABLEROW_X86) - if (TestCpuFlag(kCpuHasX86)) { - ARGBColorTableRow = ARGBColorTableRow_X86; - } -#endif - for (y = 0; y < height; ++y) { - ARGBColorTableRow(dst, table_argb, width); - dst += dst_stride_argb; - } - return 0; -} - -// Apply a color table each ARGB pixel but preserve destination alpha. -// Table contains 256 ARGB values. -LIBYUV_API -int RGBColorTable(uint8_t* dst_argb, - int dst_stride_argb, - const uint8_t* table_argb, - int dst_x, - int dst_y, - int width, - int height) { - int y; - void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, - int width) = RGBColorTableRow_C; - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || - dst_y < 0) { - return -1; - } - // Coalesce rows. - if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } -#if defined(HAS_RGBCOLORTABLEROW_X86) - if (TestCpuFlag(kCpuHasX86)) { - RGBColorTableRow = RGBColorTableRow_X86; - } -#endif - for (y = 0; y < height; ++y) { - RGBColorTableRow(dst, table_argb, width); - dst += dst_stride_argb; - } - return 0; -} - -// ARGBQuantize is used to posterize art. -// e.g. rgb / qvalue * qvalue + qvalue / 2 -// But the low levels implement efficiently with 3 parameters, and could be -// used for other high level operations. -// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; -// where scale is 1 / interval_size as a fixed point value. -// The divide is replaces with a multiply by reciprocal fixed point multiply. -// Caveat - although SSE2 saturates, the C function does not and should be used -// with care if doing anything but quantization. 
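
Aside: a worked example of the quantize arithmetic described in the comment above. For a posterization step of interval_size, the caller passes scale = 65536 / interval_size (the reciprocal in 16.16 fixed point), so (v * scale >> 16) reproduces v / interval_size without a divide; multiplying back by interval_size and adding interval_offset (typically interval_size / 2, per the comment) recentres each band. The concrete values below are illustrative only.

// Worked example of ARGBQuantize parameters for a posterize step of 32.
#include <stdio.h>

int main(void) {
  int interval_size = 32;
  int interval_offset = interval_size / 2;   // centre of each band
  int scale = 65536 / interval_size;         // 1/interval_size in 16.16
  for (int v = 0; v <= 255; v += 85) {
    int by_divide = v / interval_size * interval_size + interval_offset;
    int by_scale = (v * scale >> 16) * interval_size + interval_offset;
    printf("v=%3d  divide=%3d  fixed-point=%3d\n", v, by_divide, by_scale);
  }
  return 0;  // both columns agree: 16, 80, 176, 240
}
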
-LIBYUV_API -int ARGBQuantize(uint8_t* dst_argb, - int dst_stride_argb, - int scale, - int interval_size, - int interval_offset, - int dst_x, - int dst_y, - int width, - int height) { - int y; - void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, - int interval_offset, int width) = ARGBQuantizeRow_C; - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || - interval_size < 1 || interval_size > 255) { - return -1; - } - // Coalesce rows. - if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } -#if defined(HAS_ARGBQUANTIZEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { - ARGBQuantizeRow = ARGBQuantizeRow_SSE2; - } -#endif -#if defined(HAS_ARGBQUANTIZEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBQuantizeRow = ARGBQuantizeRow_NEON; - } -#endif -#if defined(HAS_ARGBQUANTIZEROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBQuantizeRow = ARGBQuantizeRow_MSA; - } -#endif - for (y = 0; y < height; ++y) { - ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); - dst += dst_stride_argb; - } - return 0; -} - -// Computes table of cumulative sum for image where the value is the sum -// of all values above and to the left of the entry. Used by ARGBBlur. -LIBYUV_API -int ARGBComputeCumulativeSum(const uint8_t* src_argb, - int src_stride_argb, - int32_t* dst_cumsum, - int dst_stride32_cumsum, - int width, - int height) { - int y; - void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, - const int32_t* previous_cumsum, int width) = - ComputeCumulativeSumRow_C; - int32_t* previous_cumsum = dst_cumsum; - if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { - return -1; - } -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; - } -#endif -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; - } -#endif - - memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. - for (y = 0; y < height; ++y) { - ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); - previous_cumsum = dst_cumsum; - dst_cumsum += dst_stride32_cumsum; - src_argb += src_stride_argb; - } - return 0; -} - -// Blur ARGB image. -// Caller should allocate CumulativeSum table of width * height * 16 bytes -// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory -// as the buffer is treated as circular. 
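As an editorial sketch of the allocation the comment above asks for (the buffer names and the stride choice of width * 4 int32_t per cumulative-sum row are assumptions, not taken from this file; align_buffer_64 / free_aligned_buffer_64 are the helpers used elsewhere in this source):

    // Hypothetical caller-side setup for ARGBBlur; src/dst planes provided by the caller.
    int radius = 4;
    int cumsum_rows = radius * 2 + 2;                    // circular buffer, per the comment above
    align_buffer_64(cumsum, width * cumsum_rows * 16);   // 16 bytes (4 x int32_t) per pixel
    ARGBBlur(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
             (int32_t*)cumsum, width * 4 /* stride in int32_t units */,
             width, height, radius);
    free_aligned_buffer_64(cumsum);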
-LIBYUV_API -int ARGBBlur(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int32_t* dst_cumsum, - int dst_stride32_cumsum, - int width, - int height, - int radius) { - int y; - void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, - const int32_t* previous_cumsum, int width) = - ComputeCumulativeSumRow_C; - void (*CumulativeSumToAverageRow)( - const int32_t* topleft, const int32_t* botleft, int width, int area, - uint8_t* dst, int count) = CumulativeSumToAverageRow_C; - int32_t* cumsum_bot_row; - int32_t* max_cumsum_bot_row; - int32_t* cumsum_top_row; - - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - if (radius > height) { - radius = height; - } - if (radius > (width / 2 - 1)) { - radius = width / 2 - 1; - } - if (radius <= 0) { - return -1; - } -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; - CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; - } -#endif -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; - } -#endif - // Compute enough CumulativeSum for first row to be blurred. After this - // one row of CumulativeSum is updated at a time. - ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, - dst_stride32_cumsum, width, radius); - - src_argb = src_argb + radius * src_stride_argb; - cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; - - max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum]; - cumsum_top_row = &dst_cumsum[0]; - - for (y = 0; y < height; ++y) { - int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0; - int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1); - int area = radius * (bot_y - top_y); - int boxwidth = radius * 4; - int x; - int n; - - // Increment cumsum_top_row pointer with circular buffer wrap around. - if (top_y) { - cumsum_top_row += dst_stride32_cumsum; - if (cumsum_top_row >= max_cumsum_bot_row) { - cumsum_top_row = dst_cumsum; - } - } - // Increment cumsum_bot_row pointer with circular buffer wrap around and - // then fill in a row of CumulativeSum. - if ((y + radius) < height) { - const int32_t* prev_cumsum_bot_row = cumsum_bot_row; - cumsum_bot_row += dst_stride32_cumsum; - if (cumsum_bot_row >= max_cumsum_bot_row) { - cumsum_bot_row = dst_cumsum; - } - ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row, - width); - src_argb += src_stride_argb; - } - - // Left clipped. - for (x = 0; x < radius + 1; ++x) { - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, - &dst_argb[x * 4], 1); - area += (bot_y - top_y); - boxwidth += 4; - } - - // Middle unclipped. - n = (width - 1) - radius - x + 1; - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, - &dst_argb[x * 4], n); - - // Right clipped. - for (x += n; x <= width - 1; ++x) { - area -= (bot_y - top_y); - boxwidth -= 4; - CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, - cumsum_bot_row + (x - radius - 1) * 4, boxwidth, - area, &dst_argb[x * 4], 1); - } - dst_argb += dst_stride_argb; - } - return 0; -} - -// Multiply ARGB image by a specified ARGB value. 
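The value argument of ARGBShade below appears to act as a per-channel gain in 8.8 fixed point (0x80 roughly halves a channel, 0xFF leaves it nearly unchanged); that reading is an editorial gloss on the row functions rather than documented behaviour. A hedged example that darkens all four channels to about half intensity:

    // Hypothetical call; exact rounding depends on the row function selected at runtime.
    ARGBShade(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
              width, height, 0x80808080u);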
-LIBYUV_API -int ARGBShade(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - uint32_t value) { - int y; - void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width, - uint32_t value) = ARGBShadeRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBSHADEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { - ARGBShadeRow = ARGBShadeRow_SSE2; - } -#endif -#if defined(HAS_ARGBSHADEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBShadeRow = ARGBShadeRow_NEON; - } -#endif -#if defined(HAS_ARGBSHADEROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBShadeRow = ARGBShadeRow_MMI; - } -#endif -#if defined(HAS_ARGBSHADEROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { - ARGBShadeRow = ARGBShadeRow_MSA; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBShadeRow(src_argb, dst_argb, width, value); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Interpolate 2 planes by specified amount (0 to 255). -LIBYUV_API -int InterpolatePlane(const uint8_t* src0, - int src_stride0, - const uint8_t* src1, - int src_stride1, - uint8_t* dst, - int dst_stride, - int width, - int height, - int interpolation) { - int y; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - if (!src0 || !src1 || !dst || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst = dst + (height - 1) * dst_stride; - dst_stride = -dst_stride; - } - // Coalesce rows. - if (src_stride0 == width && src_stride1 == width && dst_stride == width) { - width *= height; - height = 1; - src_stride0 = src_stride1 = dst_stride = 0; - } -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - InterpolateRow(dst, src0, src1 - src0, width, interpolation); - src0 += src_stride0; - src1 += src_stride1; - dst += dst_stride; - } - return 0; -} - -// Interpolate 2 ARGB images by specified amount (0 to 255). 
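The plane interpolator above (and the ARGB wrapper that follows) lends itself to simple cross-fades. As a sketch, with the interpolation amount read from the 0-to-255 range in the comments (0 giving the first source, 128 an even blend, 255 essentially the second source; buffer names are invented):

    // Hypothetical 50/50 blend of two Y planes of identical size.
    InterpolatePlane(frame0_y, stride0, frame1_y, stride1,
                     blended_y, dst_stride, width, height, 128);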
-LIBYUV_API -int ARGBInterpolate(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int interpolation) { - return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1, - src_stride_argb1, dst_argb, dst_stride_argb, - width * 4, height, interpolation); -} - -// Interpolate 2 YUV images by specified amount (0 to 255). -LIBYUV_API -int I420Interpolate(const uint8_t* src0_y, - int src0_stride_y, - const uint8_t* src0_u, - int src0_stride_u, - const uint8_t* src0_v, - int src0_stride_v, - const uint8_t* src1_y, - int src1_stride_y, - const uint8_t* src1_u, - int src1_stride_u, - const uint8_t* src1_v, - int src1_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - int interpolation) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v || - !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y, - dst_stride_y, width, height, interpolation); - InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u, - dst_stride_u, halfwidth, halfheight, interpolation); - InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v, - dst_stride_v, halfwidth, halfheight, interpolation); - return 0; -} - -// Shuffle ARGB channel order. e.g. BGRA to ARGB. -LIBYUV_API -int ARGBShuffle(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_argb, - int dst_stride_argb, - const uint8_t* shuffler, - int width, - int height) { - int y; - void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb, - const uint8_t* shuffler, int width) = ARGBShuffleRow_C; - if (!src_bgra || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_bgra = src_bgra + (height - 1) * src_stride_bgra; - src_stride_bgra = -src_stride_bgra; - } - // Coalesce rows. - if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_bgra = dst_stride_argb = 0; - } -#if defined(HAS_ARGBSHUFFLEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - ARGBShuffleRow = ARGBShuffleRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGBShuffleRow = ARGBShuffleRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBShuffleRow = ARGBShuffleRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { - ARGBShuffleRow = ARGBShuffleRow_NEON; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBShuffleRow = ARGBShuffleRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBShuffleRow = ARGBShuffleRow_MMI; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBShuffleRow = ARGBShuffleRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBShuffleRow = ARGBShuffleRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); - src_bgra += src_stride_bgra; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Shuffle AR64 channel order. e.g. AR64 to AB64. 
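For ARGBShuffle above, the shuffler is a table of byte indices applied within each 4-byte pixel; the SIMD paths appear to read it as a full 16-byte mask covering four pixels, so repeating the pattern is the safe form. An illustrative, editor-supplied mask that swaps the blue and red channels:

    // Hypothetical mask: four pixels of {2, 1, 0, 3}.
    static const uint8_t kSwapBR[16] = {2, 1, 0, 3,  6,  5,  4,  7,
                                        10, 9, 8, 11, 14, 13, 12, 15};
    ARGBShuffle(src_argb, src_stride, dst_argb, dst_stride, kSwapBR, width, height);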
-LIBYUV_API -int AR64Shuffle(const uint16_t* src_ar64, - int src_stride_ar64, - uint16_t* dst_ar64, - int dst_stride_ar64, - const uint8_t* shuffler, - int width, - int height) { - int y; - void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64, - const uint8_t* shuffler, int width) = AR64ShuffleRow_C; - if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; - src_stride_ar64 = -src_stride_ar64; - } - // Coalesce rows. - if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) { - width *= height; - height = 1; - src_stride_ar64 = dst_stride_ar64 = 0; - } - // Assembly versions can be reused if it's implemented with shuffle. -#if defined(HAS_ARGBSHUFFLEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - AR64ShuffleRow = ARGBShuffleRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - AR64ShuffleRow = ARGBShuffleRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - AR64ShuffleRow = ARGBShuffleRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - AR64ShuffleRow = ARGBShuffleRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { - AR64ShuffleRow = ARGBShuffleRow_NEON; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - AR64ShuffleRow = ARGBShuffleRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - AR64ShuffleRow = ARGBShuffleRow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - AR64ShuffleRow((uint8_t*)(src_ar64), (uint8_t*)(dst_ar64), shuffler, - width * 2); - src_ar64 += src_stride_ar64; - dst_ar64 += dst_stride_ar64; - } - return 0; -} - -// Gauss blur a float plane using Gaussian 5x5 filter with -// coefficients of 1, 4, 6, 4, 1. -// Each destination pixel is a blur of the 5x5 -// pixels from the source. -// Source edges are clamped. -// Edge is 2 pixels on each side, and interior is multiple of 4. -LIBYUV_API -int GaussPlane_F32(const float* src, - int src_stride, - float* dst, - int dst_stride, - int width, - int height) { - int y; - void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2, - const float* src3, const float* src4, float* dst, - int width) = GaussCol_F32_C; - void (*GaussRow_F32)(const float* src, float* dst, int width) = - GaussRow_F32_C; - if (!src || !dst || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src = src + (height - 1) * src_stride; - src_stride = -src_stride; - } - -#if defined(HAS_GAUSSCOL_F32_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - GaussCol_F32 = GaussCol_F32_NEON; - } -#endif -#if defined(HAS_GAUSSROW_F32_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - GaussRow_F32 = GaussRow_F32_NEON; - } -#endif - { - // 2 pixels on each side, but aligned out to 16 bytes. - align_buffer_64(rowbuf, (4 + width + 4) * 4); - memset(rowbuf, 0, 16); - memset(rowbuf + (4 + width) * 4, 0, 16); - float* row = (float*)(rowbuf + 16); - const float* src0 = src; - const float* src1 = src; - const float* src2 = src; - const float* src3 = src2 + ((height > 1) ? src_stride : 0); - const float* src4 = src3 + ((height > 2) ? 
src_stride : 0); - - for (y = 0; y < height; ++y) { - GaussCol_F32(src0, src1, src2, src3, src4, row, width); - - // Extrude edge by 2 floats - row[-2] = row[-1] = row[0]; - row[width + 1] = row[width] = row[width - 1]; - - GaussRow_F32(row - 2, dst, width); - - src0 = src1; - src1 = src2; - src2 = src3; - src3 = src4; - if ((y + 2) < (height - 1)) { - src4 += src_stride; - } - dst += dst_stride; - } - free_aligned_buffer_64(rowbuf); - } - return 0; -} - -// Sobel ARGB effect. -static int ARGBSobelize(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - void (*SobelRow)(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst, - int width)) { - int y; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) = - ARGBToYJRow_C; - void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, - uint8_t* dst_sobely, int width) = SobelYRow_C; - void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, - const uint8_t* src_y2, uint8_t* dst_sobely, int width) = - SobelXRow_C; - const int kEdge = 16; // Extra pixels at start of row for extrude/align. - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - -#if defined(HAS_ARGBTOYJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - } -#endif - -#if defined(HAS_SOBELYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SobelYRow = SobelYRow_SSE2; - } -#endif -#if defined(HAS_SOBELYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelYRow = SobelYRow_NEON; - } -#endif -#if defined(HAS_SOBELYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelYRow = SobelYRow_MMI; - } -#endif -#if defined(HAS_SOBELYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelYRow = SobelYRow_MSA; - } -#endif -#if defined(HAS_SOBELXROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SobelXRow = SobelXRow_SSE2; - } -#endif -#if defined(HAS_SOBELXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelXRow = SobelXRow_NEON; - } -#endif -#if defined(HAS_SOBELXROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelXRow = SobelXRow_MMI; - } -#endif -#if defined(HAS_SOBELXROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelXRow = SobelXRow_MSA; - } -#endif - { - // 3 rows with edges before/after. - const int kRowSize = (width + kEdge + 31) & ~31; - align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); - uint8_t* row_sobelx = rows; - uint8_t* row_sobely = rows + kRowSize; - uint8_t* row_y = rows + kRowSize * 2; - - // Convert first row. 
- uint8_t* row_y0 = row_y + kEdge; - uint8_t* row_y1 = row_y0 + kRowSize; - uint8_t* row_y2 = row_y1 + kRowSize; - ARGBToYJRow(src_argb, row_y0, width); - row_y0[-1] = row_y0[0]; - memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. - ARGBToYJRow(src_argb, row_y1, width); - row_y1[-1] = row_y1[0]; - memset(row_y1 + width, row_y1[width - 1], 16); - memset(row_y2 + width, 0, 16); - - for (y = 0; y < height; ++y) { - // Convert next row of ARGB to G. - if (y < (height - 1)) { - src_argb += src_stride_argb; - } - ARGBToYJRow(src_argb, row_y2, width); - row_y2[-1] = row_y2[0]; - row_y2[width] = row_y2[width - 1]; - - SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); - SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); - SobelRow(row_sobelx, row_sobely, dst_argb, width); - - // Cycle thru circular queue of 3 row_y buffers. - { - uint8_t* row_yt = row_y0; - row_y0 = row_y1; - row_y1 = row_y2; - row_y2 = row_yt; - } - - dst_argb += dst_stride_argb; - } - free_aligned_buffer_64(rows); - } - return 0; -} - -// Sobel ARGB effect. -LIBYUV_API -int ARGBSobel(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, - uint8_t* dst_argb, int width) = SobelRow_C; -#if defined(HAS_SOBELROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SobelRow = SobelRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SobelRow = SobelRow_SSE2; - } - } -#endif -#if defined(HAS_SOBELROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelRow = SobelRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - SobelRow = SobelRow_NEON; - } - } -#endif -#if defined(HAS_SOBELROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelRow = SobelRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SobelRow = SobelRow_MMI; - } - } -#endif -#if defined(HAS_SOBELROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelRow = SobelRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - SobelRow = SobelRow_MSA; - } - } -#endif - return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height, SobelRow); -} - -// Sobel ARGB effect with planar output. -LIBYUV_API -int ARGBSobelToPlane(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, - uint8_t* dst_, int width) = SobelToPlaneRow_C; -#if defined(HAS_SOBELTOPLANEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SobelToPlaneRow = SobelToPlaneRow_SSE2; - } - } -#endif -#if defined(HAS_SOBELTOPLANEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelToPlaneRow = SobelToPlaneRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SobelToPlaneRow = SobelToPlaneRow_NEON; - } - } -#endif -#if defined(HAS_SOBELTOPLANEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelToPlaneRow = SobelToPlaneRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SobelToPlaneRow = SobelToPlaneRow_MMI; - } - } -#endif -#if defined(HAS_SOBELTOPLANEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelToPlaneRow = SobelToPlaneRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SobelToPlaneRow = SobelToPlaneRow_MSA; - } - } -#endif - return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, - height, SobelToPlaneRow); -} - -// SobelXY ARGB effect. -// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. 
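Before the combined XY variant below, note that the planar variant above is a convenient way to get a single-channel edge-magnitude map out of an ARGB frame (editorial sketch; the destination plane and its stride are caller-provided):

    // Hypothetical edge-map extraction into a tightly packed 8-bit plane.
    ARGBSobelToPlane(src_argb, src_stride_argb, edges, width /* dst_stride_y */,
                     width, height);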
-LIBYUV_API -int ARGBSobelXY(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, - uint8_t* dst_argb, int width) = SobelXYRow_C; -#if defined(HAS_SOBELXYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SobelXYRow = SobelXYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SobelXYRow = SobelXYRow_SSE2; - } - } -#endif -#if defined(HAS_SOBELXYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelXYRow = SobelXYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - SobelXYRow = SobelXYRow_NEON; - } - } -#endif -#if defined(HAS_SOBELXYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelXYRow = SobelXYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SobelXYRow = SobelXYRow_MMI; - } - } -#endif -#if defined(HAS_SOBELXYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelXYRow = SobelXYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - SobelXYRow = SobelXYRow_MSA; - } - } -#endif - return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height, SobelXYRow); -} - -// Apply a 4x4 polynomial to each ARGB pixel. -LIBYUV_API -int ARGBPolynomial(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - const float* poly, - int width, - int height) { - int y; - void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb, - const float* poly, int width) = ARGBPolynomialRow_C; - if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { - ARGBPolynomialRow = ARGBPolynomialRow_SSE2; - } -#endif -#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && - IS_ALIGNED(width, 2)) { - ARGBPolynomialRow = ARGBPolynomialRow_AVX2; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBPolynomialRow(src_argb, dst_argb, poly, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert plane of 16 bit shorts to half floats. -// Source values are multiplied by scale before storing as half float. -LIBYUV_API -int HalfFloatPlane(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - float scale, - int width, - int height) { - int y; - void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, - int width) = HalfFloatRow_C; - if (!src_y || !dst_y || width <= 0 || height == 0) { - return -1; - } - src_stride_y >>= 1; - dst_stride_y >>= 1; - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - // Coalesce rows. 
- if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } -#if defined(HAS_HALFFLOATROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - HalfFloatRow = HalfFloatRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - HalfFloatRow = HalfFloatRow_SSE2; - } - } -#endif -#if defined(HAS_HALFFLOATROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - HalfFloatRow = HalfFloatRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - HalfFloatRow = HalfFloatRow_AVX2; - } - } -#endif -#if defined(HAS_HALFFLOATROW_F16C) - if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { - HalfFloatRow = - (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; - if (IS_ALIGNED(width, 16)) { - HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C; - } - } -#endif -#if defined(HAS_HALFFLOATROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - HalfFloatRow = - (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; - } - } -#endif -#if defined(HAS_HALFFLOATROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - HalfFloatRow = HalfFloatRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - HalfFloatRow = HalfFloatRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - HalfFloatRow(src_y, dst_y, scale, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } - return 0; -} - -// Convert a buffer of bytes to floats, scale the values and store as floats. -LIBYUV_API -int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) { - void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale, - int width) = ByteToFloatRow_C; - if (!src_y || !dst_y || width <= 0) { - return -1; - } -#if defined(HAS_BYTETOFLOATROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ByteToFloatRow = ByteToFloatRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ByteToFloatRow = ByteToFloatRow_NEON; - } - } -#endif - - ByteToFloatRow(src_y, dst_y, scale, width); - return 0; -} - -// Apply a lumacolortable to each ARGB pixel. -LIBYUV_API -int ARGBLumaColorTable(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - const uint8_t* luma, - int width, - int height) { - int y; - void (*ARGBLumaColorTableRow)( - const uint8_t* src_argb, uint8_t* dst_argb, int width, - const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; - if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { - ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Copy Alpha from one ARGB image to another. 
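One note on HalfFloatPlane above before the alpha helpers that follow: the strides are byte strides (the function halves them internally), and a common use is normalizing 10-bit samples to 0..1 half floats; the 1.0f / 1023.0f scale is the editor's assumption for that case, not something stated in this file:

    // Hypothetical: normalize 10-bit samples held in uint16_t to 0..1 half floats.
    HalfFloatPlane(src_y10, src_stride_bytes, dst_half, dst_stride_bytes,
                   1.0f / 1023.0f, width, height);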
-LIBYUV_API -int ARGBCopyAlpha(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBCopyAlphaRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBCOPYALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBCOPYALPHAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBCOPYALPHAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBCopyAlphaRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Extract just the alpha channel from ARGB. -LIBYUV_API -int ARGBExtractAlpha(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_a, - int dst_stride_a, - int width, - int height) { - if (!src_argb || !dst_a || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb += (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_a == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_a = 0; - } - void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, - int width) = ARGBExtractAlphaRow_C; -#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 - : ARGBExtractAlphaRow_Any_SSE2; - } -#endif -#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 - : ARGBExtractAlphaRow_Any_AVX2; - } -#endif -#if defined(HAS_ARGBEXTRACTALPHAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON - : ARGBExtractAlphaRow_Any_NEON; - } -#endif -#if defined(HAS_ARGBEXTRACTALPHAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI - : ARGBExtractAlphaRow_Any_MMI; - } -#endif -#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA - : ARGBExtractAlphaRow_Any_MSA; - } -#endif - - for (int y = 0; y < height; ++y) { - ARGBExtractAlphaRow(src_argb, dst_a, width); - src_argb += src_stride_argb; - dst_a += dst_stride_a; - } - return 0; -} - -// Copy a planar Y channel to the alpha channel of a destination ARGB image. 
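ARGBExtractAlpha above and ARGBCopyYToAlpha below are natural counterparts: the first pulls the A channel out into a planar buffer, the second writes a planar buffer back into A. A minimal round-trip sketch (buffer names invented, planes tightly packed):

    // Hypothetical: stash alpha, process the colour channels elsewhere, then restore it.
    ARGBExtractAlpha(argb, argb_stride, alpha, width /* dst_stride_a */, width, height);
    ARGBCopyYToAlpha(alpha, width /* src_stride_y */, argb, argb_stride, width, height);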
-LIBYUV_API -int ARGBCopyYToAlpha(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb, - int width) = ARGBCopyYToAlphaRow_C; - if (!src_y || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = dst_stride_argb = 0; - } -#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBCopyYToAlphaRow(src_y, dst_argb, width); - src_y += src_stride_y; - dst_argb += dst_stride_argb; - } - return 0; -} - -// TODO(fbarchard): Consider if width is even Y channel can be split -// directly. A SplitUVRow_Odd function could copy the remaining chroma. - -LIBYUV_API -int YUY2ToNV12(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, - int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; - src_stride_yuy2 = -src_stride_yuy2; - } -#if defined(HAS_SPLITUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_AVX2; - } - } -#endif -#if defined(HAS_SPLITUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_NEON; - } - } -#endif -#if defined(HAS_SPLITUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitUVRow = SplitUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_MMI; - } - } -#endif -#if defined(HAS_SPLITUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - - { - int awidth = halfwidth * 2; - // row of y and 2 rows of uv - align_buffer_64(rows, awidth * 3); - - for (y = 0; y < height - 1; y += 2) { - // Split Y from UV. - SplitUVRow(src_yuy2, rows, rows + awidth, awidth); - memcpy(dst_y, rows, width); - SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth); - memcpy(dst_y + dst_stride_y, rows, width); - InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); - src_yuy2 += src_stride_yuy2 * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - // Split Y from UV. - SplitUVRow(src_yuy2, rows, dst_uv, awidth); - memcpy(dst_y, rows, width); - } - free_aligned_buffer_64(rows); - } - return 0; -} - -LIBYUV_API -int UYVYToNV12(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, - int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; - src_stride_uyvy = -src_stride_uyvy; - } -#if defined(HAS_SPLITUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_AVX2; - } - } -#endif -#if defined(HAS_SPLITUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_NEON; - } - } -#endif -#if defined(HAS_SPLITUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitUVRow = SplitUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_MMI; - } - } -#endif -#if defined(HAS_SPLITUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - - { - int awidth = halfwidth * 2; - // row of y and 2 rows of uv - align_buffer_64(rows, awidth * 3); - - for (y = 0; y < height - 1; y += 2) { - // Split Y from UV. - SplitUVRow(src_uyvy, rows + awidth, rows, awidth); - memcpy(dst_y, rows, width); - SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth); - memcpy(dst_y + dst_stride_y, rows, width); - InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); - src_uyvy += src_stride_uyvy * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - // Split Y from UV. - SplitUVRow(src_uyvy, dst_uv, rows, awidth); - memcpy(dst_y, rows, width); - } - free_aligned_buffer_64(rows); - } - return 0; -} - -// width and height are src size allowing odd size handling. -LIBYUV_API -void HalfMergeUVPlane(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u, - const uint8_t* src_v, int src_stride_v, - uint8_t* dst_uv, int width) = HalfMergeUVRow_C; - - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } -#if defined(HAS_HALFMERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - HalfMergeUVRow = HalfMergeUVRow_NEON; - } -#endif -#if defined(HAS_HALFMERGEUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - HalfMergeUVRow = HalfMergeUVRow_SSSE3; - } -#endif -#if defined(HAS_HALFMERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { - HalfMergeUVRow = HalfMergeUVRow_AVX2; - } -#endif - for (y = 0; y < height - 1; y += 2) { - // Merge a row of U and V into a row of UV. - HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width); - src_u += src_stride_u * 2; - src_v += src_stride_v * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width); - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate.cc b/thirdparty/libyuv/source/rotate.cc deleted file mode 100644 index 32904e4..0000000 --- a/thirdparty/libyuv/source/rotate.cc +++ /dev/null @@ -1,609 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate.h" - -#include "libyuv/convert.h" -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -LIBYUV_API -void TransposePlane(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - int i = height; -#if defined(HAS_TRANSPOSEWX16_MSA) - void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, - int dst_stride, int width) = TransposeWx16_C; -#else - void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst, - int dst_stride, int width) = TransposeWx8_C; -#endif - -#if defined(HAS_TRANSPOSEWX16_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - TransposeWx16 = TransposeWx16_Any_MSA; - if (IS_ALIGNED(width, 16)) { - TransposeWx16 = TransposeWx16_MSA; - } - } -#else -#if defined(HAS_TRANSPOSEWX8_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - TransposeWx8 = TransposeWx8_NEON; - } -#endif -#if defined(HAS_TRANSPOSEWX8_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - TransposeWx8 = TransposeWx8_SSSE3; - } - } -#endif -#if defined(HAS_TRANSPOSEWX8_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - TransposeWx8 = TransposeWx8_MMI; - } -#endif -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - TransposeWx8 = TransposeWx8_Fast_SSSE3; - } - } -#endif -#endif /* defined(HAS_TRANSPOSEWX16_MSA) */ - -#if defined(HAS_TRANSPOSEWX16_MSA) - // Work across the source in 16x16 tiles - while (i >= 16) { - TransposeWx16(src, src_stride, dst, dst_stride, width); - src += 16 * src_stride; // Go down 16 rows. - dst += 16; // Move over 16 columns. 
- i -= 16; - } -#else - // Work across the source in 8x8 tiles - while (i >= 8) { - TransposeWx8(src, src_stride, dst, dst_stride, width); - src += 8 * src_stride; // Go down 8 rows. - dst += 8; // Move over 8 columns. - i -= 8; - } -#endif - - if (i > 0) { - TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); - } -} - -LIBYUV_API -void RotatePlane90(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - // Rotate by 90 is a transpose with the source read - // from bottom to top. So set the source pointer to the end - // of the buffer and flip the sign of the source stride. - src += src_stride * (height - 1); - src_stride = -src_stride; - TransposePlane(src, src_stride, dst, dst_stride, width, height); -} - -LIBYUV_API -void RotatePlane270(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - // Rotate by 270 is a transpose with the destination written - // from bottom to top. So set the destination pointer to the end - // of the buffer and flip the sign of the destination stride. - dst += dst_stride * (width - 1); - dst_stride = -dst_stride; - TransposePlane(src, src_stride, dst, dst_stride, width, height); -} - -LIBYUV_API -void RotatePlane180(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - // Swap first and last row and mirror the content. Uses a temporary row. - align_buffer_64(row, width); - const uint8_t* src_bot = src + src_stride * (height - 1); - uint8_t* dst_bot = dst + dst_stride * (height - 1); - int half_height = (height + 1) >> 1; - int y; - void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; - void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; -#if defined(HAS_MIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MirrorRow = MirrorRow_Any_NEON; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_NEON; - } - } -#endif -#if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; - } - } -#endif -#if defined(HAS_MIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MirrorRow = MirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_AVX2; - } - } -#endif -#if defined(HAS_MIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MirrorRow = MirrorRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - MirrorRow = MirrorRow_MMI; - } - } -#endif -#if defined(HAS_MIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MirrorRow = MirrorRow_Any_MSA; - if (IS_ALIGNED(width, 64)) { - MirrorRow = MirrorRow_MSA; - } - } -#endif -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; - } -#endif -#if defined(HAS_COPYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI; - } -#endif - - // Odd height will harmlessly mirror the middle row twice. 
- for (y = 0; y < half_height; ++y) { - CopyRow(src, row, width); // Copy first row into buffer - MirrorRow(src_bot, dst, width); // Mirror last row into first row - MirrorRow(row, dst_bot, width); // Mirror buffer into last row - src += src_stride; - dst += dst_stride; - src_bot -= src_stride; - dst_bot -= dst_stride; - } - free_aligned_buffer_64(row); -} - -LIBYUV_API -void TransposeUV(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int i = height; -#if defined(HAS_TRANSPOSEUVWX16_MSA) - void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a, - int dst_stride_a, uint8_t* dst_b, int dst_stride_b, - int width) = TransposeUVWx16_C; -#else - void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a, - int dst_stride_a, uint8_t* dst_b, int dst_stride_b, - int width) = TransposeUVWx8_C; -#endif - -#if defined(HAS_TRANSPOSEUVWX16_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - TransposeUVWx16 = TransposeUVWx16_Any_MSA; - if (IS_ALIGNED(width, 8)) { - TransposeUVWx16 = TransposeUVWx16_MSA; - } - } -#else -#if defined(HAS_TRANSPOSEUVWX8_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - TransposeUVWx8 = TransposeUVWx8_NEON; - } -#endif -#if defined(HAS_TRANSPOSEUVWX8_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - TransposeUVWx8 = TransposeUVWx8_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - TransposeUVWx8 = TransposeUVWx8_SSE2; - } - } -#endif -#if defined(HAS_TRANSPOSEUVWX8_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - TransposeUVWx8 = TransposeUVWx8_Any_MMI; - if (IS_ALIGNED(width, 4)) { - TransposeUVWx8 = TransposeUVWx8_MMI; - } - } -#endif -#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */ - -#if defined(HAS_TRANSPOSEUVWX16_MSA) - // Work through the source in 8x8 tiles. - while (i >= 16) { - TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, - width); - src += 16 * src_stride; // Go down 16 rows. - dst_a += 16; // Move over 8 columns. - dst_b += 16; // Move over 8 columns. - i -= 16; - } -#else - // Work through the source in 8x8 tiles. - while (i >= 8) { - TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, - width); - src += 8 * src_stride; // Go down 8 rows. - dst_a += 8; // Move over 8 columns. - dst_b += 8; // Move over 8 columns. - i -= 8; - } -#endif - - if (i > 0) { - TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, - width, i); - } -} - -LIBYUV_API -void RotateUV90(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - src += src_stride * (height - 1); - src_stride = -src_stride; - - TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, - height); -} - -LIBYUV_API -void RotateUV270(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - dst_a += dst_stride_a * (width - 1); - dst_b += dst_stride_b * (width - 1); - dst_stride_a = -dst_stride_a; - dst_stride_b = -dst_stride_b; - - TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, - height); -} - -// Rotate 180 is a horizontal and vertical flip. 
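Before the 180-degree UV path below, one practical point about the 90/270 rotations in this file: because they are implemented as transposes, the destination plane has swapped dimensions, so a width x height source needs a height x width destination (dst_stride of at least height). A hedged sketch using the RotatePlane dispatcher defined later in this file:

    // Hypothetical 90-degree rotation of a single grayscale plane.
    RotatePlane(src, src_stride, dst, height /* dst_stride */, width, height, kRotate90);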
-LIBYUV_API -void RotateUV180(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int i; - void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, - int width) = MirrorSplitUVRow_C; -#if defined(HAS_MIRRORSPLITUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - MirrorSplitUVRow = MirrorSplitUVRow_NEON; - } -#endif -#if defined(HAS_MIRRORSPLITUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - MirrorSplitUVRow = MirrorSplitUVRow_SSSE3; - } -#endif -#if defined(HAS_MIRRORSPLITUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) { - MirrorSplitUVRow = MirrorSplitUVRow_MMI; - } -#endif -#if defined(HAS_MIRRORSPLITUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { - MirrorSplitUVRow = MirrorSplitUVRow_MSA; - } -#endif - - dst_a += dst_stride_a * (height - 1); - dst_b += dst_stride_b * (height - 1); - - for (i = 0; i < height; ++i) { - MirrorSplitUVRow(src, dst_a, dst_b, width); - src += src_stride; - dst_a -= dst_stride_a; - dst_b -= dst_stride_b; - } -} - -LIBYUV_API -int RotatePlane(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height, - enum RotationMode mode) { - if (!src || width <= 0 || height == 0 || !dst) { - return -1; - } - - // Negative height means invert the image. - if (height < 0) { - height = -height; - src = src + (height - 1) * src_stride; - src_stride = -src_stride; - } - - switch (mode) { - case kRotate0: - // copy frame - CopyPlane(src, src_stride, dst, dst_stride, width, height); - return 0; - case kRotate90: - RotatePlane90(src, src_stride, dst, dst_stride, width, height); - return 0; - case kRotate270: - RotatePlane270(src, src_stride, dst, dst_stride, width, height); - return 0; - case kRotate180: - RotatePlane180(src, src_stride, dst, dst_stride, width, height); - return 0; - default: - break; - } - return -1; -} - -LIBYUV_API -int I420Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || - !dst_u || !dst_v) { - return -1; - } - - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - switch (mode) { - case kRotate0: - // copy frame - return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height); - case kRotate90: - RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - case kRotate270: - RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - case kRotate180: - RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - default: - break; - } - return -1; -} - -LIBYUV_API -int I444Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum libyuv::RotationMode mode) { - if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || - !dst_u || !dst_v) { - return -1; - } - - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - switch (mode) { - case libyuv::kRotate0: - // copy frame - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); - return 0; - case libyuv::kRotate90: - RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height); - RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height); - return 0; - case libyuv::kRotate270: - RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height); - RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height); - return 0; - case libyuv::kRotate180: - RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height); - RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height); - return 0; - default: - break; - } - return -1; -} - -LIBYUV_API -int NV12ToI420Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u || - !dst_v) { - return -1; - } - - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_uv = src_uv + (halfheight - 1) * src_stride_uv; - src_stride_y = -src_stride_y; - src_stride_uv = -src_stride_uv; - } - - switch (mode) { - case kRotate0: - // copy frame - return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, - dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, - width, height); - case kRotate90: - RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, - dst_stride_v, halfwidth, halfheight); - return 0; - case kRotate270: - RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, - dst_stride_v, halfwidth, halfheight); - return 0; - case kRotate180: - RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, - dst_stride_v, halfwidth, halfheight); - return 0; - default: - break; - } - return -1; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_any.cc b/thirdparty/libyuv/source/rotate_any.cc deleted file mode 100644 index b3baf08..0000000 --- a/thirdparty/libyuv/source/rotate_any.cc +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate.h" -#include "libyuv/rotate_row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define TANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ - int dst_stride, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ - } \ - TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ - } - -#ifdef HAS_TRANSPOSEWX8_NEON -TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) -#endif -#ifdef HAS_TRANSPOSEWX8_SSSE3 -TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) -#endif -#ifdef HAS_TRANSPOSEWX8_MMI -TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7) -#endif -#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 -TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) -#endif -#ifdef HAS_TRANSPOSEWX16_MSA -TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) -#endif -#undef TANY - -#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ - int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ - int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ - } \ - TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ - dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ - } - -#ifdef HAS_TRANSPOSEUVWX8_NEON -TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) -#endif -#ifdef HAS_TRANSPOSEUVWX8_SSE2 -TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) -#endif -#ifdef HAS_TRANSPOSEUVWX8_MMI -TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7) -#endif -#ifdef HAS_TRANSPOSEUVWX16_MSA -TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) -#endif -#undef TUVANY - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_argb.cc b/thirdparty/libyuv/source/rotate_argb.cc deleted file mode 100644 index ae65388..0000000 --- a/thirdparty/libyuv/source/rotate_argb.cc +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate.h" - -#include "libyuv/convert.h" -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" -#include "libyuv/row.h" -#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -static int ARGBTranspose(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int i; - int src_pixel_step = src_stride_argb >> 2; - void (*ScaleARGBRowDownEven)( - const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step, - uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C; - // Check stride is a multiple of 4. 
- if (src_stride_argb & 3) { - return -1; - } -#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; - if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON; - if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI; - if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA; - if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA; - } - } -#endif - - for (i = 0; i < width; ++i) { // column of source to row of dest. - ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height); - dst_argb += dst_stride_argb; - src_argb += 4; - } - return 0; -} - -static int ARGBRotate90(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - // Rotate by 90 is a ARGBTranspose with the source read - // from bottom to top. So set the source pointer to the end - // of the buffer and flip the sign of the source stride. - src_argb += src_stride_argb * (height - 1); - src_stride_argb = -src_stride_argb; - return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); -} - -static int ARGBRotate270(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - // Rotate by 270 is a ARGBTranspose with the destination written - // from bottom to top. So set the destination pointer to the end - // of the buffer and flip the sign of the destination stride. - dst_argb += dst_stride_argb * (width - 1); - dst_stride_argb = -dst_stride_argb; - return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); -} - -static int ARGBRotate180(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - // Swap first and last row and mirror the content. Uses a temporary row. 
- align_buffer_64(row, width * 4); - const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1); - uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1); - int half_height = (height + 1) >> 1; - int y; - void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = - ARGBMirrorRow_C; - void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = - CopyRow_C; -#if defined(HAS_ARGBMIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBMirrorRow = ARGBMirrorRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBMirrorRow = ARGBMirrorRow_NEON; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBMirrorRow = ARGBMirrorRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBMirrorRow = ARGBMirrorRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBMirrorRow = ARGBMirrorRow_MMI; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBMirrorRow = ARGBMirrorRow_MSA; - } - } -#endif -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; - } -#endif - - // Odd height will harmlessly mirror the middle row twice. - for (y = 0; y < half_height; ++y) { - ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer - ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row - CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - src_bot -= src_stride_argb; - dst_bot -= dst_stride_argb; - } - free_aligned_buffer_64(row); - return 0; -} - -LIBYUV_API -int ARGBRotate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - enum RotationMode mode) { - if (!src_argb || width <= 0 || height == 0 || !dst_argb) { - return -1; - } - - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - - switch (mode) { - case kRotate0: - // copy frame - return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); - case kRotate90: - return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); - case kRotate270: - return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); - case kRotate180: - return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); - default: - break; - } - return -1; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_common.cc b/thirdparty/libyuv/source/rotate_common.cc deleted file mode 100644 index ff212ad..0000000 --- a/thirdparty/libyuv/source/rotate_common.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -void TransposeWx8_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst[0] = src[0 * src_stride]; - dst[1] = src[1 * src_stride]; - dst[2] = src[2 * src_stride]; - dst[3] = src[3 * src_stride]; - dst[4] = src[4 * src_stride]; - dst[5] = src[5 * src_stride]; - dst[6] = src[6 * src_stride]; - dst[7] = src[7 * src_stride]; - ++src; - dst += dst_stride; - } -} - -void TransposeUVWx8_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst_a[0] = src[0 * src_stride + 0]; - dst_b[0] = src[0 * src_stride + 1]; - dst_a[1] = src[1 * src_stride + 0]; - dst_b[1] = src[1 * src_stride + 1]; - dst_a[2] = src[2 * src_stride + 0]; - dst_b[2] = src[2 * src_stride + 1]; - dst_a[3] = src[3 * src_stride + 0]; - dst_b[3] = src[3 * src_stride + 1]; - dst_a[4] = src[4 * src_stride + 0]; - dst_b[4] = src[4 * src_stride + 1]; - dst_a[5] = src[5 * src_stride + 0]; - dst_b[5] = src[5 * src_stride + 1]; - dst_a[6] = src[6 * src_stride + 0]; - dst_b[6] = src[6 * src_stride + 1]; - dst_a[7] = src[7 * src_stride + 0]; - dst_b[7] = src[7 * src_stride + 1]; - src += 2; - dst_a += dst_stride_a; - dst_b += dst_stride_b; - } -} - -void TransposeWxH_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - int i; - for (i = 0; i < width; ++i) { - int j; - for (j = 0; j < height; ++j) { - dst[i * dst_stride + j] = src[j * src_stride + i]; - } - } -} - -void TransposeUVWxH_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int i; - for (i = 0; i < width * 2; i += 2) { - int j; - for (j = 0; j < height; ++j) { - dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; - dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; - } - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv 
-#endif diff --git a/thirdparty/libyuv/source/rotate_gcc.cc b/thirdparty/libyuv/source/rotate_gcc.cc deleted file mode 100644 index 8401d4f..0000000 --- a/thirdparty/libyuv/source/rotate_gcc.cc +++ /dev/null @@ -1,374 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) - -// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. -#if defined(HAS_TRANSPOSEWX8_SSSE3) -void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // defined(HAS_TRANSPOSEWX8_SSSE3) - -// Transpose 16x8. 
64 bit -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) -void TransposeWx8_Fast_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", - "xmm15"); -} -#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) - -// Transpose UV 8x8. 64 bit. -#if defined(HAS_TRANSPOSEUVWX8_SSE2) -void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. 
- "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(src_stride)), // %4 - "r"((intptr_t)(dst_stride_a)), // %5 - "r"((intptr_t)(dst_stride_b)) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9"); -} -#endif // defined(HAS_TRANSPOSEUVWX8_SSE2) -#endif // defined(__x86_64__) || defined(__i386__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_mmi.cc b/thirdparty/libyuv/source/rotate_mmi.cc deleted file mode 100644 index f8de608..0000000 --- a/thirdparty/libyuv/source/rotate_mmi.cc +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. 
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -void TransposeWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (00 10 01 11 02 12 03 13) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (04 14 05 15 06 16 07 17) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (20 30 21 31 22 32 23 33) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (24 34 25 35 26 36 27 37) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (00 10 20 30 01 11 21 31) */ - "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (02 12 22 32 03 13 23 33) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (04 14 24 34 05 15 25 35) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (06 16 26 36 07 17 27 37) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (40 50 41 51 42 52 43 53) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (44 54 45 55 46 56 47 57) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (60 70 61 71 62 72 63 73) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (64 74 65 75 66 76 67 77) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (40 50 60 70 41 51 61 71) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (42 52 62 72 43 53 63 73) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (44 54 64 74 45 55 65 75) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (46 56 66 76 47 57 67 77) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (00 10 20 30 40 50 60 70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (01 11 21 31 41 51 61 71) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (02 12 22 32 42 52 62 72) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (03 13 23 33 43 53 63 73) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (04 14 24 34 44 54 64 74) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (05 15 25 35 45 55 65 75) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd 
%[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (06 16 26 36 46 56 66 76) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (07 17 27 37 47 57 67 77) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "daddi %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst), - [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride), - [dst_stride] "r"(dst_stride) - : "memory"); -} - -void TransposeUVWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */ - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */ - "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u60 u70 
v60 v70 u61 u71 v61 v71) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "daddiu %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a), - [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a), - [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride) - : "memory"); -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_msa.cc b/thirdparty/libyuv/source/rotate_msa.cc deleted file mode 100644 index 99bdca6..0000000 --- a/thirdparty/libyuv/source/rotate_msa.cc +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "libyuv/macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ - out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ - out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ - out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ - } - -#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ - out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ - out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ - out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ - } - -#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ - out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ - out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ - out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ - } - -#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ - out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ - out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ - out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ - } - -void TransposeWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - TransposeWx8_C(src, src_stride, dst, dst_stride, width); - TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, - width); -} - -void TransposeUVWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, - width); - TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), - dst_stride_a, (dst_b + 8), dst_stride_b, width); -} - -void TransposeWx16_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - int x; - const uint8_t* s; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; - v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; - - for (x = 0; x < width; x += 16) { - s = src; - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, 
res3); - ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); - ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - dst += dst_stride * 4; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); - ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - dst += dst_stride * 4; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); - ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - dst += dst_stride * 4; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); - ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - src += 16; - dst += dst_stride * 4; - } -} - -void TransposeUVWx16_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - int x; - const uint8_t* s; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; - v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; - - for (x = 0; x < width; x += 8) { - s = src; - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); - ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += 
src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); - ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); - ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); - ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); - ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - src += 16; - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/thirdparty/libyuv/source/rotate_neon.cc b/thirdparty/libyuv/source/rotate_neon.cc deleted file mode 100644 index 844df2b..0000000 --- a/thirdparty/libyuv/source/rotate_neon.cc +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15}; - -void TransposeWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - const uint8_t* src_temp; - asm volatile( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %5, #8 \n" - - // handle 8x8 blocks. 
this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" - - "vld1.8 {d0}, [%0], %2 \n" - "vld1.8 {d1}, [%0], %2 \n" - "vld1.8 {d2}, [%0], %2 \n" - "vld1.8 {d3}, [%0], %2 \n" - "vld1.8 {d4}, [%0], %2 \n" - "vld1.8 {d5}, [%0], %2 \n" - "vld1.8 {d6}, [%0], %2 \n" - "vld1.8 {d7}, [%0] \n" - - "vtrn.8 d1, d0 \n" - "vtrn.8 d3, d2 \n" - "vtrn.8 d5, d4 \n" - "vtrn.8 d7, d6 \n" - - "vtrn.16 d1, d3 \n" - "vtrn.16 d0, d2 \n" - "vtrn.16 d5, d7 \n" - "vtrn.16 d4, d6 \n" - - "vtrn.32 d1, d5 \n" - "vtrn.32 d0, d4 \n" - "vtrn.32 d3, d7 \n" - "vtrn.32 d2, d6 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - - "mov %0, %3 \n" - - "vst1.8 {d1}, [%0], %4 \n" - "vst1.8 {d0}, [%0], %4 \n" - "vst1.8 {d3}, [%0], %4 \n" - "vst1.8 {d2}, [%0], %4 \n" - "vst1.8 {d5}, [%0], %4 \n" - "vst1.8 {d4}, [%0], %4 \n" - "vst1.8 {d7}, [%0], %4 \n" - "vst1.8 {d6}, [%0] \n" - - "add %1, #8 \n" // src += 8 - "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride - "subs %5, #8 \n" // w -= 8 - "bge 1b \n" - - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %5, #8 \n" - "beq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %5, #2 \n" - "blt 3f \n" - - "cmp %5, #4 \n" - "blt 2f \n" - - // 4x8 block - "mov %0, %1 \n" - "vld1.32 {d0[0]}, [%0], %2 \n" - "vld1.32 {d0[1]}, [%0], %2 \n" - "vld1.32 {d1[0]}, [%0], %2 \n" - "vld1.32 {d1[1]}, [%0], %2 \n" - "vld1.32 {d2[0]}, [%0], %2 \n" - "vld1.32 {d2[1]}, [%0], %2 \n" - "vld1.32 {d3[0]}, [%0], %2 \n" - "vld1.32 {d3[1]}, [%0] \n" - - "mov %0, %3 \n" - - "vld1.8 {q3}, [%6] \n" - - "vtbl.8 d4, {d0, d1}, d6 \n" - "vtbl.8 d5, {d0, d1}, d7 \n" - "vtbl.8 d0, {d2, d3}, d6 \n" - "vtbl.8 d1, {d2, d3}, d7 \n" - - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. 
- "vst1.32 {d4[0]}, [%0], %4 \n" - "vst1.32 {d4[1]}, [%0], %4 \n" - "vst1.32 {d5[0]}, [%0], %4 \n" - "vst1.32 {d5[1]}, [%0] \n" - - "add %0, %3, #4 \n" - "vst1.32 {d0[0]}, [%0], %4 \n" - "vst1.32 {d0[1]}, [%0], %4 \n" - "vst1.32 {d1[0]}, [%0], %4 \n" - "vst1.32 {d1[1]}, [%0] \n" - - "add %1, #4 \n" // src += 4 - "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride - "subs %5, #4 \n" // w -= 4 - "beq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %5, #2 \n" - "blt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - "vld1.16 {d0[0]}, [%0], %2 \n" - "vld1.16 {d1[0]}, [%0], %2 \n" - "vld1.16 {d0[1]}, [%0], %2 \n" - "vld1.16 {d1[1]}, [%0], %2 \n" - "vld1.16 {d0[2]}, [%0], %2 \n" - "vld1.16 {d1[2]}, [%0], %2 \n" - "vld1.16 {d0[3]}, [%0], %2 \n" - "vld1.16 {d1[3]}, [%0] \n" - - "vtrn.8 d0, d1 \n" - - "mov %0, %3 \n" - - "vst1.64 {d0}, [%0], %4 \n" - "vst1.64 {d1}, [%0] \n" - - "add %1, #2 \n" // src += 2 - "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride - "subs %5, #2 \n" // w -= 2 - "beq 4f \n" - - // 1x8 block - "3: \n" - "vld1.8 {d0[0]}, [%1], %2 \n" - "vld1.8 {d0[1]}, [%1], %2 \n" - "vld1.8 {d0[2]}, [%1], %2 \n" - "vld1.8 {d0[3]}, [%1], %2 \n" - "vld1.8 {d0[4]}, [%1], %2 \n" - "vld1.8 {d0[5]}, [%1], %2 \n" - "vld1.8 {d0[6]}, [%1], %2 \n" - "vld1.8 {d0[7]}, [%1] \n" - - "vst1.64 {d0}, [%3] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst), // %3 - "+r"(dst_stride), // %4 - "+r"(width) // %5 - : "r"(&kVTbl4x4Transpose) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3"); -} - -static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, - 4, 12, 5, 13, 6, 14, 7, 15}; - -void TransposeUVWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - const uint8_t* src_temp; - asm volatile( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %7, #8 \n" - - // handle 8x8 blocks. 
this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" - - "vld2.8 {d0, d1}, [%0], %2 \n" - "vld2.8 {d2, d3}, [%0], %2 \n" - "vld2.8 {d4, d5}, [%0], %2 \n" - "vld2.8 {d6, d7}, [%0], %2 \n" - "vld2.8 {d16, d17}, [%0], %2 \n" - "vld2.8 {d18, d19}, [%0], %2 \n" - "vld2.8 {d20, d21}, [%0], %2 \n" - "vld2.8 {d22, d23}, [%0] \n" - - "vtrn.8 q1, q0 \n" - "vtrn.8 q3, q2 \n" - "vtrn.8 q9, q8 \n" - "vtrn.8 q11, q10 \n" - - "vtrn.16 q1, q3 \n" - "vtrn.16 q0, q2 \n" - "vtrn.16 q9, q11 \n" - "vtrn.16 q8, q10 \n" - - "vtrn.32 q1, q9 \n" - "vtrn.32 q0, q8 \n" - "vtrn.32 q3, q11 \n" - "vtrn.32 q2, q10 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - "vrev16.8 q8, q8 \n" - "vrev16.8 q9, q9 \n" - "vrev16.8 q10, q10 \n" - "vrev16.8 q11, q11 \n" - - "mov %0, %3 \n" - - "vst1.8 {d2}, [%0], %4 \n" - "vst1.8 {d0}, [%0], %4 \n" - "vst1.8 {d6}, [%0], %4 \n" - "vst1.8 {d4}, [%0], %4 \n" - "vst1.8 {d18}, [%0], %4 \n" - "vst1.8 {d16}, [%0], %4 \n" - "vst1.8 {d22}, [%0], %4 \n" - "vst1.8 {d20}, [%0] \n" - - "mov %0, %5 \n" - - "vst1.8 {d3}, [%0], %6 \n" - "vst1.8 {d1}, [%0], %6 \n" - "vst1.8 {d7}, [%0], %6 \n" - "vst1.8 {d5}, [%0], %6 \n" - "vst1.8 {d19}, [%0], %6 \n" - "vst1.8 {d17}, [%0], %6 \n" - "vst1.8 {d23}, [%0], %6 \n" - "vst1.8 {d21}, [%0] \n" - - "add %1, #8*2 \n" // src += 8*2 - "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * - // dst_stride_a - "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * - // dst_stride_b - "subs %7, #8 \n" // w -= 8 - "bge 1b \n" - - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %7, #8 \n" - "beq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %7, #2 \n" - "blt 3f \n" - - "cmp %7, #4 \n" - "blt 2f \n" - - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - "vld1.64 {d0}, [%0], %2 \n" - "vld1.64 {d1}, [%0], %2 \n" - "vld1.64 {d2}, [%0], %2 \n" - "vld1.64 {d3}, [%0], %2 \n" - "vld1.64 {d4}, [%0], %2 \n" - "vld1.64 {d5}, [%0], %2 \n" - "vld1.64 {d6}, [%0], %2 \n" - "vld1.64 {d7}, [%0] \n" - - "vld1.8 {q15}, [%8] \n" - - "vtrn.8 q0, q1 \n" - "vtrn.8 q2, q3 \n" - - "vtbl.8 d16, {d0, d1}, d30 \n" - "vtbl.8 d17, {d0, d1}, d31 \n" - "vtbl.8 d18, {d2, d3}, d30 \n" - "vtbl.8 d19, {d2, d3}, d31 \n" - "vtbl.8 d20, {d4, d5}, d30 \n" - "vtbl.8 d21, {d4, d5}, d31 \n" - "vtbl.8 d22, {d6, d7}, d30 \n" - "vtbl.8 d23, {d6, d7}, d31 \n" - - "mov %0, %3 \n" - - "vst1.32 {d16[0]}, [%0], %4 \n" - "vst1.32 {d16[1]}, [%0], %4 \n" - "vst1.32 {d17[0]}, [%0], %4 \n" - "vst1.32 {d17[1]}, [%0], %4 \n" - - "add %0, %3, #4 \n" - "vst1.32 {d20[0]}, [%0], %4 \n" - "vst1.32 {d20[1]}, [%0], %4 \n" - "vst1.32 {d21[0]}, [%0], %4 \n" - "vst1.32 {d21[1]}, [%0] \n" - - "mov %0, %5 \n" - - "vst1.32 {d18[0]}, [%0], %6 \n" - "vst1.32 {d18[1]}, [%0], %6 \n" - "vst1.32 {d19[0]}, [%0], %6 \n" - "vst1.32 {d19[1]}, [%0], %6 \n" - - "add %0, %5, #4 \n" - "vst1.32 {d22[0]}, [%0], %6 \n" - "vst1.32 {d22[1]}, [%0], %6 \n" - "vst1.32 {d23[0]}, [%0], %6 \n" - "vst1.32 {d23[1]}, [%0] \n" - - "add %1, #4*2 \n" // src += 4 * 2 - "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * - // dst_stride_a - "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * - // dst_stride_b - "subs %7, #4 \n" // w -= 4 - "beq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %7, #2 \n" - "blt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" - "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" - "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" - "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" - 
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n" - "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" - "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" - "vld2.16 {d1[3], d3[3]}, [%0] \n" - - "vtrn.8 d0, d1 \n" - "vtrn.8 d2, d3 \n" - - "mov %0, %3 \n" - - "vst1.64 {d0}, [%0], %4 \n" - "vst1.64 {d2}, [%0] \n" - - "mov %0, %5 \n" - - "vst1.64 {d1}, [%0], %6 \n" - "vst1.64 {d3}, [%0] \n" - - "add %1, #2*2 \n" // src += 2 * 2 - "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * - // dst_stride_a - "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * - // dst_stride_b - "subs %7, #2 \n" // w -= 2 - "beq 4f \n" - - // 1x8 block - "3: \n" - "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" - "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" - "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" - "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" - "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" - "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" - "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" - "vld2.8 {d0[7], d1[7]}, [%1] \n" - - "vst1.64 {d0}, [%3] \n" - "vst1.64 {d1}, [%5] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst_a), // %3 - "+r"(dst_stride_a), // %4 - "+r"(dst_b), // %5 - "+r"(dst_stride_b), // %6 - "+r"(width) // %7 - : "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); -} -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_neon64.cc b/thirdparty/libyuv/source/rotate_neon64.cc deleted file mode 100644 index 43c1581..0000000 --- a/thirdparty/libyuv/source/rotate_neon64.cc +++ /dev/null @@ -1,443 +0,0 @@ -/* - * Copyright 2014 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon armv8 64 bit. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15}; - -void TransposeWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - const uint8_t* src_temp; - asm volatile( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %w3, %w3, #8 \n" - - // handle 8x8 blocks. 
this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" - - "ld1 {v0.8b}, [%0], %5 \n" - "ld1 {v1.8b}, [%0], %5 \n" - "ld1 {v2.8b}, [%0], %5 \n" - "ld1 {v3.8b}, [%0], %5 \n" - "ld1 {v4.8b}, [%0], %5 \n" - "ld1 {v5.8b}, [%0], %5 \n" - "ld1 {v6.8b}, [%0], %5 \n" - "ld1 {v7.8b}, [%0] \n" - "mov %0, %1 \n" - - "trn2 v16.8b, v0.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "trn1 v17.8b, v0.8b, v1.8b \n" - "add %0, %0, %5 \n" - "trn2 v18.8b, v2.8b, v3.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 1 - "trn1 v19.8b, v2.8b, v3.8b \n" - "add %0, %0, %5 \n" - "trn2 v20.8b, v4.8b, v5.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 2 - "trn1 v21.8b, v4.8b, v5.8b \n" - "add %0, %0, %5 \n" - "trn2 v22.8b, v6.8b, v7.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 3 - "trn1 v23.8b, v6.8b, v7.8b \n" - "add %0, %0, %5 \n" - - "trn2 v3.4h, v17.4h, v19.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 4 - "trn1 v1.4h, v17.4h, v19.4h \n" - "add %0, %0, %5 \n" - "trn2 v2.4h, v16.4h, v18.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 5 - "trn1 v0.4h, v16.4h, v18.4h \n" - "add %0, %0, %5 \n" - "trn2 v7.4h, v21.4h, v23.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 6 - "trn1 v5.4h, v21.4h, v23.4h \n" - "add %0, %0, %5 \n" - "trn2 v6.4h, v20.4h, v22.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 7 - "trn1 v4.4h, v20.4h, v22.4h \n" - - "trn2 v21.2s, v1.2s, v5.2s \n" - "trn1 v17.2s, v1.2s, v5.2s \n" - "trn2 v20.2s, v0.2s, v4.2s \n" - "trn1 v16.2s, v0.2s, v4.2s \n" - "trn2 v23.2s, v3.2s, v7.2s \n" - "trn1 v19.2s, v3.2s, v7.2s \n" - "trn2 v22.2s, v2.2s, v6.2s \n" - "trn1 v18.2s, v2.2s, v6.2s \n" - - "mov %0, %2 \n" - - "st1 {v17.8b}, [%0], %6 \n" - "st1 {v16.8b}, [%0], %6 \n" - "st1 {v19.8b}, [%0], %6 \n" - "st1 {v18.8b}, [%0], %6 \n" - "st1 {v21.8b}, [%0], %6 \n" - "st1 {v20.8b}, [%0], %6 \n" - "st1 {v23.8b}, [%0], %6 \n" - "st1 {v22.8b}, [%0] \n" - - "add %1, %1, #8 \n" // src += 8 - "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride - "subs %w3, %w3, #8 \n" // w -= 8 - "b.ge 1b \n" - - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %w3, %w3, #8 \n" - "b.eq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %w3, #2 \n" - "b.lt 3f \n" - - "cmp %w3, #4 \n" - "b.lt 2f \n" - - // 4x8 block - "mov %0, %1 \n" - "ld1 {v0.s}[0], [%0], %5 \n" - "ld1 {v0.s}[1], [%0], %5 \n" - "ld1 {v0.s}[2], [%0], %5 \n" - "ld1 {v0.s}[3], [%0], %5 \n" - "ld1 {v1.s}[0], [%0], %5 \n" - "ld1 {v1.s}[1], [%0], %5 \n" - "ld1 {v1.s}[2], [%0], %5 \n" - "ld1 {v1.s}[3], [%0] \n" - - "mov %0, %2 \n" - - "ld1 {v2.16b}, [%4] \n" - - "tbl v3.16b, {v0.16b}, v2.16b \n" - "tbl v0.16b, {v1.16b}, v2.16b \n" - - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. 
- "st1 {v3.s}[0], [%0], %6 \n" - "st1 {v3.s}[1], [%0], %6 \n" - "st1 {v3.s}[2], [%0], %6 \n" - "st1 {v3.s}[3], [%0] \n" - - "add %0, %2, #4 \n" - "st1 {v0.s}[0], [%0], %6 \n" - "st1 {v0.s}[1], [%0], %6 \n" - "st1 {v0.s}[2], [%0], %6 \n" - "st1 {v0.s}[3], [%0] \n" - - "add %1, %1, #4 \n" // src += 4 - "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride - "subs %w3, %w3, #4 \n" // w -= 4 - "b.eq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %w3, #2 \n" - "b.lt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - "ld1 {v0.h}[0], [%0], %5 \n" - "ld1 {v1.h}[0], [%0], %5 \n" - "ld1 {v0.h}[1], [%0], %5 \n" - "ld1 {v1.h}[1], [%0], %5 \n" - "ld1 {v0.h}[2], [%0], %5 \n" - "ld1 {v1.h}[2], [%0], %5 \n" - "ld1 {v0.h}[3], [%0], %5 \n" - "ld1 {v1.h}[3], [%0] \n" - - "trn2 v2.8b, v0.8b, v1.8b \n" - "trn1 v3.8b, v0.8b, v1.8b \n" - - "mov %0, %2 \n" - - "st1 {v3.8b}, [%0], %6 \n" - "st1 {v2.8b}, [%0] \n" - - "add %1, %1, #2 \n" // src += 2 - "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride - "subs %w3, %w3, #2 \n" // w -= 2 - "b.eq 4f \n" - - // 1x8 block - "3: \n" - "ld1 {v0.b}[0], [%1], %5 \n" - "ld1 {v0.b}[1], [%1], %5 \n" - "ld1 {v0.b}[2], [%1], %5 \n" - "ld1 {v0.b}[3], [%1], %5 \n" - "ld1 {v0.b}[4], [%1], %5 \n" - "ld1 {v0.b}[5], [%1], %5 \n" - "ld1 {v0.b}[6], [%1], %5 \n" - "ld1 {v0.b}[7], [%1] \n" - - "st1 {v0.8b}, [%2] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride)) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23"); -} - -static const uint8_t kVTbl4x4TransposeDi[32] = { - 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, - 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; - -void TransposeUVWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - const uint8_t* src_temp; - asm volatile( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %w4, %w4, #8 \n" - - // handle 8x8 blocks. 
this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" - - "ld1 {v0.16b}, [%0], %5 \n" - "ld1 {v1.16b}, [%0], %5 \n" - "ld1 {v2.16b}, [%0], %5 \n" - "ld1 {v3.16b}, [%0], %5 \n" - "ld1 {v4.16b}, [%0], %5 \n" - "ld1 {v5.16b}, [%0], %5 \n" - "ld1 {v6.16b}, [%0], %5 \n" - "ld1 {v7.16b}, [%0] \n" - "mov %0, %1 \n" - - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" - - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" - - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" - - "mov %0, %2 \n" - - "st1 {v16.d}[0], [%0], %6 \n" - "st1 {v18.d}[0], [%0], %6 \n" - "st1 {v17.d}[0], [%0], %6 \n" - "st1 {v19.d}[0], [%0], %6 \n" - "st1 {v16.d}[1], [%0], %6 \n" - "st1 {v18.d}[1], [%0], %6 \n" - "st1 {v17.d}[1], [%0], %6 \n" - "st1 {v19.d}[1], [%0] \n" - - "mov %0, %3 \n" - - "st1 {v20.d}[0], [%0], %7 \n" - "st1 {v22.d}[0], [%0], %7 \n" - "st1 {v21.d}[0], [%0], %7 \n" - "st1 {v23.d}[0], [%0], %7 \n" - "st1 {v20.d}[1], [%0], %7 \n" - "st1 {v22.d}[1], [%0], %7 \n" - "st1 {v21.d}[1], [%0], %7 \n" - "st1 {v23.d}[1], [%0] \n" - - "add %1, %1, #16 \n" // src += 8*2 - "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * - // dst_stride_a - "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * - // dst_stride_b - "subs %w4, %w4, #8 \n" // w -= 8 - "b.ge 1b \n" - - // add 8 back to counter. if the result is 0 there are - // no residuals. 
- "adds %w4, %w4, #8 \n" - "b.eq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %w4, #2 \n" - "b.lt 3f \n" - - "cmp %w4, #4 \n" - "b.lt 2f \n" - - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - "ld1 {v0.8b}, [%0], %5 \n" - "ld1 {v1.8b}, [%0], %5 \n" - "ld1 {v2.8b}, [%0], %5 \n" - "ld1 {v3.8b}, [%0], %5 \n" - "ld1 {v4.8b}, [%0], %5 \n" - "ld1 {v5.8b}, [%0], %5 \n" - "ld1 {v6.8b}, [%0], %5 \n" - "ld1 {v7.8b}, [%0] \n" - - "ld1 {v30.16b}, [%8], #16 \n" - "ld1 {v31.16b}, [%8] \n" - - "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" - "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" - "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" - "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" - - "mov %0, %2 \n" - - "st1 {v16.s}[0], [%0], %6 \n" - "st1 {v16.s}[1], [%0], %6 \n" - "st1 {v16.s}[2], [%0], %6 \n" - "st1 {v16.s}[3], [%0], %6 \n" - - "add %0, %2, #4 \n" - "st1 {v18.s}[0], [%0], %6 \n" - "st1 {v18.s}[1], [%0], %6 \n" - "st1 {v18.s}[2], [%0], %6 \n" - "st1 {v18.s}[3], [%0] \n" - - "mov %0, %3 \n" - - "st1 {v17.s}[0], [%0], %7 \n" - "st1 {v17.s}[1], [%0], %7 \n" - "st1 {v17.s}[2], [%0], %7 \n" - "st1 {v17.s}[3], [%0], %7 \n" - - "add %0, %3, #4 \n" - "st1 {v19.s}[0], [%0], %7 \n" - "st1 {v19.s}[1], [%0], %7 \n" - "st1 {v19.s}[2], [%0], %7 \n" - "st1 {v19.s}[3], [%0] \n" - - "add %1, %1, #8 \n" // src += 4 * 2 - "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * - // dst_stride_a - "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * - // dst_stride_b - "subs %w4, %w4, #4 \n" // w -= 4 - "b.eq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %w4, #2 \n" - "b.lt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - "ld2 {v0.h, v1.h}[0], [%0], %5 \n" - "ld2 {v2.h, v3.h}[0], [%0], %5 \n" - "ld2 {v0.h, v1.h}[1], [%0], %5 \n" - "ld2 {v2.h, v3.h}[1], [%0], %5 \n" - "ld2 {v0.h, v1.h}[2], [%0], %5 \n" - "ld2 {v2.h, v3.h}[2], [%0], %5 \n" - "ld2 {v0.h, v1.h}[3], [%0], %5 \n" - "ld2 {v2.h, v3.h}[3], [%0] \n" - - "trn1 v4.8b, v0.8b, v2.8b \n" - "trn2 v5.8b, v0.8b, v2.8b \n" - "trn1 v6.8b, v1.8b, v3.8b \n" - "trn2 v7.8b, v1.8b, v3.8b \n" - - "mov %0, %2 \n" - - "st1 {v4.d}[0], [%0], %6 \n" - "st1 {v6.d}[0], [%0] \n" - - "mov %0, %3 \n" - - "st1 {v5.d}[0], [%0], %7 \n" - "st1 {v7.d}[0], [%0] \n" - - "add %1, %1, #4 \n" // src += 2 * 2 - "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * - // dst_stride_a - "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * - // dst_stride_b - "subs %w4, %w4, #2 \n" // w -= 2 - "b.eq 4f \n" - - // 1x8 block - "3: \n" - "ld2 {v0.b, v1.b}[0], [%1], %5 \n" - "ld2 {v0.b, v1.b}[1], [%1], %5 \n" - "ld2 {v0.b, v1.b}[2], [%1], %5 \n" - "ld2 {v0.b, v1.b}[3], [%1], %5 \n" - "ld2 {v0.b, v1.b}[4], [%1], %5 \n" - "ld2 {v0.b, v1.b}[5], [%1], %5 \n" - "ld2 {v0.b, v1.b}[6], [%1], %5 \n" - "ld2 {v0.b, v1.b}[7], [%1] \n" - - "st1 {v0.d}[0], [%2] \n" - "st1 {v1.d}[0], [%3] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst_a), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride_a)), // %6 - "r"(static_cast(dst_stride_b)), // %7 - "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); -} -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_win.cc 
b/thirdparty/libyuv/source/rotate_win.cc deleted file mode 100644 index a78873f..0000000 --- a/thirdparty/libyuv/source/rotate_win.cc +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) - -__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - __asm { - push edi - push esi - push ebp - mov eax, [esp + 12 + 4] // src - mov edi, [esp + 12 + 8] // src_stride - mov edx, [esp + 12 + 12] // dst - mov esi, [esp + 12 + 16] // dst_stride - mov ecx, [esp + 12 + 20] // width - - // Read in the data from the source pointer. - // First round of bit swap. - align 4 - convertloop: - movq xmm0, qword ptr [eax] - lea ebp, [eax + 8] - movq xmm1, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm0, xmm1 - movq xmm2, qword ptr [eax] - movdqa xmm1, xmm0 - palignr xmm1, xmm1, 8 - movq xmm3, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm2, xmm3 - movdqa xmm3, xmm2 - movq xmm4, qword ptr [eax] - palignr xmm3, xmm3, 8 - movq xmm5, qword ptr [eax + edi] - punpcklbw xmm4, xmm5 - lea eax, [eax + 2 * edi] - movdqa xmm5, xmm4 - movq xmm6, qword ptr [eax] - palignr xmm5, xmm5, 8 - movq xmm7, qword ptr [eax + edi] - punpcklbw xmm6, xmm7 - mov eax, ebp - movdqa xmm7, xmm6 - palignr xmm7, xmm7, 8 - // Second round of bit swap. - punpcklwd xmm0, xmm2 - punpcklwd xmm1, xmm3 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - palignr xmm2, xmm2, 8 - palignr xmm3, xmm3, 8 - punpcklwd xmm4, xmm6 - punpcklwd xmm5, xmm7 - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - palignr xmm6, xmm6, 8 - palignr xmm7, xmm7, 8 - // Third round of bit swap. - // Write to the destination pointer. 
- punpckldq xmm0, xmm4 - movq qword ptr [edx], xmm0 - movdqa xmm4, xmm0 - palignr xmm4, xmm4, 8 - movq qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - punpckldq xmm2, xmm6 - movdqa xmm6, xmm2 - palignr xmm6, xmm6, 8 - movq qword ptr [edx], xmm2 - punpckldq xmm1, xmm5 - movq qword ptr [edx + esi], xmm6 - lea edx, [edx + 2 * esi] - movdqa xmm5, xmm1 - movq qword ptr [edx], xmm1 - palignr xmm5, xmm5, 8 - punpckldq xmm3, xmm7 - movq qword ptr [edx + esi], xmm5 - lea edx, [edx + 2 * esi] - movq qword ptr [edx], xmm3 - movdqa xmm7, xmm3 - palignr xmm7, xmm7, 8 - sub ecx, 8 - movq qword ptr [edx + esi], xmm7 - lea edx, [edx + 2 * esi] - jg convertloop - - pop ebp - pop esi - pop edi - ret - } -} - -__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int w) { - __asm { - push ebx - push esi - push edi - push ebp - mov eax, [esp + 16 + 4] // src - mov edi, [esp + 16 + 8] // src_stride - mov edx, [esp + 16 + 12] // dst_a - mov esi, [esp + 16 + 16] // dst_stride_a - mov ebx, [esp + 16 + 20] // dst_b - mov ebp, [esp + 16 + 24] // dst_stride_b - mov ecx, esp - sub esp, 4 + 16 - and esp, ~15 - mov [esp + 16], ecx - mov ecx, [ecx + 16 + 28] // w - - align 4 - // Read in the data from the source pointer. - // First round of bit swap. - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm0 // use xmm7 as temp register. - punpcklbw xmm0, xmm1 - punpckhbw xmm7, xmm1 - movdqa xmm1, xmm7 - movdqu xmm2, [eax] - movdqu xmm3, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm2 - punpcklbw xmm2, xmm3 - punpckhbw xmm7, xmm3 - movdqa xmm3, xmm7 - movdqu xmm4, [eax] - movdqu xmm5, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm4 - punpcklbw xmm4, xmm5 - punpckhbw xmm7, xmm5 - movdqa xmm5, xmm7 - movdqu xmm6, [eax] - movdqu xmm7, [eax + edi] - lea eax, [eax + 2 * edi] - movdqu [esp], xmm5 // backup xmm5 - neg edi - movdqa xmm5, xmm6 // use xmm5 as temp register. - punpcklbw xmm6, xmm7 - punpckhbw xmm5, xmm7 - movdqa xmm7, xmm5 - lea eax, [eax + 8 * edi + 16] - neg edi - // Second round of bit swap. - movdqa xmm5, xmm0 - punpcklwd xmm0, xmm2 - punpckhwd xmm5, xmm2 - movdqa xmm2, xmm5 - movdqa xmm5, xmm1 - punpcklwd xmm1, xmm3 - punpckhwd xmm5, xmm3 - movdqa xmm3, xmm5 - movdqa xmm5, xmm4 - punpcklwd xmm4, xmm6 - punpckhwd xmm5, xmm6 - movdqa xmm6, xmm5 - movdqu xmm5, [esp] // restore xmm5 - movdqu [esp], xmm6 // backup xmm6 - movdqa xmm6, xmm5 // use xmm6 as temp register. - punpcklwd xmm5, xmm7 - punpckhwd xmm6, xmm7 - movdqa xmm7, xmm6 - - // Third round of bit swap. - // Write to the destination pointer. - movdqa xmm6, xmm0 - punpckldq xmm0, xmm4 - punpckhdq xmm6, xmm4 - movdqa xmm4, xmm6 - movdqu xmm6, [esp] // restore xmm6 - movlpd qword ptr [edx], xmm0 - movhpd qword ptr [ebx], xmm0 - movlpd qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm4 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm2 // use xmm0 as the temp register. - punpckldq xmm2, xmm6 - movlpd qword ptr [edx], xmm2 - movhpd qword ptr [ebx], xmm2 - punpckhdq xmm0, xmm6 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm1 // use xmm0 as the temp register. 
- punpckldq xmm1, xmm5 - movlpd qword ptr [edx], xmm1 - movhpd qword ptr [ebx], xmm1 - punpckhdq xmm0, xmm5 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm3 // use xmm0 as the temp register. - punpckldq xmm3, xmm7 - movlpd qword ptr [edx], xmm3 - movhpd qword ptr [ebx], xmm3 - punpckhdq xmm0, xmm7 - sub ecx, 8 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - jg convertloop - - mov esp, [esp + 16] - pop ebp - pop edi - pop esi - pop ebx - ret - } -} - -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_any.cc b/thirdparty/libyuv/source/row_any.cc deleted file mode 100644 index c9a402e..0000000 --- a/thirdparty/libyuv/source/row_any.cc +++ /dev/null @@ -1,2071 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#include // For memset. - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// memset for temp is meant to clear the source buffer (not dest) so that -// SIMD that reads full multiple of 16 bytes will not trigger msan errors. -// memset is not needed for production, as the garbage values are processed but -// not used, although there may be edge cases for subsampling. -// The size of the buffer is based on the largest read, which can be inferred -// by the source type (e.g. ARGB) and the mask (last parameter), or by examining -// the source code for how much the source pointers are advanced. - -// Subsampled source needs to be increase by 1 of not even. -#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) - -// Any 4 planes to 1 -#define ANY41(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_MERGEARGBROW_SSE2 -ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7) -#endif -#ifdef HAS_MERGEARGBROW_AVX2 -ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15) -#endif -#ifdef HAS_MERGEARGBROW_NEON -ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) -#endif - -// Note that odd width replication includes 444 due to implementation -// on arm that subsamples 444 to 422 internally. 
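All of the ANY* wrappers being removed in this file share one tail-handling idea: run the SIMD kernel over the largest multiple of (MASK + 1) pixels, copy the leftover pixels into a zero-padded scratch buffer, run the kernel once more on that padded tail, and copy back only the real output. As a rough, self-contained sketch of that pattern (the kernel name CopyRow_SIMD, its scalar stand-in body, and the 16-pixel granularity are made up for illustration and are not libyuv's API):

#include <stdint.h>
#include <string.h>

/* Stand-in "kernel": pretend this is a SIMD routine that requires width to
   be a multiple of 16; here it simply copies bytes. */
static void CopyRow_SIMD(const uint8_t* src, uint8_t* dst, int width) {
  memcpy(dst, src, (size_t)width);
}

/* "Any width" wrapper, mirroring the structure the ANY11 macro generates:
   process the aligned bulk, then pad the tail so the kernel can safely read
   a full vector's worth of bytes past the real data. */
void CopyRow_Any_SIMD(const uint8_t* src, uint8_t* dst, int width) {
  uint8_t temp[16 * 2];
  memset(temp, 0, 16);                /* keep the padded reads defined (msan) */
  int r = width & 15;                 /* leftover pixels */
  int n = width & ~15;                /* bulk, a multiple of 16 */
  if (n > 0) {
    CopyRow_SIMD(src, dst, n);        /* full-speed SIMD over the bulk */
  }
  memcpy(temp, src + n, r);           /* copy the tail into the scratch */
  CopyRow_SIMD(temp, temp + 16, 16);  /* one padded vector for the tail */
  memcpy(dst + n, temp + 16, r);      /* write back only the r real pixels */
}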
-// Any 4 planes to 1 with yuvconstants -#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - if (width & 1) { \ - temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_I444ALPHATOARGBROW_SSSE3 -ANY41C(I444AlphaToARGBRow_Any_SSSE3, I444AlphaToARGBRow_SSSE3, 0, 0, 4, 7) -#endif -#ifdef HAS_I444ALPHATOARGBROW_AVX2 -ANY41C(I444AlphaToARGBRow_Any_AVX2, I444AlphaToARGBRow_AVX2, 0, 0, 4, 15) -#endif -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422ALPHATOARGBROW_AVX2 -ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) -#endif -#ifdef HAS_I444ALPHATOARGBROW_NEON -ANY41C(I444AlphaToARGBRow_Any_NEON, I444AlphaToARGBRow_NEON, 0, 0, 4, 7) -#endif -#ifdef HAS_I422ALPHATOARGBROW_NEON -ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) -#endif -#ifdef HAS_I444ALPHATOARGBROW_MSA -ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7) -#endif -#ifdef HAS_I422ALPHATOARGBROW_MSA -ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) -#endif -#ifdef HAS_I444ALPHATOARGBROW_MMI -ANY41C(I444AlphaToARGBRow_Any_MMI, I444AlphaToARGBRow_MMI, 0, 0, 4, 7) -#endif -#ifdef HAS_I422ALPHATOARGBROW_MMI -ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7) -#endif -#undef ANY41C - -// Any 4 planes to 1 plane of 8 bit with yuvconstants -#define ANY41CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ - void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \ - uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 48, a_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, yuvconstants, \ - MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_I210ALPHATOARGBROW_SSSE3 -ANY41CT(I210AlphaToARGBRow_Any_SSSE3, - I210AlphaToARGBRow_SSSE3, - 1, - 0, - uint16_t, - 2, - 4, - 7) -#endif - -#ifdef HAS_I210ALPHATOARGBROW_AVX2 -ANY41CT(I210AlphaToARGBRow_Any_AVX2, - I210AlphaToARGBRow_AVX2, - 1, - 0, - uint16_t, - 2, - 4, - 15) -#endif - -#ifdef HAS_I410ALPHATOARGBROW_SSSE3 
-ANY41CT(I410AlphaToARGBRow_Any_SSSE3, - I410AlphaToARGBRow_SSSE3, - 0, - 0, - uint16_t, - 2, - 4, - 7) -#endif - -#ifdef HAS_I410ALPHATOARGBROW_AVX2 -ANY41CT(I410AlphaToARGBRow_Any_AVX2, - I410AlphaToARGBRow_AVX2, - 0, - 0, - uint16_t, - 2, - 4, - 15) -#endif - -#undef ANY41CT - -// Any 4 planes to 1 plane with parameter -#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ - void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ - const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \ - SIMD_ALIGNED(STYPE temp[16 * 4]); \ - SIMD_ALIGNED(DTYPE out[64]); \ - memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \ - } \ - memcpy(temp, r_buf + n, r * SBPP); \ - memcpy(temp + 16, g_buf + n, r * SBPP); \ - memcpy(temp + 32, b_buf + n, r * SBPP); \ - memcpy(temp + 48, a_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1); \ - memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ - } - -#ifdef HAS_MERGEAR64ROW_AVX2 -ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) -#endif - -#ifdef HAS_MERGEAR64ROW_NEON -ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) -#endif - -#ifdef HAS_MERGEARGB16TO8ROW_AVX2 -ANY41PT(MergeARGB16To8Row_Any_AVX2, - MergeARGB16To8Row_AVX2, - uint16_t, - 2, - uint8_t, - 4, - 15) -#endif - -#ifdef HAS_MERGEARGB16TO8ROW_NEON -ANY41PT(MergeARGB16To8Row_Any_NEON, - MergeARGB16To8Row_NEON, - uint16_t, - 2, - uint8_t, - 4, - 7) -#endif - -#undef ANY41PT - -// Any 3 planes to 1. -#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ - } - -// Merge functions. 
-#ifdef HAS_MERGERGBROW_SSSE3 -ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) -#endif -#ifdef HAS_MERGERGBROW_NEON -ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) -#endif -#ifdef HAS_MERGERGBROW_MMI -ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7) -#endif -#ifdef HAS_MERGEXRGBROW_SSE2 -ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7) -#endif -#ifdef HAS_MERGEXRGBROW_AVX2 -ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15) -#endif -#ifdef HAS_MERGEXRGBROW_NEON -ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15) -#endif -#ifdef HAS_I422TOYUY2ROW_SSE2 -ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) -ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) -#endif -#ifdef HAS_I422TOYUY2ROW_AVX2 -ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) -ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) -#endif -#ifdef HAS_I422TOYUY2ROW_NEON -ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) -#endif -#ifdef HAS_I422TOYUY2ROW_MSA -ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) -#endif -#ifdef HAS_I422TOYUY2ROW_MMI -ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7) -#endif -#ifdef HAS_I422TOUYVYROW_NEON -ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) -#endif -#ifdef HAS_I422TOUYVYROW_MSA -ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) -#endif -#ifdef HAS_I422TOUYVYROW_MMI -ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7) -#endif -#ifdef HAS_BLENDPLANEROW_AVX2 -ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) -#endif -#ifdef HAS_BLENDPLANEROW_SSSE3 -ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) -#endif -#ifdef HAS_BLENDPLANEROW_MMI -ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7) -#endif -#undef ANY31 - -// Note that odd width replication includes 444 due to implementation -// on arm that subsamples 444 to 422 internally. 
-// Any 3 planes to 1 with yuvconstants -#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 4]); \ - memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - if (width & 1) { \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \ - MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \ - SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_I422TOARGBROW_SSSE3 -ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422TORGBAROW_SSSE3 -ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422TOARGB4444ROW_SSSE3 -ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TOARGB1555ROW_SSSE3 -ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TORGB565ROW_SSSE3 -ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TORGB24ROW_SSSE3 -ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) -#endif -#ifdef HAS_I422TOAR30ROW_SSSE3 -ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422TOAR30ROW_AVX2 -ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) -#endif -#ifdef HAS_I444TOARGBROW_SSSE3 -ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) -#endif -#ifdef HAS_I422TORGB24ROW_AVX2 -ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) -#endif -#ifdef HAS_I422TOARGBROW_AVX2 -ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) -#endif -#ifdef HAS_I422TORGBAROW_AVX2 -ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) -#endif -#ifdef HAS_I444TOARGBROW_AVX2 -ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) -#endif -#ifdef HAS_I422TOARGB4444ROW_AVX2 -ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15) -#endif -#ifdef HAS_I422TOARGB1555ROW_AVX2 -ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15) -#endif -#ifdef HAS_I422TORGB565ROW_AVX2 -ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15) -#endif -#ifdef HAS_I422TOARGBROW_NEON -ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) -ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) -ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) -ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) -ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) -ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) -ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TOARGBROW_MSA -ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7) -ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) -ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) -ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) 
-ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) -ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) -ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TOARGBROW_MMI -ANY31C(I444ToARGBRow_Any_MMI, I444ToARGBRow_MMI, 0, 0, 4, 7) -ANY31C(I422ToARGBRow_Any_MMI, I422ToARGBRow_MMI, 1, 0, 4, 7) -ANY31C(I422ToRGB24Row_Any_MMI, I422ToRGB24Row_MMI, 1, 0, 3, 15) -ANY31C(I422ToARGB4444Row_Any_MMI, I422ToARGB4444Row_MMI, 1, 0, 2, 7) -ANY31C(I422ToARGB1555Row_Any_MMI, I422ToARGB1555Row_MMI, 1, 0, 2, 7) -ANY31C(I422ToRGB565Row_Any_MMI, I422ToRGB565Row_MMI, 1, 0, 2, 7) -ANY31C(I422ToRGBARow_Any_MMI, I422ToRGBARow_MMI, 1, 0, 4, 7) -#endif -#undef ANY31C - -// Any 3 planes of 16 bit to 1 with yuvconstants -// TODO(fbarchard): consider sharing this code with ANY31C -#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ - void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ - uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(T temp[16 * 3]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_I210TOAR30ROW_SSSE3 -ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I210TOARGBROW_SSSE3 -ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I210TOARGBROW_AVX2 -ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_I210TOAR30ROW_AVX2 -ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_I410TOAR30ROW_SSSE3 -ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I410TOARGBROW_SSSE3 -ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I410TOARGBROW_AVX2 -ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_I410TOAR30ROW_AVX2 -ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_I210TOARGBROW_MMI -ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I212TOAR30ROW_SSSE3 -ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I212TOARGBROW_SSSE3 -ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I212TOARGBROW_AVX2 -ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_I212TOAR30ROW_AVX2 -ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#undef ANY31CT - -// Any 3 planes to 1 plane with parameter -#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ - void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ - DTYPE* dst_ptr, int depth, int width) { \ - SIMD_ALIGNED(STYPE temp[16 * 3]); \ - SIMD_ALIGNED(DTYPE out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* 
for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \ - } \ - memcpy(temp, r_buf + n, r * SBPP); \ - memcpy(temp + 16, g_buf + n, r * SBPP); \ - memcpy(temp + 32, b_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); \ - memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ - } - -#ifdef HAS_MERGEXR30ROW_AVX2 -ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15) -#endif - -#ifdef HAS_MERGEXR30ROW_NEON -ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3) -ANY31PT(MergeXR30Row_10_Any_NEON, - MergeXR30Row_10_NEON, - uint16_t, - 2, - uint8_t, - 4, - 3) -#endif - -#ifdef HAS_MERGEXR64ROW_AVX2 -ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) -#endif - -#ifdef HAS_MERGEXR64ROW_NEON -ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) -#endif - -#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 -ANY31PT(MergeXRGB16To8Row_Any_AVX2, - MergeXRGB16To8Row_AVX2, - uint16_t, - 2, - uint8_t, - 4, - 15) -#endif - -#ifdef HAS_MERGEXRGB16TO8ROW_NEON -ANY31PT(MergeXRGB16To8Row_Any_NEON, - MergeXRGB16To8Row_NEON, - uint16_t, - 2, - uint8_t, - 4, - 7) -#endif - -#undef ANY31PT - -// Any 2 planes to 1. -#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } - -// Merge functions. -#ifdef HAS_MERGEUVROW_SSE2 -ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) -#endif -#ifdef HAS_MERGEUVROW_AVX2 -ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) -#endif -#ifdef HAS_MERGEUVROW_NEON -ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) -#endif -#ifdef HAS_MERGEUVROW_MSA -ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) -#endif -#ifdef HAS_MERGEUVROW_MMI -ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7) -#endif -#ifdef HAS_NV21TOYUV24ROW_NEON -ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15) -#endif -#ifdef HAS_NV21TOYUV24ROW_AVX2 -ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31) -#endif -// Math functions. 
-#ifdef HAS_ARGBMULTIPLYROW_SSE2 -ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBADDROW_SSE2 -ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBMULTIPLYROW_AVX2 -ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBADDROW_AVX2 -ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBSUBTRACTROW_AVX2 -ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBMULTIPLYROW_NEON -ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBADDROW_NEON -ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBSUBTRACTROW_NEON -ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBMULTIPLYROW_MSA -ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBMULTIPLYROW_MMI -ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1) -#endif -#ifdef HAS_ARGBADDROW_MSA -ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBADDROW_MMI -ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1) -#endif -#ifdef HAS_ARGBSUBTRACTROW_MSA -ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBSUBTRACTROW_MMI -ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1) -#endif -#ifdef HAS_SOBELROW_SSE2 -ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) -#endif -#ifdef HAS_SOBELROW_NEON -ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) -#endif -#ifdef HAS_SOBELROW_MSA -ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) -#endif -#ifdef HAS_SOBELROW_MMI -ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7) -#endif -#ifdef HAS_SOBELTOPLANEROW_SSE2 -ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) -#endif -#ifdef HAS_SOBELTOPLANEROW_NEON -ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) -#endif -#ifdef HAS_SOBELTOPLANEROW_MSA -ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) -#endif -#ifdef HAS_SOBELTOPLANEROW_MMI -ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7) -#endif -#ifdef HAS_SOBELXYROW_SSE2 -ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) -#endif -#ifdef HAS_SOBELXYROW_NEON -ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) -#endif -#ifdef HAS_SOBELXYROW_MSA -ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) -#endif -#ifdef HAS_SOBELXYROW_MMI -ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7) -#endif -#undef ANY21 - -// Any 2 planes to 1 with yuvconstants -#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ - } - -// Biplanar to RGB. 
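For kernels that read subsampled chroma, the tail copy uses SS(r, UVSHIFT) elements instead of r, so an odd leftover width is rounded up to a whole chroma sample. A quick check of that rounding, with the SS macro repeated from the top of this file so the snippet stands alone (the example widths are arbitrary):

#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

/* 4:2:0 / 4:2:2 chroma (shift 1): a 7-pixel tail still needs 4 UV samples,
   an 8-pixel tail needs exactly 4, and 4:4:4 data (shift 0) is unaffected. */
_Static_assert(SS(7, 1) == 4, "odd tail rounds up to a whole UV sample");
_Static_assert(SS(8, 1) == 4, "even tail divides exactly");
_Static_assert(SS(5, 0) == 5, "no subsampling, no rounding");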
-#ifdef HAS_NV12TOARGBROW_SSSE3 -ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV12TOARGBROW_AVX2 -ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) -#endif -#ifdef HAS_NV12TOARGBROW_NEON -ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV12TOARGBROW_MSA -ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV12TOARGBROW_MMI -ANY21C(NV12ToARGBRow_Any_MMI, NV12ToARGBRow_MMI, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV21TOARGBROW_SSSE3 -ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV21TOARGBROW_AVX2 -ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) -#endif -#ifdef HAS_NV21TOARGBROW_NEON -ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV21TOARGBROW_MSA -ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV21TOARGBROW_MMI -ANY21C(NV21ToARGBRow_Any_MMI, NV21ToARGBRow_MMI, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV12TORGB24ROW_NEON -ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) -#endif -#ifdef HAS_NV21TORGB24ROW_NEON -ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) -#endif -#ifdef HAS_NV12TORGB24ROW_SSSE3 -ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) -#endif -#ifdef HAS_NV12TORGB24ROW_MMI -ANY21C(NV12ToRGB24Row_Any_MMI, NV12ToRGB24Row_MMI, 1, 1, 2, 3, 7) -#endif -#ifdef HAS_NV21TORGB24ROW_SSSE3 -ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) -#endif -#ifdef HAS_NV12TORGB24ROW_AVX2 -ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31) -#endif -#ifdef HAS_NV21TORGB24ROW_AVX2 -ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31) -#endif -#ifdef HAS_NV21TORGB24ROW_MMI -ANY21C(NV21ToRGB24Row_Any_MMI, NV21ToRGB24Row_MMI, 1, 1, 2, 3, 7) -#endif -#ifdef HAS_NV12TORGB565ROW_SSSE3 -ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) -#endif -#ifdef HAS_NV12TORGB565ROW_AVX2 -ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) -#endif -#ifdef HAS_NV12TORGB565ROW_NEON -ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) -#endif -#ifdef HAS_NV12TORGB565ROW_MSA -ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) -#endif -#ifdef HAS_NV12TORGB565ROW_MMI -ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7) -#endif -#undef ANY21C - -// Any 2 planes of 16 bit to 1 with yuvconstants -#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ - void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(T temp[16 * 3]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ - ANY_SIMD(temp, temp + 16, out, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_P210TOAR30ROW_SSSE3 -ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P210TOARGBROW_SSSE3 -ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P210TOARGBROW_AVX2 
-ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_P210TOAR30ROW_AVX2 -ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_P410TOAR30ROW_SSSE3 -ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P410TOARGBROW_SSSE3 -ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P410TOARGBROW_AVX2 -ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_P410TOAR30ROW_AVX2 -ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) -#endif - -#undef ANY21CT - -// Any 2 16 bit planes with parameter to 1 -#define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ - void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \ - int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_u, src_v, dst_uv, depth, n); \ - } \ - memcpy(temp, src_u + n, r * BPP); \ - memcpy(temp + 16, src_v + n, r * BPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, depth, MASK + 1); \ - memcpy(dst_uv + n * 2, temp + 32, r * BPP * 2); \ - } - -#ifdef HAS_MERGEUVROW_16_AVX2 -ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15) -#endif -#ifdef HAS_MERGEUVROW_16_NEON -ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) -#endif - -#undef ANY21CT - -// Any 1 to 1. -#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } - -#ifdef HAS_COPYROW_AVX -ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) -#endif -#ifdef HAS_COPYROW_SSE2 -ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) -#endif -#ifdef HAS_COPYROW_NEON -ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) -#endif -#if defined(HAS_ARGBTORGB24ROW_SSSE3) -ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) -ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) -ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) -ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) -ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) -#endif -#if defined(HAS_ARGBTORGB24ROW_AVX2) -ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31) -#endif -#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) -ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31) -#endif -#if defined(HAS_ARGBTORAWROW_AVX2) -ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31) -#endif -#if defined(HAS_ARGBTORGB565ROW_AVX2) -ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) -#endif -#if defined(HAS_ARGBTOARGB4444ROW_AVX2) -ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) -ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) -#endif -#if defined(HAS_ABGRTOAR30ROW_SSSE3) -ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) -#endif -#if defined(HAS_ARGBTOAR30ROW_SSSE3) -ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 
4, 3) -#endif -#if defined(HAS_ABGRTOAR30ROW_AVX2) -ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) -#endif -#if defined(HAS_ARGBTOAR30ROW_AVX2) -ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) -#endif -#if defined(HAS_J400TOARGBROW_SSE2) -ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) -#endif -#if defined(HAS_J400TOARGBROW_AVX2) -ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) -#endif -#if defined(HAS_RGB24TOARGBROW_SSSE3) -ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) -ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) -ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) -ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) -ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) -#endif -#if defined(HAS_RAWTORGBAROW_SSSE3) -ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15) -#endif -#if defined(HAS_RAWTORGB24ROW_SSSE3) -ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7) -#endif -#if defined(HAS_RGB565TOARGBROW_AVX2) -ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) -#endif -#if defined(HAS_ARGB1555TOARGBROW_AVX2) -ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) -#endif -#if defined(HAS_ARGB4444TOARGBROW_AVX2) -ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) -#endif -#if defined(HAS_ARGBTORGB24ROW_NEON) -ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7) -ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) -ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) -ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) -ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) -ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) -#endif -#if defined(HAS_ARGBTORGB24ROW_MSA) -ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) -ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) -ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) -ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) -ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) -ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) -#endif -#if defined(HAS_ARGBTORGB24ROW_MMI) -ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3) -ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3) -ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3) -ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3) -ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3) -ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3) -#endif -#if defined(HAS_RAWTORGB24ROW_NEON) -ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) -#endif -#if defined(HAS_RAWTORGB24ROW_MSA) -ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) -#endif -#if defined(HAS_RAWTORGB24ROW_MMI) -ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3) -#endif -#ifdef HAS_ARGBTOYROW_AVX2 -ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) -#endif -#ifdef HAS_ABGRTOYROW_AVX2 -ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) -#endif -#ifdef HAS_ARGBTOYJROW_AVX2 -ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) -#endif -#ifdef HAS_RGBATOYJROW_AVX2 -ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31) -#endif -#ifdef HAS_UYVYTOYROW_AVX2 -ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) -#endif -#ifdef HAS_YUY2TOYROW_AVX2 
-ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31) -#endif -#ifdef HAS_ARGBTOYROW_SSSE3 -ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) -#endif -#ifdef HAS_BGRATOYROW_SSSE3 -ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) -ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) -ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) -ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) -ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) -#endif -#ifdef HAS_ARGBTOYJROW_SSSE3 -ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) -#endif -#ifdef HAS_RGBATOYJROW_SSSE3 -ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) -#endif -#ifdef HAS_ARGBTOYROW_NEON -ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_ARGBTOYROW_MSA -ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) -#endif -#ifdef HAS_ARGBTOYROW_MMI -ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7) -#endif -#ifdef HAS_ARGBTOYJROW_NEON -ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_RGBATOYJROW_NEON -ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_ARGBTOYJROW_MSA -ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) -#endif -#ifdef HAS_ARGBTOYJROW_MMI -ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7) -#endif -#ifdef HAS_BGRATOYROW_NEON -ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_BGRATOYROW_MSA -ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) -#endif -#ifdef HAS_BGRATOYROW_MMI -ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7) -#endif -#ifdef HAS_ABGRTOYROW_NEON -ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_ABGRTOYROW_MSA -ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) -#endif -#ifdef HAS_ABGRTOYROW_MMI -ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7) -#endif -#ifdef HAS_RGBATOYROW_NEON -ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_RGBATOYROW_MSA -ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) -#endif -#ifdef HAS_RGBATOYROW_MMI -ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7) -#endif -#ifdef HAS_RGB24TOYROW_NEON -ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) -#endif -#ifdef HAS_RGB24TOYJROW_AVX2 -ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31) -#endif -#ifdef HAS_RGB24TOYJROW_SSSE3 -ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15) -#endif -#ifdef HAS_RGB24TOYJROW_NEON -ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 7) -#endif -#ifdef HAS_RGB24TOYROW_MSA -ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) -#endif -#ifdef HAS_RGB24TOYROW_MMI -ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7) -#endif -#ifdef HAS_RAWTOYROW_NEON -ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) -#endif -#ifdef HAS_RAWTOYJROW_AVX2 -ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31) -#endif -#ifdef HAS_RAWTOYJROW_SSSE3 -ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15) -#endif -#ifdef HAS_RAWTOYJROW_NEON -ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 7) -#endif -#ifdef HAS_RAWTOYROW_MSA -ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) -#endif -#ifdef HAS_RAWTOYROW_MMI -ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7) -#endif -#ifdef HAS_RGB565TOYROW_NEON -ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) -#endif -#ifdef HAS_RGB565TOYROW_MSA -ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) -#endif -#ifdef HAS_RGB565TOYROW_MMI 
-ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7) -#endif -#ifdef HAS_ARGB1555TOYROW_NEON -ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) -#endif -#ifdef HAS_ARGB1555TOYROW_MSA -ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) -#endif -#ifdef HAS_ARGB1555TOYROW_MMI -ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7) -#endif -#ifdef HAS_ARGB4444TOYROW_NEON -ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) -#endif -#ifdef HAS_ARGB4444TOYROW_MMI -ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7) -#endif -#ifdef HAS_YUY2TOYROW_NEON -ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) -#endif -#ifdef HAS_UYVYTOYROW_NEON -ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) -#endif -#ifdef HAS_YUY2TOYROW_MSA -ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) -#endif -#ifdef HAS_YUY2TOYROW_MMI -ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7) -#endif -#ifdef HAS_UYVYTOYROW_MSA -ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) -#endif -#ifdef HAS_UYVYTOYROW_MMI -ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15) -#endif -#ifdef HAS_AYUVTOYROW_NEON -ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) -#endif -#ifdef HAS_SWAPUVROW_SSSE3 -ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15) -#endif -#ifdef HAS_SWAPUVROW_AVX2 -ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31) -#endif -#ifdef HAS_SWAPUVROW_NEON -ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15) -#endif -#ifdef HAS_RGB24TOARGBROW_NEON -ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) -#endif -#ifdef HAS_RGB24TOARGBROW_MSA -ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) -#endif -#ifdef HAS_RGB24TOARGBROW_MMI -ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3) -#endif -#ifdef HAS_RAWTOARGBROW_NEON -ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) -#endif -#ifdef HAS_RAWTORGBAROW_NEON -ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7) -#endif -#ifdef HAS_RAWTOARGBROW_MSA -ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) -#endif -#ifdef HAS_RAWTOARGBROW_MMI -ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3) -#endif -#ifdef HAS_RGB565TOARGBROW_NEON -ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) -#endif -#ifdef HAS_RGB565TOARGBROW_MSA -ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) -#endif -#ifdef HAS_RGB565TOARGBROW_MMI -ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3) -#endif -#ifdef HAS_ARGB1555TOARGBROW_NEON -ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) -#endif -#ifdef HAS_ARGB1555TOARGBROW_MSA -ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) -#endif -#ifdef HAS_ARGB1555TOARGBROW_MMI -ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3) -#endif -#ifdef HAS_ARGB4444TOARGBROW_NEON -ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) -#endif -#ifdef HAS_ARGB4444TOARGBROW_MSA -ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) -#endif -#ifdef HAS_ARGB4444TOARGBROW_MMI -ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3) -#endif -#ifdef HAS_ARGBATTENUATEROW_SSSE3 -ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) -#endif -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) -#endif -#ifdef HAS_ARGBATTENUATEROW_AVX2 -ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7) -#endif -#ifdef 
HAS_ARGBUNATTENUATEROW_AVX2 -ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) -#endif -#ifdef HAS_ARGBATTENUATEROW_NEON -ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) -#endif -#ifdef HAS_ARGBATTENUATEROW_MSA -ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) -#endif -#ifdef HAS_ARGBATTENUATEROW_MMI -ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1) -#endif -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) -#endif -#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 -ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31) -#endif -#ifdef HAS_ARGBEXTRACTALPHAROW_NEON -ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) -#endif -#ifdef HAS_ARGBEXTRACTALPHAROW_MSA -ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) -#endif -#ifdef HAS_ARGBEXTRACTALPHAROW_MMI -ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7) -#endif -#undef ANY11 - -// Any 1 to 1 blended. Destination is read, modify, write. -#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ - } - -#ifdef HAS_ARGBCOPYALPHAROW_AVX2 -ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) -#endif -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) -#endif -#ifdef HAS_ARGBCOPYALPHAROW_MMI -ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1) -#endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 -ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) -#endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) -#endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI -ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7) -#endif -#undef ANY11B - -// Any 1 to 1 with parameter. 
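The ANY11-family wrappers above all use the same remainder-handling trick: run the SIMD kernel on the largest multiple of its step, then route the last few pixels through a padded temp buffer so the kernel never reads or writes past the end of the row. A minimal sketch follows, assuming a hypothetical 16-pixel kernel named ExampleRow_SIMD; only SIMD_ALIGNED, memcpy and memset come from the surrounding file.

void ExampleRow_Any(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {
  SIMD_ALIGNED(uint8_t temp[64 * 2]);
  memset(temp, 0, 64);                    /* keep msan quiet, as above */
  int r = width & 15;                     /* leftover pixels */
  int n = width & ~15;                    /* largest multiple of 16 */
  if (n > 0) {
    ExampleRow_SIMD(src_ptr, dst_ptr, n); /* bulk of the row at full speed */
  }
  memcpy(temp, src_ptr + n, r);           /* stage the tail in padded storage */
  ExampleRow_SIMD(temp, temp + 64, 16);   /* one full-width pass over the tail */
  memcpy(dst_ptr + n, temp + 64, r);      /* copy back only the r valid pixels */
}
/* The parameterized wrappers defined below follow the same split, forwarding
 * one extra argument to the kernel. */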
-#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, param, n); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp, temp + 64, param, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ - } - -#if defined(HAS_I400TOARGBROW_SSE2) -ANY11P(I400ToARGBRow_Any_SSE2, - I400ToARGBRow_SSE2, - const struct YuvConstants*, - 1, - 4, - 7) -#endif -#if defined(HAS_I400TOARGBROW_AVX2) -ANY11P(I400ToARGBRow_Any_AVX2, - I400ToARGBRow_AVX2, - const struct YuvConstants*, - 1, - 4, - 15) -#endif -#if defined(HAS_I400TOARGBROW_NEON) -ANY11P(I400ToARGBRow_Any_NEON, - I400ToARGBRow_NEON, - const struct YuvConstants*, - 1, - 4, - 7) -#endif -#if defined(HAS_I400TOARGBROW_MSA) -ANY11P(I400ToARGBRow_Any_MSA, - I400ToARGBRow_MSA, - const struct YuvConstants*, - 1, - 4, - 15) -#endif -#if defined(HAS_I400TOARGBROW_MMI) -ANY11P(I400ToARGBRow_Any_MMI, - I400ToARGBRow_MMI, - const struct YuvConstants*, - 1, - 4, - 7) -#endif - -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) -ANY11P(ARGBToRGB565DitherRow_Any_SSE2, - ARGBToRGB565DitherRow_SSE2, - const uint32_t, - 4, - 2, - 3) -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) -ANY11P(ARGBToRGB565DitherRow_Any_AVX2, - ARGBToRGB565DitherRow_AVX2, - const uint32_t, - 4, - 2, - 7) -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_NEON) -ANY11P(ARGBToRGB565DitherRow_Any_NEON, - ARGBToRGB565DitherRow_NEON, - const uint32_t, - 4, - 2, - 7) -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MSA) -ANY11P(ARGBToRGB565DitherRow_Any_MSA, - ARGBToRGB565DitherRow_MSA, - const uint32_t, - 4, - 2, - 7) -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MMI) -ANY11P(ARGBToRGB565DitherRow_Any_MMI, - ARGBToRGB565DitherRow_MMI, - const uint32_t, - 4, - 2, - 3) -#endif -#ifdef HAS_ARGBSHUFFLEROW_SSSE3 -ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) -#endif -#ifdef HAS_ARGBSHUFFLEROW_AVX2 -ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) -#endif -#ifdef HAS_ARGBSHUFFLEROW_NEON -ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) -#endif -#ifdef HAS_ARGBSHUFFLEROW_MSA -ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) -#endif -#ifdef HAS_ARGBSHUFFLEROW_MMI -ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1) -#endif -#undef ANY11P -#undef ANY11P - -// Any 1 to 1 with type -#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ - void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[(MASK + 1) * SBPP]); \ - SIMD_ALIGNED(uint8_t out[(MASK + 1) * BPP]); \ - memset(temp, 0, (MASK + 1) * SBPP); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ - ANY_SIMD((STYPE*)temp, (DTYPE*)out, MASK + 1); \ - memcpy((uint8_t*)(dst_ptr) + n * BPP, out, r * BPP); \ - } - -#ifdef HAS_ARGBTOAR64ROW_SSSE3 -ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) -#endif - -#ifdef HAS_ARGBTOAB64ROW_SSSE3 -ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) -#endif - -#ifdef HAS_AR64TOARGBROW_SSSE3 -ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, 
uint16_t, uint8_t, 3) -#endif - -#ifdef HAS_ARGBTOAR64ROW_SSSE3 -ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) -#endif - -#ifdef HAS_ARGBTOAR64ROW_AVX2 -ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) -#endif - -#ifdef HAS_ARGBTOAB64ROW_AVX2 -ANY11T(ARGBToAB64Row_Any_AVX2, ARGBToAB64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) -#endif - -#ifdef HAS_AR64TOARGBROW_AVX2 -ANY11T(AR64ToARGBRow_Any_AVX2, AR64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) -#endif - -#ifdef HAS_ARGBTOAR64ROW_AVX2 -ANY11T(AB64ToARGBRow_Any_AVX2, AB64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) -#endif - -#ifdef HAS_ARGBTOAR64ROW_NEON -ANY11T(ARGBToAR64Row_Any_NEON, ARGBToAR64Row_NEON, 4, 8, uint8_t, uint16_t, 7) -#endif - -#ifdef HAS_ARGBTOAB64ROW_NEON -ANY11T(ARGBToAB64Row_Any_NEON, ARGBToAB64Row_NEON, 4, 8, uint8_t, uint16_t, 7) -#endif - -#ifdef HAS_AR64TOARGBROW_NEON -ANY11T(AR64ToARGBRow_Any_NEON, AR64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) -#endif - -#ifdef HAS_ARGBTOAR64ROW_NEON -ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) -#endif - -#undef ANY11T - -// Any 1 to 1 with parameter and shorts. BPP measures in shorts. -#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ - void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ - SIMD_ALIGNED(STYPE temp[32]); \ - SIMD_ALIGNED(DTYPE out[32]); \ - memset(temp, 0, 32 * SBPP); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, scale, n); \ - } \ - memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, out, scale, MASK + 1); \ - memcpy(dst_ptr + n, out, r * BPP); \ - } - -#ifdef HAS_CONVERT16TO8ROW_SSSE3 -ANY11C(Convert16To8Row_Any_SSSE3, - Convert16To8Row_SSSE3, - 2, - 1, - uint16_t, - uint8_t, - 15) -#endif -#ifdef HAS_CONVERT16TO8ROW_AVX2 -ANY11C(Convert16To8Row_Any_AVX2, - Convert16To8Row_AVX2, - 2, - 1, - uint16_t, - uint8_t, - 31) -#endif -#ifdef HAS_CONVERT8TO16ROW_SSE2 -ANY11C(Convert8To16Row_Any_SSE2, - Convert8To16Row_SSE2, - 1, - 2, - uint8_t, - uint16_t, - 15) -#endif -#ifdef HAS_CONVERT8TO16ROW_AVX2 -ANY11C(Convert8To16Row_Any_AVX2, - Convert8To16Row_AVX2, - 1, - 2, - uint8_t, - uint16_t, - 31) -#endif -#ifdef HAS_MULTIPLYROW_16_AVX2 -ANY11C(MultiplyRow_16_Any_AVX2, - MultiplyRow_16_AVX2, - 2, - 2, - uint16_t, - uint16_t, - 31) -#endif -#ifdef HAS_MULTIPLYROW_16_NEON -ANY11C(MultiplyRow_16_Any_NEON, - MultiplyRow_16_NEON, - 2, - 2, - uint16_t, - uint16_t, - 15) -#endif -#ifdef HAS_DIVIDEROW_16_AVX2 -ANY11C(DivideRow_16_Any_AVX2, DivideRow_16_AVX2, 2, 2, uint16_t, uint16_t, 31) -#endif -#ifdef HAS_DIVIDEROW_16_NEON -ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) -#endif -#undef ANY11C - -// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. 
-#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ - void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ - SIMD_ALIGNED(ST temp[32]); \ - SIMD_ALIGNED(T out[32]); \ - memset(temp, 0, SBPP * 32); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, param, n); \ - } \ - memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, out, param, MASK + 1); \ - memcpy(dst_ptr + n, out, r * BPP); \ - } - -#ifdef HAS_HALFFLOATROW_SSE2 -ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) -#endif -#ifdef HAS_HALFFLOATROW_AVX2 -ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) -#endif -#ifdef HAS_HALFFLOATROW_F16C -ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15) -ANY11P16(HalfFloat1Row_Any_F16C, - HalfFloat1Row_F16C, - uint16_t, - uint16_t, - 2, - 2, - 15) -#endif -#ifdef HAS_HALFFLOATROW_NEON -ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7) -ANY11P16(HalfFloat1Row_Any_NEON, - HalfFloat1Row_NEON, - uint16_t, - uint16_t, - 2, - 2, - 7) -#endif -#ifdef HAS_HALFFLOATROW_MSA -ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31) -#endif -#ifdef HAS_BYTETOFLOATROW_NEON -ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7) -#endif -#undef ANY11P16 - -// Any 1 to 1 with yuvconstants -#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } -#if defined(HAS_YUY2TOARGBROW_SSSE3) -ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) -ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) -#endif -#if defined(HAS_YUY2TOARGBROW_AVX2) -ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) -ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) -#endif -#if defined(HAS_YUY2TOARGBROW_NEON) -ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) -ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) -#endif -#if defined(HAS_YUY2TOARGBROW_MSA) -ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7) -ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) -#endif -#if defined(HAS_YUY2TOARGBROW_MMI) -ANY11C(YUY2ToARGBRow_Any_MMI, YUY2ToARGBRow_MMI, 1, 4, 4, 7) -ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7) -#endif -#undef ANY11C - -// Any 1 to 1 interpolate. Takes 2 rows of source via stride. 
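For context, these _Any_ wrappers are what the higher-level converters select when the image width is not a multiple of the SIMD step. The fragment below sketches the usual dispatch, using the YUY2ToARGBRow variants instantiated above; the call site itself is assumed rather than quoted from this tree, while TestCpuFlag, kCpuHasSSSE3 and IS_ALIGNED are existing libyuv helpers.

/* Inside a conversion routine such as YUY2ToARGB (illustrative): */
void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
                      const struct YuvConstants* yuvconstants, int width) =
    YUY2ToARGBRow_C;
#if defined(HAS_YUY2TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
  YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; /* safe for any width */
  if (IS_ALIGNED(width, 16)) {
    YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;   /* full-SIMD path when width % 16 == 0 */
  }
}
#endif
/* The interpolate wrapper defined next takes two source rows via a stride but
 * is dispatched the same way. */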
-#define ANY11I(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, \ - int width, int source_y_fraction) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } - -#ifdef HAS_INTERPOLATEROW_AVX2 -ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) -#endif -#ifdef HAS_INTERPOLATEROW_SSSE3 -ANY11I(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) -#endif -#ifdef HAS_INTERPOLATEROW_NEON -ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) -#endif -#ifdef HAS_INTERPOLATEROW_MSA -ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) -#endif -#ifdef HAS_INTERPOLATEROW_MMI -ANY11I(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7) -#endif -#undef ANY11I - -// Any 1 to 1 mirror. -#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr, r* BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ - } - -#ifdef HAS_MIRRORROW_AVX2 -ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) -#endif -#ifdef HAS_MIRRORROW_SSSE3 -ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) -#endif -#ifdef HAS_MIRRORROW_NEON -ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) -#endif -#ifdef HAS_MIRRORROW_MSA -ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) -#endif -#ifdef HAS_MIRRORROW_MMI -ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7) -#endif -#ifdef HAS_MIRRORUVROW_AVX2 -ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) -#endif -#ifdef HAS_MIRRORUVROW_SSSE3 -ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) -#endif -#ifdef HAS_MIRRORUVROW_NEON -ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) -#endif -#ifdef HAS_MIRRORUVROW_MSA -ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7) -#endif -#ifdef HAS_ARGBMIRRORROW_AVX2 -ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) -#endif -#ifdef HAS_ARGBMIRRORROW_SSE2 -ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) -#endif -#ifdef HAS_ARGBMIRRORROW_NEON -ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) -#endif -#ifdef HAS_ARGBMIRRORROW_MSA -ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) -#endif -#ifdef HAS_ARGBMIRRORROW_MMI -ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1) -#endif -#ifdef HAS_RGB24MIRRORROW_SSSE3 -ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15) -#endif -#ifdef HAS_RGB24MIRRORROW_NEON -ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) -#endif -#undef ANY11M - -// Any 1 plane. 
(memset) -#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ - SIMD_ALIGNED(uint8_t temp[64]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, v32, n); \ - } \ - ANY_SIMD(temp, v32, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp, r * BPP); \ - } - -#ifdef HAS_SETROW_X86 -ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3) -#endif -#ifdef HAS_SETROW_NEON -ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15) -#endif -#ifdef HAS_ARGBSETROW_NEON -ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3) -#endif -#ifdef HAS_ARGBSETROW_MSA -ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3) -#endif -#ifdef HAS_ARGBSETROW_MMI -ANY1(ARGBSetRow_Any_MMI, ARGBSetRow_MMI, uint32_t, 4, 3) -#endif -#undef ANY1 - -// Any 1 to 2. Outputs UV planes. -#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ - memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ - } - -#ifdef HAS_SPLITUVROW_SSE2 -ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) -#endif -#ifdef HAS_SPLITUVROW_AVX2 -ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) -#endif -#ifdef HAS_SPLITUVROW_NEON -ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) -#endif -#ifdef HAS_SPLITUVROW_MSA -ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) -#endif -#ifdef HAS_SPLITUVROW_MMI -ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7) -#endif -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) -#endif -#ifdef HAS_YUY2TOUV422ROW_AVX2 -ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) -ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) -#endif -#ifdef HAS_YUY2TOUV422ROW_SSE2 -ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15) -ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) -#endif -#ifdef HAS_YUY2TOUV422ROW_NEON -ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) -ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) -ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) -#endif -#ifdef HAS_YUY2TOUV422ROW_MSA -ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) -ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) -ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) -#endif -#ifdef HAS_YUY2TOUV422ROW_MMI -ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7) -ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15) -ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15) -#endif -#undef ANY12 - -// Any 2 16 bit planes with parameter to 1 -#define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ - void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_uv, dst_u, dst_v, depth, n); \ - } \ - memcpy(temp, src_uv + n * 2, r * BPP * 2); \ 
- ANY_SIMD(temp, temp + 32, temp + 48, depth, MASK + 1); \ - memcpy(dst_u + n, temp + 32, r * BPP); \ - memcpy(dst_v + n, temp + 48, r * BPP); \ - } - -#ifdef HAS_SPLITUVROW_16_AVX2 -ANY12PT(SplitUVRow_16_Any_AVX2, SplitUVRow_16_AVX2, uint16_t, 2, 15) -#endif - -#ifdef HAS_SPLITUVROW_16_NEON -ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7) -#endif - -#undef ANY21CT - -// Any 1 to 3. Outputs RGB planes. -#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ - uint8_t* dst_b, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 6]); \ - memset(temp, 0, 16 * 3); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ - } \ - memcpy(temp, src_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ - memcpy(dst_r + n, temp + 16 * 3, r); \ - memcpy(dst_g + n, temp + 16 * 4, r); \ - memcpy(dst_b + n, temp + 16 * 5, r); \ - } - -#ifdef HAS_SPLITRGBROW_SSSE3 -ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) -#endif -#ifdef HAS_SPLITRGBROW_NEON -ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) -#endif -#ifdef HAS_SPLITRGBROW_MMI -ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) -#endif -#ifdef HAS_SPLITXRGBROW_SSE2 -ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7) -#endif -#ifdef HAS_SPLITXRGBROW_SSSE3 -ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7) -#endif -#ifdef HAS_SPLITXRGBROW_AVX2 -ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15) -#endif -#ifdef HAS_SPLITXRGBROW_NEON -ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) -#endif - -// Any 1 to 4. Outputs ARGB planes. -#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ - uint8_t* dst_b, uint8_t* dst_a, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 8]); \ - memset(temp, 0, 16 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \ - } \ - memcpy(temp, src_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 16 * 4, temp + 16 * 5, temp + 16 * 6, temp + 16 * 7, \ - MASK + 1); \ - memcpy(dst_r + n, temp + 16 * 4, r); \ - memcpy(dst_g + n, temp + 16 * 5, r); \ - memcpy(dst_b + n, temp + 16 * 6, r); \ - memcpy(dst_a + n, temp + 16 * 7, r); \ - } - -#ifdef HAS_SPLITARGBROW_SSE2 -ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7) -#endif -#ifdef HAS_SPLITARGBROW_SSSE3 -ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7) -#endif -#ifdef HAS_SPLITARGBROW_AVX2 -ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15) -#endif -#ifdef HAS_SPLITARGBROW_NEON -ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) -#endif - -// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. -// 128 byte row allows for 32 avx ARGB pixels. 
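The stride-based splitters defined next emit one U and one V sample per 2x2 block, so the leftover width has to be rounded up to whole subsample groups. A small worked example follows; SS_DEMO stands in for the SS() helper defined near the top of this file, which is assumed to perform the same round-up.

#define SS_DEMO(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
/* ARGBToUVRow_Any_SSSE3 (UVSHIFT = 0, BPP = 4, MASK = 15) with width = 29: */
/*   n = 16 full pixels, r = 13 leftover, so 13 * 4 source bytes are staged */
/*   from each of the two rows and SS_DEMO(13, 1) = 7 bytes are written to  */
/*   dst_u and to dst_v.                                                    */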
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ - uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 4]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ - SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ - memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ - BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ - memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ - memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ - } - -#ifdef HAS_ARGBTOUVROW_AVX2 -ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) -#endif -#ifdef HAS_ABGRTOUVROW_AVX2 -ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) -#endif -#ifdef HAS_ARGBTOUVJROW_AVX2 -ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) -#endif -#ifdef HAS_ARGBTOUVROW_SSSE3 -ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) -ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) -ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) -ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) -ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) -#endif -#ifdef HAS_YUY2TOUVROW_AVX2 -ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31) -ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31) -#endif -#ifdef HAS_YUY2TOUVROW_SSE2 -ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15) -ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) -#endif -#ifdef HAS_ARGBTOUVROW_NEON -ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_ARGBTOUVROW_MSA -ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) -#endif -#ifdef HAS_ARGBTOUVROW_MMI -ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15) -#endif -#ifdef HAS_ARGBTOUVJROW_NEON -ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_ARGBTOUVJROW_MSA -ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) -#endif -#ifdef HAS_ARGBTOUVJROW_MMI -ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15) -#endif -#ifdef HAS_BGRATOUVROW_NEON -ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_BGRATOUVROW_MSA -ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15) -#endif -#ifdef HAS_BGRATOUVROW_MMI -ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15) -#endif -#ifdef HAS_ABGRTOUVROW_NEON -ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_ABGRTOUVROW_MSA -ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15) -#endif -#ifdef HAS_ABGRTOUVROW_MMI -ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15) -#endif -#ifdef HAS_RGBATOUVROW_NEON -ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_RGBATOUVROW_MSA -ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15) -#endif -#ifdef HAS_RGBATOUVROW_MMI -ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15) -#endif -#ifdef HAS_RGB24TOUVROW_NEON -ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) -#endif -#ifdef HAS_RGB24TOUVROW_MSA -ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) -#endif -#ifdef HAS_RGB24TOUVROW_MMI 
-ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15) -#endif -#ifdef HAS_RAWTOUVROW_NEON -ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) -#endif -#ifdef HAS_RAWTOUVROW_MSA -ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) -#endif -#ifdef HAS_RAWTOUVROW_MMI -ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15) -#endif -#ifdef HAS_RGB565TOUVROW_NEON -ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) -#endif -#ifdef HAS_RGB565TOUVROW_MSA -ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) -#endif -#ifdef HAS_RGB565TOUVROW_MMI -ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15) -#endif -#ifdef HAS_ARGB1555TOUVROW_NEON -ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) -#endif -#ifdef HAS_ARGB1555TOUVROW_MSA -ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) -#endif -#ifdef HAS_ARGB1555TOUVROW_MMI -ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15) -#endif -#ifdef HAS_ARGB4444TOUVROW_NEON -ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) -#endif -#ifdef HAS_ARGB4444TOUVROW_MMI -ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15) -#endif -#ifdef HAS_YUY2TOUVROW_NEON -ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) -#endif -#ifdef HAS_UYVYTOUVROW_NEON -ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) -#endif -#ifdef HAS_YUY2TOUVROW_MSA -ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) -#endif -#ifdef HAS_YUY2TOUVROW_MMI -ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15) -#endif -#ifdef HAS_UYVYTOUVROW_MSA -ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) -#endif -#ifdef HAS_UYVYTOUVROW_MMI -ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) -#endif -#undef ANY12S - -// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane. -// 128 byte row allows for 32 avx ARGB pixels. -#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride, dst_vu, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ - SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ - memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ - BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, 128, temp + 256, MASK + 1); \ - memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \ - } - -#ifdef HAS_AYUVTOVUROW_NEON -ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15) -ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) -#endif -#undef ANY11S - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_common.cc b/thirdparty/libyuv/source/row_common.cc deleted file mode 100644 index 4d0dce2..0000000 --- a/thirdparty/libyuv/source/row_common.cc +++ /dev/null @@ -1,4212 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#include <assert.h> -#include <stdio.h> -#include <string.h> // For memcpy and memset. - -#include "libyuv/basic_types.h" -#include "libyuv/convert_argb.h" // For kYuvI601Constants - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This macro control YUV to RGB using unsigned math to extend range of -// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: -// LIBYUV_UNLIMITED_DATA - -// The following macro from row_win makes the C code match the row_win code, -// which is 7 bit fixed point for ARGBToI420: -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) -#define LIBYUV_RGB7 1 -#endif - -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ - defined(_M_IX86) -#define LIBYUV_ARGBTOUV_PAVGB 1 -#define LIBYUV_RGBTOU_TRUNCATE 1 -#endif - -// llvm x86 is poor at ternary operator, so use branchless min/max. - -#define USE_BRANCHLESS 1 -#if USE_BRANCHLESS -static __inline int32_t clamp0(int32_t v) { - return -(v >= 0) & v; -} -// TODO(fbarchard): make clamp255 preserve negative values. -static __inline int32_t clamp255(int32_t v) { - return (-(v >= 255) | v) & 255; -} - -static __inline int32_t clamp1023(int32_t v) { - return (-(v >= 1023) | v) & 1023; -} - -// clamp to max -static __inline int32_t ClampMax(int32_t v, int32_t max) { - return (-(v >= max) | v) & max; -} - -static __inline uint32_t Abs(int32_t v) { - int m = -(v < 0); - return (v + m) ^ m; -} -#else // USE_BRANCHLESS -static __inline int32_t clamp0(int32_t v) { - return (v < 0) ? 0 : v; -} - -static __inline int32_t clamp255(int32_t v) { - return (v > 255) ? 255 : v; -} - -static __inline int32_t clamp1023(int32_t v) { - return (v > 1023) ? 1023 : v; -} - -static __inline int32_t ClampMax(int32_t v, int32_t max) { - return (v > max) ? max : v; -} - -static __inline uint32_t Abs(int32_t v) { - return (v < 0) ? 
-v : v; -} -#endif // USE_BRANCHLESS -static __inline uint32_t Clamp(int32_t val) { - int v = clamp0(val); - return (uint32_t)(clamp255(v)); -} - -static __inline uint32_t Clamp10(int32_t val) { - int v = clamp0(val); - return (uint32_t)(clamp1023(v)); -} - -// Little Endian -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ - defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ - (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -#define WRITEWORD(p, v) *(uint32_t*)(p) = v -#else -static inline void WRITEWORD(uint8_t* p, uint32_t v) { - p[0] = (uint8_t)(v & 255); - p[1] = (uint8_t)((v >> 8) & 255); - p[2] = (uint8_t)((v >> 16) & 255); - p[3] = (uint8_t)((v >> 24) & 255); -} -#endif - -void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_rgb24[0]; - uint8_t g = src_rgb24[1]; - uint8_t r = src_rgb24[2]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = 255u; - dst_argb += 4; - src_rgb24 += 3; - } -} - -void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t r = src_raw[0]; - uint8_t g = src_raw[1]; - uint8_t b = src_raw[2]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = 255u; - dst_argb += 4; - src_raw += 3; - } -} - -void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t r = src_raw[0]; - uint8_t g = src_raw[1]; - uint8_t b = src_raw[2]; - dst_rgba[0] = 255u; - dst_rgba[1] = b; - dst_rgba[2] = g; - dst_rgba[3] = r; - dst_rgba += 4; - src_raw += 3; - } -} - -void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t r = src_raw[0]; - uint8_t g = src_raw[1]; - uint8_t b = src_raw[2]; - dst_rgb24[0] = b; - dst_rgb24[1] = g; - dst_rgb24[2] = r; - dst_rgb24 += 3; - src_raw += 3; - } -} - -void RGB565ToARGBRow_C(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r = src_rgb565[1] >> 3; - dst_argb[0] = (b << 3) | (b >> 2); - dst_argb[1] = (g << 2) | (g >> 4); - dst_argb[2] = (r << 3) | (r >> 2); - dst_argb[3] = 255u; - dst_argb += 4; - src_rgb565 += 2; - } -} - -void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb1555[0] & 0x1f; - uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r = (src_argb1555[1] & 0x7c) >> 2; - uint8_t a = src_argb1555[1] >> 7; - dst_argb[0] = (b << 3) | (b >> 2); - dst_argb[1] = (g << 3) | (g >> 2); - dst_argb[2] = (r << 3) | (r >> 2); - dst_argb[3] = -a; - dst_argb += 4; - src_argb1555 += 2; - } -} - -void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb4444[0] & 0x0f; - uint8_t g = src_argb4444[0] >> 4; - uint8_t r = src_argb4444[1] & 0x0f; - uint8_t a = src_argb4444[1] >> 4; - dst_argb[0] = (b << 4) | b; - dst_argb[1] = (g << 4) | g; - dst_argb[2] = (r << 4) | r; - dst_argb[3] = (a << 4) | a; - dst_argb += 4; - src_argb4444 += 2; - } -} - -void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint32_t ar30; - memcpy(&ar30, 
src_ar30, sizeof ar30); - uint32_t b = (ar30 >> 2) & 0xff; - uint32_t g = (ar30 >> 12) & 0xff; - uint32_t r = (ar30 >> 22) & 0xff; - uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. - *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24); - dst_argb += 4; - src_ar30 += 4; - } -} - -void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { - int x; - for (x = 0; x < width; ++x) { - uint32_t ar30; - memcpy(&ar30, src_ar30, sizeof ar30); - uint32_t b = (ar30 >> 2) & 0xff; - uint32_t g = (ar30 >> 12) & 0xff; - uint32_t r = (ar30 >> 22) & 0xff; - uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. - *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24); - dst_abgr += 4; - src_ar30 += 4; - } -} - -void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) { - int x; - for (x = 0; x < width; ++x) { - uint32_t ar30; - memcpy(&ar30, src_ar30, sizeof ar30); - uint32_t b = ar30 & 0x3ff; - uint32_t ga = ar30 & 0xc00ffc00; - uint32_t r = (ar30 >> 20) & 0x3ff; - *(uint32_t*)(dst_ab30) = r | ga | (b << 20); - dst_ab30 += 4; - src_ar30 += 4; - } -} - -void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb[0]; - uint8_t g = src_argb[1]; - uint8_t r = src_argb[2]; - dst_rgb[0] = b; - dst_rgb[1] = g; - dst_rgb[2] = r; - dst_rgb += 3; - src_argb += 4; - } -} - -void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb[0]; - uint8_t g = src_argb[1]; - uint8_t r = src_argb[2]; - dst_rgb[0] = r; - dst_rgb[1] = g; - dst_rgb[2] = b; - dst_rgb += 3; - src_argb += 4; - } -} - -void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb[0] >> 3; - uint8_t g0 = src_argb[1] >> 2; - uint8_t r0 = src_argb[2] >> 3; - uint8_t b1 = src_argb[4] >> 3; - uint8_t g1 = src_argb[5] >> 2; - uint8_t r1 = src_argb[6] >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | - (r1 << 27)); - dst_rgb += 4; - src_argb += 8; - } - if (width & 1) { - uint8_t b0 = src_argb[0] >> 3; - uint8_t g0 = src_argb[1] >> 2; - uint8_t r0 = src_argb[2] >> 3; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); - } -} - -// dither4 is a row of 4 values from 4x4 dither matrix. -// The 4x4 matrix contains values to increase RGB. When converting to -// fewer bits (565) this provides an ordered dither. -// The order in the 4x4 matrix in first byte is upper left. -// The 4 values are passed as an int, then referenced as an array, so -// endian will not affect order of the original matrix. But the dither4 -// will containing the first pixel in the lower byte for little endian -// or the upper byte for big endian. 
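A minimal sketch, not taken from the file, of how a single pixel picks up its dither value before the 565 truncation done by ARGBToRGB565DitherRow_C below. The packed matrix row in the usage note is a made-up example value, and clamp255 is the helper defined earlier in this file.

static uint16_t DitherPixelTo565(const uint8_t* argb, uint32_t dither4, int x) {
  int d = ((const unsigned char*)&dither4)[x & 3]; /* matrix column for this x */
  int b = clamp255(argb[0] + d) >> 3;              /* 8 -> 5 bits after dithering */
  int g = clamp255(argb[1] + d) >> 2;              /* 8 -> 6 bits */
  int r = clamp255(argb[2] + d) >> 3;              /* 8 -> 5 bits */
  return (uint16_t)(b | (g << 5) | (r << 11));
}
/* Usage: dither4 = 0x00020406 applies +6, +4, +2, +0 to x & 3 == 0, 1, 2, 3
 * on a little-endian machine. */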
-void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - int dither0 = ((const unsigned char*)(&dither4))[x & 3]; - int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; - uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; - uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; - uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; - uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | - (r1 << 27)); - dst_rgb += 4; - src_argb += 8; - } - if (width & 1) { - int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; - uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); - } -} - -void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb[0] >> 3; - uint8_t g0 = src_argb[1] >> 3; - uint8_t r0 = src_argb[2] >> 3; - uint8_t a0 = src_argb[3] >> 7; - uint8_t b1 = src_argb[4] >> 3; - uint8_t g1 = src_argb[5] >> 3; - uint8_t r1 = src_argb[6] >> 3; - uint8_t a1 = src_argb[7] >> 7; - *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); - dst_rgb += 4; - src_argb += 8; - } - if (width & 1) { - uint8_t b0 = src_argb[0] >> 3; - uint8_t g0 = src_argb[1] >> 3; - uint8_t r0 = src_argb[2] >> 3; - uint8_t a0 = src_argb[3] >> 7; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); - } -} - -void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb[0] >> 4; - uint8_t g0 = src_argb[1] >> 4; - uint8_t r0 = src_argb[2] >> 4; - uint8_t a0 = src_argb[3] >> 4; - uint8_t b1 = src_argb[4] >> 4; - uint8_t g1 = src_argb[5] >> 4; - uint8_t r1 = src_argb[6] >> 4; - uint8_t a1 = src_argb[7] >> 4; - *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | - (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); - dst_rgb += 4; - src_argb += 8; - } - if (width & 1) { - uint8_t b0 = src_argb[0] >> 4; - uint8_t g0 = src_argb[1] >> 4; - uint8_t r0 = src_argb[2] >> 4; - uint8_t a0 = src_argb[3] >> 4; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); - } -} - -void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { - int x; - for (x = 0; x < width; ++x) { - uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); - uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); - uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); - uint32_t a0 = (src_abgr[3] >> 6); - *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); - dst_ar30 += 4; - src_abgr += 4; - } -} - -void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { - int x; - for (x = 0; x < width; ++x) { - uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2); - uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); - uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); - uint32_t a0 = (src_argb[3] >> 6); - *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); - dst_ar30 += 4; - src_argb += 4; - } 
-} - -void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - int x; - for (x = 0; x < width; ++x) { - dst_ar64[0] = src_argb[0] * 0x0101; - dst_ar64[1] = src_argb[1] * 0x0101; - dst_ar64[2] = src_argb[2] * 0x0101; - dst_ar64[3] = src_argb[3] * 0x0101; - dst_ar64 += 4; - src_argb += 4; - } -} - -void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - int x; - for (x = 0; x < width; ++x) { - dst_ab64[0] = src_argb[2] * 0x0101; - dst_ab64[1] = src_argb[1] * 0x0101; - dst_ab64[2] = src_argb[0] * 0x0101; - dst_ab64[3] = src_argb[3] * 0x0101; - dst_ab64 += 4; - src_argb += 4; - } -} - -void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - dst_argb[0] = src_ar64[0] >> 8; - dst_argb[1] = src_ar64[1] >> 8; - dst_argb[2] = src_ar64[2] >> 8; - dst_argb[3] = src_ar64[3] >> 8; - dst_argb += 4; - src_ar64 += 4; - } -} - -void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - dst_argb[0] = src_ab64[2] >> 8; - dst_argb[1] = src_ab64[1] >> 8; - dst_argb[2] = src_ab64[0] >> 8; - dst_argb[3] = src_ab64[3] >> 8; - dst_argb += 4; - src_ab64 += 4; - } -} - -// TODO(fbarchard): Make shuffle compatible with SIMD versions -void AR64ShuffleRow_C(const uint8_t* src_ar64, - uint8_t* dst_ar64, - const uint8_t* shuffler, - int width) { - const uint16_t* src_ar64_16 = (const uint16_t*)src_ar64; - uint16_t* dst_ar64_16 = (uint16_t*)dst_ar64; - int index0 = shuffler[0] / 2; - int index1 = shuffler[2] / 2; - int index2 = shuffler[4] / 2; - int index3 = shuffler[6] / 2; - // Shuffle a row of AR64. - int x; - for (x = 0; x < width / 2; ++x) { - // To support in-place conversion. - uint16_t b = src_ar64_16[index0]; - uint16_t g = src_ar64_16[index1]; - uint16_t r = src_ar64_16[index2]; - uint16_t a = src_ar64_16[index3]; - dst_ar64_16[0] = b; - dst_ar64_16[1] = g; - dst_ar64_16[2] = r; - dst_ar64_16[3] = a; - src_ar64_16 += 4; - dst_ar64_16 += 4; - } -} - -#ifdef LIBYUV_RGB7 -// Old 7 bit math for compatibility on unsupported platforms. -static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return ((33 * r + 65 * g + 13 * b) >> 7) + 16; -} -#else -// 8 bit -// Intel SSE/AVX uses the following equivalent formula -// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round. 
-// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + -// 0x7e80) >> 8; - -static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; -} -#endif - -#define AVGB(a, b) (((a) + (b) + 1) >> 1) - -#ifdef LIBYUV_RGBTOU_TRUNCATE -static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return (112 * b - 74 * g - 38 * r + 0x8000) >> 8; -} -static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return (112 * r - 94 * g - 18 * b + 0x8000) >> 8; -} -#else -// TODO(fbarchard): Add rounding to SIMD and use this -static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; -} -static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; -} -#endif - -#if !defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { - return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8; -} -static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { - return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8; -} -#endif - -// ARGBToY_C and ARGBToUV_C -// Intel version mimic SSE/AVX which does 2 pavgb -#if LIBYUV_ARGBTOUV_PAVGB - -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ - AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ - AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ - AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ - } -#else -// ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - 
uint16_t ab = src_rgb[B] + src_rgb1[B]; \ - uint16_t ag = src_rgb[G] + src_rgb1[G]; \ - uint16_t ar = src_rgb[R] + src_rgb1[R]; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - } \ - } -#endif - -MAKEROWY(ARGB, 2, 1, 0, 4) -MAKEROWY(BGRA, 1, 2, 3, 4) -MAKEROWY(ABGR, 0, 1, 2, 4) -MAKEROWY(RGBA, 3, 2, 1, 4) -MAKEROWY(RGB24, 2, 1, 0, 3) -MAKEROWY(RAW, 0, 1, 2, 3) -#undef MAKEROWY - -// JPeg uses a variation on BT.601-1 full range -// y = 0.29900 * r + 0.58700 * g + 0.11400 * b -// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center -// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center -// BT.601 Mpeg range uses: -// b 0.1016 * 255 = 25.908 = 25 -// g 0.5078 * 255 = 129.489 = 129 -// r 0.2578 * 255 = 65.739 = 66 -// JPeg 7 bit Y (deprecated) -// b 0.11400 * 128 = 14.592 = 15 -// g 0.58700 * 128 = 75.136 = 75 -// r 0.29900 * 128 = 38.272 = 38 -// JPeg 8 bit Y: -// b 0.11400 * 256 = 29.184 = 29 -// g 0.58700 * 256 = 150.272 = 150 -// r 0.29900 * 256 = 76.544 = 77 -// JPeg 8 bit U: -// b 0.50000 * 255 = 127.5 = 127 -// g -0.33126 * 255 = -84.4713 = -84 -// r -0.16874 * 255 = -43.0287 = -43 -// JPeg 8 bit V: -// b -0.08131 * 255 = -20.73405 = -20 -// g -0.41869 * 255 = -106.76595 = -107 -// r 0.50000 * 255 = 127.5 = 127 - -#ifdef LIBYUV_RGB7 -// Old 7 bit math for compatibility on unsupported platforms. -static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { - return (38 * r + 75 * g + 15 * b + 64) >> 7; -} -#else -// 8 bit -static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { - return (77 * r + 150 * g + 29 * b + 128) >> 8; -} -#endif - -#if defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { - return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; -} -static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { - return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; -} -#else -static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { - return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; -} -static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { - return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; -} -#endif - -// ARGBToYJ_C and ARGBToUVJ_C -// Intel version mimic SSE/AVX which does 2 pavgb -#if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ - AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ - AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ - AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ - } -#else -// ARM version does sum / 2 then multiply by 
2x smaller coefficients -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = (src_rgb[B] + src_rgb1[B]); \ - uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ - uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - } \ - } - -#endif - -MAKEROWYJ(ARGB, 2, 1, 0, 4) -MAKEROWYJ(RGBA, 3, 2, 1, 4) -MAKEROWYJ(RGB24, 2, 1, 0, 3) -MAKEROWYJ(RAW, 0, 1, 2, 3) -#undef MAKEROWYJ - -void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r = src_rgb565[1] >> 3; - b = (b << 3) | (b >> 2); - g = (g << 2) | (g >> 4); - r = (r << 3) | (r >> 2); - dst_y[0] = RGBToY(r, g, b); - src_rgb565 += 2; - dst_y += 1; - } -} - -void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb1555[0] & 0x1f; - uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r = (src_argb1555[1] & 0x7c) >> 2; - b = (b << 3) | (b >> 2); - g = (g << 3) | (g >> 2); - r = (r << 3) | (r >> 2); - dst_y[0] = RGBToY(r, g, b); - src_argb1555 += 2; - dst_y += 1; - } -} - -void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb4444[0] & 0x0f; - uint8_t g = src_argb4444[0] >> 4; - uint8_t r = src_argb4444[1] & 0x0f; - b = (b << 4) | b; - g = (g << 4) | g; - r = (r << 4) | r; - dst_y[0] = RGBToY(r, g, b); - src_argb4444 += 2; - dst_y += 1; - } -} - -void RGB565ToUVRow_C(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_rgb565[0] & 0x1f; - uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r0 = src_rgb565[1] >> 3; - uint8_t b1 = src_rgb565[2] & 0x1f; - uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); - uint8_t r1 = src_rgb565[3] >> 3; - uint8_t b2 = next_rgb565[0] & 0x1f; - uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8_t r2 = next_rgb565[1] >> 3; - uint8_t b3 = next_rgb565[2] & 0x1f; - uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); - uint8_t r3 = next_rgb565[3] >> 3; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 2) | (g0 >> 4); - r0 = (r0 << 3) | (r0 >> 2); - b1 = (b1 << 3) | (b1 >> 2); - g1 = (g1 << 2) | (g1 >> 4); 
- r1 = (r1 << 3) | (r1 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 2) | (g2 >> 4); - r2 = (r2 << 3) | (r2 >> 2); - b3 = (b3 << 3) | (b3 >> 2); - g3 = (g3 << 2) | (g3 >> 4); - r3 = (r3 << 3) | (r3 >> 2); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); - uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); - uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; - uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; - uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - - src_rgb565 += 4; - next_rgb565 += 4; - dst_u += 1; - dst_v += 1; - } - if (width & 1) { - uint8_t b0 = src_rgb565[0] & 0x1f; - uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r0 = src_rgb565[1] >> 3; - uint8_t b2 = next_rgb565[0] & 0x1f; - uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8_t r2 = next_rgb565[1] >> 3; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 2) | (g0 >> 4); - r0 = (r0 << 3) | (r0 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 2) | (g2 >> 4); - r2 = (r2 << 3) | (r2 >> 2); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(b0, b2); - uint8_t ag = AVGB(g0, g2); - uint8_t ar = AVGB(r0, r2); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = b0 + b2; - uint16_t g = g0 + g2; - uint16_t r = r0 + r2; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - } -} - -void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb1555[0] & 0x1f; - uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8_t b1 = src_argb1555[2] & 0x1f; - uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); - uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; - uint8_t b2 = next_argb1555[0] & 0x1f; - uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; - uint8_t b3 = next_argb1555[2] & 0x1f; - uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); - uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 3) | (g0 >> 2); - r0 = (r0 << 3) | (r0 >> 2); - b1 = (b1 << 3) | (b1 >> 2); - g1 = (g1 << 3) | (g1 >> 2); - r1 = (r1 << 3) | (r1 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 3) | (g2 >> 2); - r2 = (r2 << 3) | (r2 >> 2); - b3 = (b3 << 3) | (b3 >> 2); - g3 = (g3 << 3) | (g3 >> 2); - r3 = (r3 << 3) | (r3 >> 2); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); - uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); - uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; - uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; - uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - - src_argb1555 += 4; - next_argb1555 += 4; - dst_u += 1; - dst_v += 1; - } - if (width & 1) { - uint8_t b0 = src_argb1555[0] & 0x1f; - uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8_t b2 = 
next_argb1555[0] & 0x1f; - uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8_t r2 = next_argb1555[1] >> 3; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 3) | (g0 >> 2); - r0 = (r0 << 3) | (r0 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 3) | (g2 >> 2); - r2 = (r2 << 3) | (r2 >> 2); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(b0, b2); - uint8_t ag = AVGB(g0, g2); - uint8_t ar = AVGB(r0, r2); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = b0 + b2; - uint16_t g = g0 + g2; - uint16_t r = r0 + r2; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - } -} - -void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444; - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb4444[0] & 0x0f; - uint8_t g0 = src_argb4444[0] >> 4; - uint8_t r0 = src_argb4444[1] & 0x0f; - uint8_t b1 = src_argb4444[2] & 0x0f; - uint8_t g1 = src_argb4444[2] >> 4; - uint8_t r1 = src_argb4444[3] & 0x0f; - uint8_t b2 = next_argb4444[0] & 0x0f; - uint8_t g2 = next_argb4444[0] >> 4; - uint8_t r2 = next_argb4444[1] & 0x0f; - uint8_t b3 = next_argb4444[2] & 0x0f; - uint8_t g3 = next_argb4444[2] >> 4; - uint8_t r3 = next_argb4444[3] & 0x0f; - - b0 = (b0 << 4) | b0; - g0 = (g0 << 4) | g0; - r0 = (r0 << 4) | r0; - b1 = (b1 << 4) | b1; - g1 = (g1 << 4) | g1; - r1 = (r1 << 4) | r1; - b2 = (b2 << 4) | b2; - g2 = (g2 << 4) | g2; - r2 = (r2 << 4) | r2; - b3 = (b3 << 4) | b3; - g3 = (g3 << 4) | g3; - r3 = (r3 << 4) | r3; - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); - uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); - uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; - uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; - uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - - src_argb4444 += 4; - next_argb4444 += 4; - dst_u += 1; - dst_v += 1; - } - if (width & 1) { - uint8_t b0 = src_argb4444[0] & 0x0f; - uint8_t g0 = src_argb4444[0] >> 4; - uint8_t r0 = src_argb4444[1] & 0x0f; - uint8_t b2 = next_argb4444[0] & 0x0f; - uint8_t g2 = next_argb4444[0] >> 4; - uint8_t r2 = next_argb4444[1] & 0x0f; - - b0 = (b0 << 4) | b0; - g0 = (g0 << 4) | g0; - r0 = (r0 << 4) | r0; - b2 = (b2 << 4) | b2; - g2 = (g2 << 4) | g2; - r2 = (r2 << 4) | r2; - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(b0, b2); - uint8_t ag = AVGB(g0, g2); - uint8_t ar = AVGB(r0, r2); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = b0 + b2; - uint16_t g = g0 + g2; - uint16_t r = r0 + r2; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - } -} - -void ARGBToUV444Row_C(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t ab = src_argb[0]; - uint8_t ag = src_argb[1]; - uint8_t ar = src_argb[2]; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - src_argb += 4; - dst_u += 1; - dst_v += 1; - } -} - -void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); - dst_argb[2] = dst_argb[1] = dst_argb[0] = y; - dst_argb[3] = src_argb[3]; - dst_argb += 4; - src_argb += 4; 
- } -} - -// Convert a row of image to Sepia tone. -void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - int b = dst_argb[0]; - int g = dst_argb[1]; - int r = dst_argb[2]; - int sb = (b * 17 + g * 68 + r * 35) >> 7; - int sg = (b * 22 + g * 88 + r * 45) >> 7; - int sr = (b * 24 + g * 98 + r * 50) >> 7; - // b does not over flow. a is preserved from original. - dst_argb[0] = sb; - dst_argb[1] = clamp255(sg); - dst_argb[2] = clamp255(sr); - dst_argb += 4; - } -} - -// Apply color matrix to a row of image. Matrix is signed. -// TODO(fbarchard): Consider adding rounding (+32). -void ARGBColorMatrixRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - int b = src_argb[0]; - int g = src_argb[1]; - int r = src_argb[2]; - int a = src_argb[3]; - int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] + - a * matrix_argb[3]) >> - 6; - int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] + - a * matrix_argb[7]) >> - 6; - int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] + - a * matrix_argb[11]) >> - 6; - int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + - a * matrix_argb[15]) >> - 6; - dst_argb[0] = Clamp(sb); - dst_argb[1] = Clamp(sg); - dst_argb[2] = Clamp(sr); - dst_argb[3] = Clamp(sa); - src_argb += 4; - dst_argb += 4; - } -} - -// Apply color table to a row of image. -void ARGBColorTableRow_C(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - int b = dst_argb[0]; - int g = dst_argb[1]; - int r = dst_argb[2]; - int a = dst_argb[3]; - dst_argb[0] = table_argb[b * 4 + 0]; - dst_argb[1] = table_argb[g * 4 + 1]; - dst_argb[2] = table_argb[r * 4 + 2]; - dst_argb[3] = table_argb[a * 4 + 3]; - dst_argb += 4; - } -} - -// Apply color table to a row of image. 
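// The table holds 256 entries of 4 bytes (B, G, R, A); each channel value
// indexes its own byte within an entry, so the channels remap independently.
// This RGB variant leaves the alpha byte untouched.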
-void RGBColorTableRow_C(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - int b = dst_argb[0]; - int g = dst_argb[1]; - int r = dst_argb[2]; - dst_argb[0] = table_argb[b * 4 + 0]; - dst_argb[1] = table_argb[g * 4 + 1]; - dst_argb[2] = table_argb[r * 4 + 2]; - dst_argb += 4; - } -} - -void ARGBQuantizeRow_C(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - int x; - for (x = 0; x < width; ++x) { - int b = dst_argb[0]; - int g = dst_argb[1]; - int r = dst_argb[2]; - dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; - dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; - dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; - dst_argb += 4; - } -} - -#define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v* f >> 24 - -void ARGBShadeRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - const uint32_t b_scale = REPEAT8(value & 0xff); - const uint32_t g_scale = REPEAT8((value >> 8) & 0xff); - const uint32_t r_scale = REPEAT8((value >> 16) & 0xff); - const uint32_t a_scale = REPEAT8(value >> 24); - - int i; - for (i = 0; i < width; ++i) { - const uint32_t b = REPEAT8(src_argb[0]); - const uint32_t g = REPEAT8(src_argb[1]); - const uint32_t r = REPEAT8(src_argb[2]); - const uint32_t a = REPEAT8(src_argb[3]); - dst_argb[0] = SHADE(b, b_scale); - dst_argb[1] = SHADE(g, g_scale); - dst_argb[2] = SHADE(r, r_scale); - dst_argb[3] = SHADE(a, a_scale); - src_argb += 4; - dst_argb += 4; - } -} -#undef REPEAT8 -#undef SHADE - -#define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v* f >> 16 - -void ARGBMultiplyRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - const uint32_t b = REPEAT8(src_argb[0]); - const uint32_t g = REPEAT8(src_argb[1]); - const uint32_t r = REPEAT8(src_argb[2]); - const uint32_t a = REPEAT8(src_argb[3]); - const uint32_t b_scale = src_argb1[0]; - const uint32_t g_scale = src_argb1[1]; - const uint32_t r_scale = src_argb1[2]; - const uint32_t a_scale = src_argb1[3]; - dst_argb[0] = SHADE(b, b_scale); - dst_argb[1] = SHADE(g, g_scale); - dst_argb[2] = SHADE(r, r_scale); - dst_argb[3] = SHADE(a, a_scale); - src_argb += 4; - src_argb1 += 4; - dst_argb += 4; - } -} -#undef REPEAT8 -#undef SHADE - -#define SHADE(f, v) clamp255(v + f) - -void ARGBAddRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - const int b = src_argb[0]; - const int g = src_argb[1]; - const int r = src_argb[2]; - const int a = src_argb[3]; - const int b_add = src_argb1[0]; - const int g_add = src_argb1[1]; - const int r_add = src_argb1[2]; - const int a_add = src_argb1[3]; - dst_argb[0] = SHADE(b, b_add); - dst_argb[1] = SHADE(g, g_add); - dst_argb[2] = SHADE(r, r_add); - dst_argb[3] = SHADE(a, a_add); - src_argb += 4; - src_argb1 += 4; - dst_argb += 4; - } -} -#undef SHADE - -#define SHADE(f, v) clamp0(f - v) - -void ARGBSubtractRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - const int b = src_argb[0]; - const int g = src_argb[1]; - const int r = src_argb[2]; - const int a = src_argb[3]; - const int b_sub = src_argb1[0]; - const int g_sub = src_argb1[1]; - const int r_sub = src_argb1[2]; - const int a_sub = src_argb1[3]; - dst_argb[0] = SHADE(b, b_sub); - dst_argb[1] = SHADE(g, 
g_sub); - dst_argb[2] = SHADE(r, r_sub); - dst_argb[3] = SHADE(a, a_sub); - src_argb += 4; - src_argb1 += 4; - dst_argb += 4; - } -} -#undef SHADE - -// Sobel functions which mimics SSSE3. -void SobelXRow_C(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - int i; - for (i = 0; i < width; ++i) { - int a = src_y0[i]; - int b = src_y1[i]; - int c = src_y2[i]; - int a_sub = src_y0[i + 2]; - int b_sub = src_y1[i + 2]; - int c_sub = src_y2[i + 2]; - int a_diff = a - a_sub; - int b_diff = b - b_sub; - int c_diff = c - c_sub; - int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobelx[i] = (uint8_t)(clamp255(sobel)); - } -} - -void SobelYRow_C(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - int i; - for (i = 0; i < width; ++i) { - int a = src_y0[i + 0]; - int b = src_y0[i + 1]; - int c = src_y0[i + 2]; - int a_sub = src_y1[i + 0]; - int b_sub = src_y1[i + 1]; - int c_sub = src_y1[i + 2]; - int a_diff = a - a_sub; - int b_diff = b - b_sub; - int c_diff = c - c_sub; - int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobely[i] = (uint8_t)(clamp255(sobel)); - } -} - -void SobelRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - int r = src_sobelx[i]; - int b = src_sobely[i]; - int s = clamp255(r + b); - dst_argb[0] = (uint8_t)(s); - dst_argb[1] = (uint8_t)(s); - dst_argb[2] = (uint8_t)(s); - dst_argb[3] = (uint8_t)(255u); - dst_argb += 4; - } -} - -void SobelToPlaneRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - int i; - for (i = 0; i < width; ++i) { - int r = src_sobelx[i]; - int b = src_sobely[i]; - int s = clamp255(r + b); - dst_y[i] = (uint8_t)(s); - } -} - -void SobelXYRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - int r = src_sobelx[i]; - int b = src_sobely[i]; - int g = clamp255(r + b); - dst_argb[0] = (uint8_t)(b); - dst_argb[1] = (uint8_t)(g); - dst_argb[2] = (uint8_t)(r); - dst_argb[3] = (uint8_t)(255u); - dst_argb += 4; - } -} - -void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { - // Copy a Y to RGB. - int x; - for (x = 0; x < width; ++x) { - uint8_t y = src_y[0]; - dst_argb[2] = dst_argb[1] = dst_argb[0] = y; - dst_argb[3] = 255u; - dst_argb += 4; - ++src_y; - } -} - -// Macros to create SIMD specific yuv to rgb conversion constants. - -// clang-format off - -#if defined(__aarch64__) || defined(__arm__) -// Bias values to round, and subtract 128 from U and V. -// For B and R this is negative. For G this is positive. 
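// Folding the -128 offset of U and V into these biases lets the ARM path use
// unsigned U and V directly: b16 = y1 + (u * ub) - bb expands to
// y1 + UB * (u - 128) + YB, so no per-pixel subtraction of 128 is needed.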
-#define BB (UB * 128 - YB) -#define BG (UG * 128 + VG * 128 + YB) -#define BR (VR * 128 - YB) - -#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ - {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {YG, BB, BG, BR, YB, 0, 0, 0}} -#else -#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ - {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \ - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \ - {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ - {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \ - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \ - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ - {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}} -#endif - -// clang-format on - -#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \ - const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \ - YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \ - const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \ - YUBCONSTANTSBODY(YG, YB, VR, VG, UG, UB); - -// TODO(fbarchard): Generate SIMD structures from float matrix. - -// BT.601 limited range YUV to RGB reference -// R = (Y - 16) * 1.164 + V * 1.596 -// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 -// B = (Y - 16) * 1.164 + U * 2.018 -// KR = 0.299; KB = 0.114 - -// U and V contributions to R,G,B. -#ifdef LIBYUV_UNLIMITED_DATA -#define UB 129 /* round(2.018 * 64) */ -#else -#define UB 128 /* max(128, round(2.018 * 64)) */ -#endif -#define UG 25 /* round(0.391 * 64) */ -#define VG 52 /* round(0.813 * 64) */ -#define VR 102 /* round(1.596 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.601 full range YUV to RGB reference (aka JPEG) -// * R = Y + V * 1.40200 -// * G = Y - U * 0.34414 - V * 0.71414 -// * B = Y + U * 1.77200 -// KR = 0.299; KB = 0.114 - -// U and V contributions to R,G,B. -#define UB 113 /* round(1.77200 * 64) */ -#define UG 22 /* round(0.34414 * 64) */ -#define VG 46 /* round(0.71414 * 64) */ -#define VR 90 /* round(1.40200 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.709 limited range YUV to RGB reference -// R = (Y - 16) * 1.164 + V * 1.793 -// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 -// B = (Y - 16) * 1.164 + U * 2.112 -// KR = 0.2126, KB = 0.0722 - -// U and V contributions to R,G,B. -#ifdef LIBYUV_UNLIMITED_DATA -#define UB 135 /* round(2.112 * 64) */ -#else -#define UB 128 /* max(128, round(2.112 * 64)) */ -#endif -#define UG 14 /* round(0.213 * 64) */ -#define VG 34 /* round(0.533 * 64) */ -#define VR 115 /* round(1.793 * 64) */ - -// Y contribution to R,G,B. Scale and bias. 
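// The row code replicates Y into 16 bits (y * 0x0101 == y * 257), so the 257
// divisor in YG cancels it: (y * 257 * YG) >> 16 is roughly y * 1.164 * 64,
// i.e. the luma contribution in 10.6 fixed point. YB folds in the -16 offset
// of limited-range Y plus a rounding half (64 / 2).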
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.709 full range YUV to RGB reference -// R = Y + V * 1.5748 -// G = Y - U * 0.18732 - V * 0.46812 -// B = Y + U * 1.8556 -// KR = 0.2126, KB = 0.0722 - -// U and V contributions to R,G,B. -#define UB 119 /* round(1.8556 * 64) */ -#define UG 12 /* round(0.18732 * 64) */ -#define VG 30 /* round(0.46812 * 64) */ -#define VR 101 /* round(1.5748 * 64) */ - -// Y contribution to R,G,B. Scale and bias. (same as jpeg) -#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.2020 limited range YUV to RGB reference -// R = (Y - 16) * 1.164384 + V * 1.67867 -// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042 -// B = (Y - 16) * 1.164384 + U * 2.14177 -// KR = 0.2627; KB = 0.0593 - -// U and V contributions to R,G,B. -#ifdef LIBYUV_UNLIMITED_DATA -#define UB 137 /* round(2.142 * 64) */ -#else -#define UB 128 /* max(128, round(2.142 * 64)) */ -#endif -#define UG 12 /* round(0.187326 * 64) */ -#define VG 42 /* round(0.65042 * 64) */ -#define VR 107 /* round(1.67867 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.2020 full range YUV to RGB reference -// R = Y + V * 1.474600 -// G = Y - U * 0.164553 - V * 0.571353 -// B = Y + U * 1.881400 -// KR = 0.2627; KB = 0.0593 - -#define UB 120 /* round(1.881400 * 64) */ -#define UG 11 /* round(0.164553 * 64) */ -#define VG 37 /* round(0.571353 * 64) */ -#define VR 94 /* round(1.474600 * 64) */ - -// Y contribution to R,G,B. Scale and bias. (same as jpeg) -#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -#undef BB -#undef BG -#undef BR - -#undef MAKEYUVCONSTANTS - -#if defined(__aarch64__) || defined(__arm__) -#define LOAD_YUV_CONSTANTS \ - int ub = yuvconstants->kUVCoeff[0]; \ - int vr = yuvconstants->kUVCoeff[1]; \ - int ug = yuvconstants->kUVCoeff[2]; \ - int vg = yuvconstants->kUVCoeff[3]; \ - int yg = yuvconstants->kRGBCoeffBias[0]; \ - int bb = yuvconstants->kRGBCoeffBias[1]; \ - int bg = yuvconstants->kRGBCoeffBias[2]; \ - int br = yuvconstants->kRGBCoeffBias[3] - -#define CALC_RGB16 \ - int32_t y1 = (uint32_t)(y32 * yg) >> 16; \ - int b16 = y1 + (u * ub) - bb; \ - int g16 = y1 + bg - (u * ug + v * vg); \ - int r16 = y1 + (v * vr) - br -#else -#define LOAD_YUV_CONSTANTS \ - int ub = yuvconstants->kUVToB[0]; \ - int ug = yuvconstants->kUVToG[0]; \ - int vg = yuvconstants->kUVToG[1]; \ - int vr = yuvconstants->kUVToR[1]; \ - int yg = yuvconstants->kYToRgb[0]; \ - int yb = yuvconstants->kYBiasToRgb[0] - -#define CALC_RGB16 \ - int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \ - int8_t ui = u; \ - int8_t vi = v; \ - ui -= 0x80; \ - vi -= 0x80; \ - int b16 = y1 + (ui * ub); \ - int g16 = y1 - (ui * ug + vi * vg); \ - int r16 = y1 + (vi * vr) -#endif - -// C reference code that mimics the YUV assembly. -// Reads 8 bit YUV and leaves result as 16 bit. 
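// CALC_RGB16 leaves each channel in 10.6 fixed point; the 8 bit paths shift
// right by 6 and clamp, while the *_16 helpers below return the raw value.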
-static __inline void YuvPixel(uint8_t y, - uint8_t u, - uint8_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y * 0x0101; - CALC_RGB16; - *b = Clamp((int32_t)(b16) >> 6); - *g = Clamp((int32_t)(g16) >> 6); - *r = Clamp((int32_t)(r16) >> 6); -} - -// Reads 8 bit YUV and leaves result as 16 bit. -static __inline void YuvPixel8_16(uint8_t y, - uint8_t u, - uint8_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y * 0x0101; - CALC_RGB16; - *b = b16; - *g = g16; - *r = r16; -} - -// C reference code that mimics the YUV 16 bit assembly. -// Reads 10 bit YUV and leaves result as 16 bit. -static __inline void YuvPixel10_16(uint16_t y, - uint16_t u, - uint16_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y << 6; - u = clamp255(u >> 2); - v = clamp255(v >> 2); - CALC_RGB16; - *b = b16; - *g = g16; - *r = r16; -} - -// C reference code that mimics the YUV 16 bit assembly. -// Reads 12 bit YUV and leaves result as 16 bit. -static __inline void YuvPixel12_16(int16_t y, - int16_t u, - int16_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y << 4; - u = clamp255(u >> 4); - v = clamp255(v >> 4); - CALC_RGB16; - *b = b16; - *g = g16; - *r = r16; -} - -// C reference code that mimics the YUV 10 bit assembly. -// Reads 10 bit YUV and clamps down to 8 bit RGB. -static __inline void YuvPixel10(uint16_t y, - uint16_t u, - uint16_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { - int b16; - int g16; - int r16; - YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants); - *b = Clamp(b16 >> 6); - *g = Clamp(g16 >> 6); - *r = Clamp(r16 >> 6); -} - -// C reference code that mimics the YUV 12 bit assembly. -// Reads 12 bit YUV and clamps down to 8 bit RGB. -static __inline void YuvPixel12(uint16_t y, - uint16_t u, - uint16_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { - int b16; - int g16; - int r16; - YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants); - *b = Clamp(b16 >> 6); - *g = Clamp(g16 >> 6); - *r = Clamp(r16 >> 6); -} - -// C reference code that mimics the YUV 16 bit assembly. -// Reads 16 bit YUV and leaves result as 8 bit. -static __inline void YuvPixel16_8(uint16_t y, - uint16_t u, - uint16_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y; - u = clamp255(u >> 8); - v = clamp255(v >> 8); - CALC_RGB16; - *b = Clamp((int32_t)(b16) >> 6); - *g = Clamp((int32_t)(g16) >> 6); - *r = Clamp((int32_t)(r16) >> 6); -} - -// C reference code that mimics the YUV 16 bit assembly. -// Reads 16 bit YUV and leaves result as 16 bit. -static __inline void YuvPixel16_16(uint16_t y, - uint16_t u, - uint16_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y; - u = clamp255(u >> 8); - v = clamp255(v >> 8); - CALC_RGB16; - *b = b16; - *g = g16; - *r = r16; -} - -// C reference code that mimics the YUV assembly. -// Reads 8 bit YUV and leaves result as 8 bit. 
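// YPixel applies only the luma scale and bias, so all three output channels
// are equal; it is the monochrome path used by I400ToARGBRow_C further down.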
-static __inline void YPixel(uint8_t y, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) || defined(__arm__) - int yg = yuvconstants->kRGBCoeffBias[0]; - int ygb = yuvconstants->kRGBCoeffBias[4]; -#else - int ygb = yuvconstants->kYBiasToRgb[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = Clamp(((int32_t)(y1) + ygb) >> 6); - *g = Clamp(((int32_t)(y1) + ygb) >> 6); - *r = Clamp(((int32_t)(y1) + ygb) >> 6); -} - -void I444ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width; ++x) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - src_y += 1; - src_u += 1; - src_v += 1; - rgb_buf += 4; // Advance 1 pixel. - } -} - -// Also used for 420 -void I422ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -// 10 bit YUV to ARGB -void I210ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void I410ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width; ++x) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - src_y += 1; - src_u += 1; - src_v += 1; - rgb_buf += 4; // Advance 1 pixels. - } -} - -void I210AlphaToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - const uint16_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); - YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = clamp255(src_a[1] >> 2); - src_y += 2; - src_u += 1; - src_v += 1; - src_a += 2; - rgb_buf += 8; // Advance 2 pixels. 
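// (src_a holds 10 bit alpha samples; the >> 2 above reduces them to 8 bits)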
- } - if (width & 1) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); - } -} - -void I410AlphaToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - const uint16_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width; ++x) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); - src_y += 1; - src_u += 1; - src_v += 1; - src_a += 1; - rgb_buf += 4; // Advance 1 pixels. - } -} - -// 12 bit YUV to ARGB -void I212ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { - uint32_t ar30; - b = b >> 4; // convert 8 bit 10.6 to 10 bit. - g = g >> 4; - r = r >> 4; - b = Clamp10(b); - g = Clamp10(g); - r = Clamp10(r); - ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000; - (*(uint32_t*)rgb_buf) = ar30; -} - -// 10 bit YUV to 10 bit AR30 -void I210ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width - 1; x += 2) { - YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf + 4, b, g, r); - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - } -} - -// 12 bit YUV to 10 bit AR30 -void I212ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width - 1; x += 2) { - YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf + 4, b, g, r); - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - } -} - -void I410ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width; ++x) { - YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - src_y += 1; - src_u += 1; - src_v += 1; - rgb_buf += 4; // Advance 1 pixel. 
- } -} - -// P210 has 10 bits in msb of 16 bit NV12 style layout. -void P210ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, - dst_argb + 2, yuvconstants); - dst_argb[3] = 255; - YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], dst_argb + 4, dst_argb + 5, - dst_argb + 6, yuvconstants); - dst_argb[7] = 255; - src_y += 2; - src_uv += 2; - dst_argb += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, - dst_argb + 2, yuvconstants); - dst_argb[3] = 255; - } -} - -void P410ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width; ++x) { - YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, - dst_argb + 2, yuvconstants); - dst_argb[3] = 255; - src_y += 1; - src_uv += 2; - dst_argb += 4; // Advance 1 pixels. - } -} - -void P210ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width - 1; x += 2) { - YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); - StoreAR30(dst_ar30, b, g, r); - YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); - StoreAR30(dst_ar30 + 4, b, g, r); - src_y += 2; - src_uv += 2; - dst_ar30 += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); - StoreAR30(dst_ar30, b, g, r); - } -} - -void P410ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width; ++x) { - YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); - StoreAR30(dst_ar30, b, g, r); - src_y += 1; - src_uv += 2; - dst_ar30 += 4; // Advance 1 pixel. - } -} - -// 8 bit YUV to 10 bit AR30 -// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. -void I422ToAR30Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width - 1; x += 2) { - YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf + 4, b, g, r); - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - } -} - -void I444AlphaToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width; ++x) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = src_a[0]; - src_y += 1; - src_u += 1; - src_v += 1; - src_a += 1; - rgb_buf += 4; // Advance 1 pixel. 
- } -} - -void I422AlphaToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = src_a[0]; - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = src_a[1]; - src_y += 2; - src_u += 1; - src_v += 1; - src_a += 2; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = src_a[0]; - } -} - -void I422ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 6; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - } -} - -void I422ToARGB4444Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - uint8_t b0; - uint8_t g0; - uint8_t r0; - uint8_t b1; - uint8_t g1; - uint8_t r1; - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); - b0 = b0 >> 4; - g0 = g0 >> 4; - r0 = r0 >> 4; - b1 = b1 >> 4; - g1 = g1 >> 4; - r1 = r1 >> 4; - *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | - (g1 << 20) | (r1 << 24) | 0xf000f000; - src_y += 2; - src_u += 1; - src_v += 1; - dst_argb4444 += 4; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - b0 = b0 >> 4; - g0 = g0 >> 4; - r0 = r0 >> 4; - *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; - } -} - -void I422ToARGB1555Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - uint8_t b0; - uint8_t g0; - uint8_t r0; - uint8_t b1; - uint8_t g1; - uint8_t r1; - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 3; - r0 = r0 >> 3; - b1 = b1 >> 3; - g1 = g1 >> 3; - r1 = r1 >> 3; - *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | - (g1 << 21) | (r1 << 26) | 0x80008000; - src_y += 2; - src_u += 1; - src_v += 1; - dst_argb1555 += 4; // Advance 2 pixels. 
- } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 3; - r0 = r0 >> 3; - *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; - } -} - -void I422ToRGB565Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint8_t b0; - uint8_t g0; - uint8_t r0; - uint8_t b1; - uint8_t g1; - uint8_t r1; - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 2; - r0 = r0 >> 3; - b1 = b1 >> 3; - g1 = g1 >> 2; - r1 = r1 >> 3; - *(uint32_t*)(dst_rgb565) = - b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); - src_y += 2; - src_u += 1; - src_v += 1; - dst_rgb565 += 4; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 2; - r0 = r0 >> 3; - *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); - } -} - -void NV12ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_uv += 2; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void NV21ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_vu += 2; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void NV12ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); - src_y += 2; - src_uv += 2; - rgb_buf += 6; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - } -} - -void NV21ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); - src_y += 2; - src_vu += 2; - rgb_buf += 6; // Advance 2 pixels. 
- } - if (width & 1) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - } -} - -void NV12ToRGB565Row_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint8_t b0; - uint8_t g0; - uint8_t r0; - uint8_t b1; - uint8_t g1; - uint8_t r1; - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 2; - r0 = r0 >> 3; - b1 = b1 >> 3; - g1 = g1 >> 2; - r1 = r1 >> 3; - *(uint32_t*)(dst_rgb565) = - b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); - src_y += 2; - src_uv += 2; - dst_rgb565 += 4; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 2; - r0 = r0 >> 3; - *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); - } -} - -void YUY2ToARGBRow_C(const uint8_t* src_yuy2, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_yuy2 += 4; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void UYVYToARGBRow_C(const uint8_t* src_uyvy, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_uyvy += 4; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void I422ToRGBARow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, - rgb_buf + 3, yuvconstants); - rgb_buf[0] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, - rgb_buf + 7, yuvconstants); - rgb_buf[4] = 255; - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, - rgb_buf + 3, yuvconstants); - rgb_buf[0] = 255; - } -} - -void I400ToARGBRow_C(const uint8_t* src_y, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - rgb_buf += 8; // Advance 2 pixels. 
- } - if (width & 1) { - YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { - int x; - src += width - 1; - for (x = 0; x < width - 1; x += 2) { - dst[x] = src[0]; - dst[x + 1] = src[-1]; - src -= 2; - } - if (width & 1) { - dst[width - 1] = src[0]; - } -} - -void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - int x; - src_uv += (width - 1) << 1; - for (x = 0; x < width; ++x) { - dst_uv[0] = src_uv[0]; - dst_uv[1] = src_uv[1]; - src_uv -= 2; - dst_uv += 2; - } -} - -void MirrorSplitUVRow_C(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - src_uv += (width - 1) << 1; - for (x = 0; x < width - 1; x += 2) { - dst_u[x] = src_uv[0]; - dst_u[x + 1] = src_uv[-2]; - dst_v[x] = src_uv[1]; - dst_v[x + 1] = src_uv[-2 + 1]; - src_uv -= 4; - } - if (width & 1) { - dst_u[width - 1] = src_uv[0]; - dst_v[width - 1] = src_uv[1]; - } -} - -void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { - int x; - const uint32_t* src32 = (const uint32_t*)(src); - uint32_t* dst32 = (uint32_t*)(dst); - src32 += width - 1; - for (x = 0; x < width - 1; x += 2) { - dst32[x] = src32[0]; - dst32[x + 1] = src32[-1]; - src32 -= 2; - } - if (width & 1) { - dst32[width - 1] = src32[0]; - } -} - -void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { - int x; - src_rgb24 += width * 3 - 3; - for (x = 0; x < width; ++x) { - uint8_t b = src_rgb24[0]; - uint8_t g = src_rgb24[1]; - uint8_t r = src_rgb24[2]; - dst_rgb24[0] = b; - dst_rgb24[1] = g; - dst_rgb24[2] = r; - src_rgb24 -= 3; - dst_rgb24 += 3; - } -} - -void SplitUVRow_C(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_u[x] = src_uv[0]; - dst_u[x + 1] = src_uv[2]; - dst_v[x] = src_uv[1]; - dst_v[x + 1] = src_uv[3]; - src_uv += 4; - } - if (width & 1) { - dst_u[width - 1] = src_uv[0]; - dst_v[width - 1] = src_uv[1]; - } -} - -void MergeUVRow_C(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_uv[0] = src_u[x]; - dst_uv[1] = src_v[x]; - dst_uv[2] = src_u[x + 1]; - dst_uv[3] = src_v[x + 1]; - dst_uv += 4; - } - if (width & 1) { - dst_uv[0] = src_u[width - 1]; - dst_uv[1] = src_v[width - 1]; - } -} - -void SplitRGBRow_C(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_r[x] = src_rgb[0]; - dst_g[x] = src_rgb[1]; - dst_b[x] = src_rgb[2]; - src_rgb += 3; - } -} - -void MergeRGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_rgb[0] = src_r[x]; - dst_rgb[1] = src_g[x]; - dst_rgb[2] = src_b[x]; - dst_rgb += 3; - } -} - -void SplitARGBRow_C(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_b[x] = src_argb[0]; - dst_g[x] = src_argb[1]; - dst_r[x] = src_argb[2]; - dst_a[x] = src_argb[3]; - src_argb += 4; - } -} - -void MergeARGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_argb[0] = src_b[x]; - dst_argb[1] = src_g[x]; - dst_argb[2] = src_r[x]; - dst_argb[3] = src_a[x]; - dst_argb += 4; 
- } -} - -void MergeXR30Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width) { - assert(depth >= 10); - assert(depth <= 16); - int x; - int shift = depth - 10; - uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30; - for (x = 0; x < width; ++x) { - uint32_t r = clamp1023(src_r[x] >> shift); - uint32_t g = clamp1023(src_g[x] >> shift); - uint32_t b = clamp1023(src_b[x] >> shift); - dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000; - } -} - -void MergeAR64Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width) { - assert(depth >= 1); - assert(depth <= 16); - int x; - int shift = 16 - depth; - int max = (1 << depth) - 1; - for (x = 0; x < width; ++x) { - dst_ar64[0] = ClampMax(src_b[x], max) << shift; - dst_ar64[1] = ClampMax(src_g[x], max) << shift; - dst_ar64[2] = ClampMax(src_r[x], max) << shift; - dst_ar64[3] = ClampMax(src_a[x], max) << shift; - dst_ar64 += 4; - } -} - -void MergeARGB16To8Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width) { - assert(depth >= 8); - assert(depth <= 16); - int x; - int shift = depth - 8; - for (x = 0; x < width; ++x) { - dst_argb[0] = clamp255(src_b[x] >> shift); - dst_argb[1] = clamp255(src_g[x] >> shift); - dst_argb[2] = clamp255(src_r[x] >> shift); - dst_argb[3] = clamp255(src_a[x] >> shift); - dst_argb += 4; - } -} - -void MergeXR64Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width) { - assert(depth >= 1); - assert(depth <= 16); - int x; - int shift = 16 - depth; - int max = (1 << depth) - 1; - for (x = 0; x < width; ++x) { - dst_ar64[0] = ClampMax(src_b[x], max) << shift; - dst_ar64[1] = ClampMax(src_g[x], max) << shift; - dst_ar64[2] = ClampMax(src_r[x], max) << shift; - dst_ar64[3] = 0xffff; - dst_ar64 += 4; - } -} - -void MergeXRGB16To8Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width) { - assert(depth >= 8); - assert(depth <= 16); - int x; - int shift = depth - 8; - for (x = 0; x < width; ++x) { - dst_argb[0] = clamp255(src_b[x] >> shift); - dst_argb[1] = clamp255(src_g[x] >> shift); - dst_argb[2] = clamp255(src_r[x] >> shift); - dst_argb[3] = 0xff; - dst_argb += 4; - } -} - -void SplitXRGBRow_C(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_b[x] = src_argb[0]; - dst_g[x] = src_argb[1]; - dst_r[x] = src_argb[2]; - src_argb += 4; - } -} - -void MergeXRGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_argb[0] = src_b[x]; - dst_argb[1] = src_g[x]; - dst_argb[2] = src_r[x]; - dst_argb[3] = 255; - dst_argb += 4; - } -} - -// Convert lsb formats to msb, depending on sample depth. -void MergeUVRow_16_C(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width) { - int shift = 16 - depth; - assert(depth >= 8); - assert(depth <= 16); - int x; - for (x = 0; x < width; ++x) { - dst_uv[0] = src_u[x] << shift; - dst_uv[1] = src_v[x] << shift; - dst_uv += 2; - } -} - -// Convert msb formats to lsb, depending on sample depth. 
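// For example, 10 bit data stored in the top bits of each 16 bit sample
// (P010 style) uses depth = 10, giving shift = 6 and results in 0..1023.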
-void SplitUVRow_16_C(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width) { - int shift = 16 - depth; - int x; - assert(depth >= 8); - assert(depth <= 16); - for (x = 0; x < width; ++x) { - dst_u[x] = src_uv[0] >> shift; - dst_v[x] = src_uv[1] >> shift; - src_uv += 2; - } -} - -void MultiplyRow_16_C(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_y[x] = src_y[x] * scale; - } -} - -void DivideRow_16_C(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_y[x] = (src_y[x] * scale) >> 16; - } -} - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits -// 16384 = 10 bits -// 4096 = 12 bits -// 256 = 16 bits -void Convert16To8Row_C(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width) { - int x; - assert(scale >= 256); - assert(scale <= 32768); - - for (x = 0; x < width; ++x) { - dst_y[x] = clamp255((src_y[x] * scale) >> 16); - } -} - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 1024 = 10 bits -void Convert8To16Row_C(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - int x; - scale *= 0x0101; // replicates the byte. - for (x = 0; x < width; ++x) { - dst_y[x] = (src_y[x] * scale) >> 16; - } -} - -void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { - memcpy(dst, src, count); -} - -void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) { - memcpy(dst, src, count * 2); -} - -void SetRow_C(uint8_t* dst, uint8_t v8, int width) { - memset(dst, v8, width); -} - -void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { - int x; - for (x = 0; x < width; ++x) { - memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32); - } -} - -// Filter 2 rows of YUY2 UV's (422) into U and V (420). -void YUY2ToUVRow_C(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values, filtering 2 rows of YUY2. - int x; - for (x = 0; x < width; x += 2) { - dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; - dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; - src_yuy2 += 4; - dst_u += 1; - dst_v += 1; - } -} - -// Copy row of YUY2 UV's (422) into U and V (422). -void YUY2ToUV422Row_C(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - int x; - for (x = 0; x < width; x += 2) { - dst_u[0] = src_yuy2[1]; - dst_v[0] = src_yuy2[3]; - src_yuy2 += 4; - dst_u += 1; - dst_v += 1; - } -} - -// Copy row of YUY2 Y's (422) into Y (420/422). -void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - // Output a row of Y values. - int x; - for (x = 0; x < width - 1; x += 2) { - dst_y[x] = src_yuy2[0]; - dst_y[x + 1] = src_yuy2[2]; - src_yuy2 += 4; - } - if (width & 1) { - dst_y[width - 1] = src_yuy2[0]; - } -} - -// Filter 2 rows of UYVY UV's (422) into U and V (420). -void UYVYToUVRow_C(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - int x; - for (x = 0; x < width; x += 2) { - dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; - dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; - src_uyvy += 4; - dst_u += 1; - dst_v += 1; - } -} - -// Copy row of UYVY UV's (422) into U and V (422). 
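// UYVY packs two pixels as U0 Y0 V0 Y1, so U is byte 0 and V is byte 2 of
// each 4 byte group, whereas YUY2 above carries them in bytes 1 and 3.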
-void UYVYToUV422Row_C(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - int x; - for (x = 0; x < width; x += 2) { - dst_u[0] = src_uyvy[0]; - dst_v[0] = src_uyvy[2]; - src_uyvy += 4; - dst_u += 1; - dst_v += 1; - } -} - -// Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - // Output a row of Y values. - int x; - for (x = 0; x < width - 1; x += 2) { - dst_y[x] = src_uyvy[1]; - dst_y[x + 1] = src_uyvy[3]; - src_uyvy += 4; - } - if (width & 1) { - dst_y[width - 1] = src_uyvy[1]; - } -} - -#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f) - -// Blend src_argb over src_argb1 and store to dst_argb. -// dst_argb may be src_argb or src_argb1. -// This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint32_t fb = src_argb[0]; - uint32_t fg = src_argb[1]; - uint32_t fr = src_argb[2]; - uint32_t a = src_argb[3]; - uint32_t bb = src_argb1[0]; - uint32_t bg = src_argb1[1]; - uint32_t br = src_argb1[2]; - dst_argb[0] = BLEND(fb, bb, a); - dst_argb[1] = BLEND(fg, bg, a); - dst_argb[2] = BLEND(fr, br, a); - dst_argb[3] = 255u; - - fb = src_argb[4 + 0]; - fg = src_argb[4 + 1]; - fr = src_argb[4 + 2]; - a = src_argb[4 + 3]; - bb = src_argb1[4 + 0]; - bg = src_argb1[4 + 1]; - br = src_argb1[4 + 2]; - dst_argb[4 + 0] = BLEND(fb, bb, a); - dst_argb[4 + 1] = BLEND(fg, bg, a); - dst_argb[4 + 2] = BLEND(fr, br, a); - dst_argb[4 + 3] = 255u; - src_argb += 8; - src_argb1 += 8; - dst_argb += 8; - } - - if (width & 1) { - uint32_t fb = src_argb[0]; - uint32_t fg = src_argb[1]; - uint32_t fr = src_argb[2]; - uint32_t a = src_argb[3]; - uint32_t bb = src_argb1[0]; - uint32_t bg = src_argb1[1]; - uint32_t br = src_argb1[2]; - dst_argb[0] = BLEND(fb, bb, a); - dst_argb[1] = BLEND(fg, bg, a); - dst_argb[2] = BLEND(fr, br, a); - dst_argb[3] = 255u; - } -} -#undef BLEND - -#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8 -void BlendPlaneRow_C(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst[0] = UBLEND(src0[0], src1[0], alpha[0]); - dst[1] = UBLEND(src0[1], src1[1], alpha[1]); - src0 += 2; - src1 += 2; - alpha += 2; - dst += 2; - } - if (width & 1) { - dst[0] = UBLEND(src0[0], src1[0], alpha[0]); - } -} -#undef UBLEND - -#if defined(__aarch64__) || defined(__arm__) -#define ATTENUATE(f, a) (f * a + 128) >> 8 -#else -// This code mimics the SSSE3 version for better testability. -#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 -#endif - -// Multiply source RGB by alpha and store to destination. 
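// Both variants avoid a divide: the ARM form computes (f * a + 128) >> 8, and
// the SSSE3-style form widens f and a by byte replication (x | (x << 8) is
// x * 257) and keeps the top byte of the 32 bit product, approximating
// f * a / 255.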
-void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - int i; - for (i = 0; i < width - 1; i += 2) { - uint32_t b = src_argb[0]; - uint32_t g = src_argb[1]; - uint32_t r = src_argb[2]; - uint32_t a = src_argb[3]; - dst_argb[0] = ATTENUATE(b, a); - dst_argb[1] = ATTENUATE(g, a); - dst_argb[2] = ATTENUATE(r, a); - dst_argb[3] = a; - b = src_argb[4]; - g = src_argb[5]; - r = src_argb[6]; - a = src_argb[7]; - dst_argb[4] = ATTENUATE(b, a); - dst_argb[5] = ATTENUATE(g, a); - dst_argb[6] = ATTENUATE(r, a); - dst_argb[7] = a; - src_argb += 8; - dst_argb += 8; - } - - if (width & 1) { - const uint32_t b = src_argb[0]; - const uint32_t g = src_argb[1]; - const uint32_t r = src_argb[2]; - const uint32_t a = src_argb[3]; - dst_argb[0] = ATTENUATE(b, a); - dst_argb[1] = ATTENUATE(g, a); - dst_argb[2] = ATTENUATE(r, a); - dst_argb[3] = a; - } -} -#undef ATTENUATE - -// Divide source RGB by alpha and store to destination. -// b = (b * 255 + (a / 2)) / a; -// g = (g * 255 + (a / 2)) / a; -// r = (r * 255 + (a / 2)) / a; -// Reciprocal method is off by 1 on some values. ie 125 -// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. -#define T(a) 0x01000000 + (0x10000 / a) -const uint32_t fixed_invtbl8[256] = { - 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), - T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), - T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), - T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), - T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), - T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), - T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), - T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), - T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), - T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), - T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), - T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), - T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), - T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), - T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), - T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), - T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), - T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), - T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), - T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), - T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), - T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), - T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), - T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), - T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), - T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), - T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), - T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), - T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), - T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), - T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), - T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), - T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), - T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), - T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), - 
T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), - T(0xfc), T(0xfd), T(0xfe), 0x01000100}; -#undef T - -void ARGBUnattenuateRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - uint32_t b = src_argb[0]; - uint32_t g = src_argb[1]; - uint32_t r = src_argb[2]; - const uint32_t a = src_argb[3]; - const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point - b = (b * ia) >> 8; - g = (g * ia) >> 8; - r = (r * ia) >> 8; - // Clamping should not be necessary but is free in assembly. - dst_argb[0] = clamp255(b); - dst_argb[1] = clamp255(g); - dst_argb[2] = clamp255(r); - dst_argb[3] = a; - src_argb += 4; - dst_argb += 4; - } -} - -void ComputeCumulativeSumRow_C(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - int32_t row_sum[4] = {0, 0, 0, 0}; - int x; - for (x = 0; x < width; ++x) { - row_sum[0] += row[x * 4 + 0]; - row_sum[1] += row[x * 4 + 1]; - row_sum[2] += row[x * 4 + 2]; - row_sum[3] += row[x * 4 + 3]; - cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; - cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; - cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; - cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; - } -} - -void CumulativeSumToAverageRow_C(const int32_t* tl, - const int32_t* bl, - int w, - int area, - uint8_t* dst, - int count) { - float ooa = 1.0f / area; - int i; - for (i = 0; i < count; ++i) { - dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); - dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); - dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); - dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); - dst += 4; - tl += 4; - bl += 4; - } -} - -// Copy pixels from rotated source to destination row with a slope. -LIBYUV_API -void ARGBAffineRow_C(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width) { - int i; - // Render a row of pixels from source into a buffer. - float uv[2]; - uv[0] = uv_dudv[0]; - uv[1] = uv_dudv[1]; - for (i = 0; i < width; ++i) { - int x = (int)(uv[0]); - int y = (int)(uv[1]); - *(uint32_t*)(dst_argb) = - *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4); - dst_argb += 4; - uv[0] += uv_dudv[2]; - uv[1] += uv_dudv[3]; - } -} - -// Blend 2 rows into 1. -static void HalfRow_C(const uint8_t* src_uv, - ptrdiff_t src_uv_stride, - uint8_t* dst_uv, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; - } -} - -static void HalfRow_16_C(const uint16_t* src_uv, - ptrdiff_t src_uv_stride, - uint16_t* dst_uv, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; - } -} - -// C version 2x2 -> 2x1. 
-void InterpolateRow_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8_t* src_ptr1 = src_ptr + src_stride; - int x; - if (y1_fraction == 0) { - memcpy(dst_ptr, src_ptr, width); - return; - } - if (y1_fraction == 128) { - HalfRow_C(src_ptr, src_stride, dst_ptr, width); - return; - } - for (x = 0; x < width - 1; x += 2) { - dst_ptr[0] = - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; - dst_ptr[1] = - (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8; - src_ptr += 2; - src_ptr1 += 2; - dst_ptr += 2; - } - if (width & 1) { - dst_ptr[0] = - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; - } -} - -void InterpolateRow_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint16_t* src_ptr1 = src_ptr + src_stride; - int x; - if (source_y_fraction == 0) { - memcpy(dst_ptr, src_ptr, width * 2); - return; - } - if (source_y_fraction == 128) { - HalfRow_16_C(src_ptr, src_stride, dst_ptr, width); - return; - } - for (x = 0; x < width - 1; x += 2) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; - src_ptr += 2; - src_ptr1 += 2; - dst_ptr += 2; - } - if (width & 1) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - } -} - -// Use first 4 shuffler values to reorder ARGB channels. -void ARGBShuffleRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - int index0 = shuffler[0]; - int index1 = shuffler[1]; - int index2 = shuffler[2]; - int index3 = shuffler[3]; - // Shuffle a row of ARGB. - int x; - for (x = 0; x < width; ++x) { - // To support in-place conversion. 
- uint8_t b = src_argb[index0]; - uint8_t g = src_argb[index1]; - uint8_t r = src_argb[index2]; - uint8_t a = src_argb[index3]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = a; - src_argb += 4; - dst_argb += 4; - } -} - -void I422ToYUY2Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_frame[0] = src_y[0]; - dst_frame[1] = src_u[0]; - dst_frame[2] = src_y[1]; - dst_frame[3] = src_v[0]; - dst_frame += 4; - src_y += 2; - src_u += 1; - src_v += 1; - } - if (width & 1) { - dst_frame[0] = src_y[0]; - dst_frame[1] = src_u[0]; - dst_frame[2] = 0; - dst_frame[3] = src_v[0]; - } -} - -void I422ToUYVYRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_frame[0] = src_u[0]; - dst_frame[1] = src_y[0]; - dst_frame[2] = src_v[0]; - dst_frame[3] = src_y[1]; - dst_frame += 4; - src_y += 2; - src_u += 1; - src_v += 1; - } - if (width & 1) { - dst_frame[0] = src_u[0]; - dst_frame[1] = src_y[0]; - dst_frame[2] = src_v[0]; - dst_frame[3] = 0; - } -} - -void ARGBPolynomialRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - int i; - for (i = 0; i < width; ++i) { - float b = (float)(src_argb[0]); - float g = (float)(src_argb[1]); - float r = (float)(src_argb[2]); - float a = (float)(src_argb[3]); - float b2 = b * b; - float g2 = g * g; - float r2 = r * r; - float a2 = a * a; - float db = poly[0] + poly[4] * b; - float dg = poly[1] + poly[5] * g; - float dr = poly[2] + poly[6] * r; - float da = poly[3] + poly[7] * a; - float b3 = b2 * b; - float g3 = g2 * g; - float r3 = r2 * r; - float a3 = a2 * a; - db += poly[8] * b2; - dg += poly[9] * g2; - dr += poly[10] * r2; - da += poly[11] * a2; - db += poly[12] * b3; - dg += poly[13] * g3; - dr += poly[14] * r3; - da += poly[15] * a3; - - dst_argb[0] = Clamp((int32_t)(db)); - dst_argb[1] = Clamp((int32_t)(dg)); - dst_argb[2] = Clamp((int32_t)(dr)); - dst_argb[3] = Clamp((int32_t)(da)); - src_argb += 4; - dst_argb += 4; - } -} - -// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor -// adjust the source integer range to the half float range desired. - -// This magic constant is 2^-112. Multiplying by this -// is the same as subtracting 112 from the exponent, which -// is the difference in exponent bias between 32-bit and -// 16-bit floats. Once we've done this subtraction, we can -// simply extract the low bits of the exponent and the high -// bits of the mantissa from our float and we're done. 
- -// Work around GCC 7 punning warning -Wstrict-aliasing -#if defined(__GNUC__) -typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; -#else -typedef uint32_t uint32_alias_t; -#endif - -void HalfFloatRow_C(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - int i; - float mult = 1.9259299444e-34f * scale; - for (i = 0; i < width; ++i) { - float value = src[i] * mult; - dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13); - } -} - -void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) { - int i; - for (i = 0; i < width; ++i) { - float value = src[i] * scale; - dst[i] = value; - } -} - -void ARGBLumaColorTableRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff) { - uint32_t bc = lumacoeff & 0xff; - uint32_t gc = (lumacoeff >> 8) & 0xff; - uint32_t rc = (lumacoeff >> 16) & 0xff; - - int i; - for (i = 0; i < width - 1; i += 2) { - // Luminance in rows, color values in columns. - const uint8_t* luma0 = - ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + - luma; - const uint8_t* luma1; - dst_argb[0] = luma0[src_argb[0]]; - dst_argb[1] = luma0[src_argb[1]]; - dst_argb[2] = luma0[src_argb[2]]; - dst_argb[3] = src_argb[3]; - luma1 = - ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) + - luma; - dst_argb[4] = luma1[src_argb[4]]; - dst_argb[5] = luma1[src_argb[5]]; - dst_argb[6] = luma1[src_argb[6]]; - dst_argb[7] = src_argb[7]; - src_argb += 8; - dst_argb += 8; - } - if (width & 1) { - // Luminance in rows, color values in columns. - const uint8_t* luma0 = - ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + - luma; - dst_argb[0] = luma0[src_argb[0]]; - dst_argb[1] = luma0[src_argb[1]]; - dst_argb[2] = luma0[src_argb[2]]; - dst_argb[3] = src_argb[3]; - } -} - -void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { - int i; - for (i = 0; i < width - 1; i += 2) { - dst[3] = src[3]; - dst[7] = src[7]; - dst += 8; - src += 8; - } - if (width & 1) { - dst[3] = src[3]; - } -} - -void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) { - int i; - for (i = 0; i < width - 1; i += 2) { - dst_a[0] = src_argb[3]; - dst_a[1] = src_argb[7]; - dst_a += 2; - src_argb += 8; - } - if (width & 1) { - dst_a[0] = src_argb[3]; - } -} - -void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { - int i; - for (i = 0; i < width - 1; i += 2) { - dst[3] = src[0]; - dst[7] = src[1]; - dst += 8; - src += 2; - } - if (width & 1) { - dst[3] = src[0]; - } -} - -// Maximum temporary width for wrappers to process at a time, in pixels. -#define MAXTWIDTH 2048 - -#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \ - defined(HAS_I422TORGB565ROW_SSSE3) -// row_win.cc has asm version, but GCC uses 2 step wrapper. -void I422ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_rgb565 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TOARGB1555ROW_SSSE3) -void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_argb1555 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TOARGB4444ROW_SSSE3) -void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_argb4444 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV12TORGB565ROW_SSSE3) -void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); - src_y += twidth; - src_uv += twidth; - dst_rgb565 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV12TORGB24ROW_SSSE3) -void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); - src_y += twidth; - src_uv += twidth; - dst_rgb24 += twidth * 3; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV21TORGB24ROW_SSSE3) -void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth); - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); - src_y += twidth; - src_vu += twidth; - dst_rgb24 += twidth * 3; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV12TORGB24ROW_AVX2) -void NV12ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. 
- SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB24ROW_AVX2) - ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); -#else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); -#endif - src_y += twidth; - src_uv += twidth; - dst_rgb24 += twidth * 3; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV21TORGB24ROW_AVX2) -void NV21ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB24ROW_AVX2) - ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); -#else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); -#endif - src_y += twidth; - src_vu += twidth; - dst_rgb24 += twidth * 3; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TORGB565ROW_AVX2) -void I422ToRGB565Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB565ROW_AVX2) - ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); -#else - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); -#endif - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_rgb565 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TOARGB1555ROW_AVX2) -void I422ToARGB1555Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTOARGB1555ROW_AVX2) - ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); -#else - ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); -#endif - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_argb1555 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TOARGB4444ROW_AVX2) -void I422ToARGB4444Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTOARGB4444ROW_AVX2) - ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); -#else - ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); -#endif - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_argb4444 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TORGB24ROW_AVX2) -void I422ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB24ROW_AVX2) - ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); -#else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); -#endif - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_rgb24 += twidth * 3; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV12TORGB565ROW_AVX2) -void NV12ToRGB565Row_AVX2(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB565ROW_AVX2) - ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); -#else - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); -#endif - src_y += twidth; - src_uv += twidth; - dst_rgb565 += twidth * 2; - width -= twidth; - } -} -#endif - -#ifdef HAS_RGB24TOYJROW_AVX2 -// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. -void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); - ARGBToYJRow_AVX2(row, dst_yj, twidth); - src_rgb24 += twidth * 3; - dst_yj += twidth; - width -= twidth; - } -} -#endif // HAS_RGB24TOYJROW_AVX2 - -#ifdef HAS_RAWTOYJROW_AVX2 -// Convert 16 RAW pixels (64 bytes) to 16 YJ values. -void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - RAWToARGBRow_SSSE3(src_raw, row, twidth); - ARGBToYJRow_AVX2(row, dst_yj, twidth); - src_raw += twidth * 3; - dst_yj += twidth; - width -= twidth; - } -} -#endif // HAS_RAWTOYJROW_AVX2 - -#ifdef HAS_RGB24TOYJROW_SSSE3 -// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. -void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); - ARGBToYJRow_SSSE3(row, dst_yj, twidth); - src_rgb24 += twidth * 3; - dst_yj += twidth; - width -= twidth; - } -} -#endif // HAS_RGB24TOYJROW_SSSE3 - -#ifdef HAS_RAWTOYJROW_SSSE3 -// Convert 16 RAW pixels (64 bytes) to 16 YJ values. 
-void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - RAWToARGBRow_SSSE3(src_raw, row, twidth); - ARGBToYJRow_SSSE3(row, dst_yj, twidth); - src_raw += twidth * 3; - dst_yj += twidth; - width -= twidth; - } -} -#endif // HAS_RAWTOYJROW_SSSE3 - -float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { - float fsum = 0.f; - int i; - for (i = 0; i < width; ++i) { - float v = *src++; - fsum += v * v; - *dst++ = v * scale; - } - return fsum; -} - -float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { - float fmax = 0.f; - int i; - for (i = 0; i < width; ++i) { - float v = *src++; - float vs = v * scale; - fmax = (v > fmax) ? v : fmax; - *dst++ = vs; - } - return fmax; -} - -void ScaleSamples_C(const float* src, float* dst, float scale, int width) { - int i; - for (i = 0; i < width; ++i) { - *dst++ = *src++ * scale; - } -} - -void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { - int i; - for (i = 0; i < width; ++i) { - *dst++ = - (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; - ++src; - } -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussCol_C(const uint16_t* src0, - const uint16_t* src1, - const uint16_t* src2, - const uint16_t* src3, - const uint16_t* src4, - uint32_t* dst, - int width) { - int i; - for (i = 0; i < width; ++i) { - *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; - } -} - -void GaussRow_F32_C(const float* src, float* dst, int width) { - int i; - for (i = 0; i < width; ++i) { - *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) * - (1.0f / 256.0f); - ++src; - } -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussCol_F32_C(const float* src0, - const float* src1, - const float* src2, - const float* src3, - const float* src4, - float* dst, - int width) { - int i; - for (i = 0; i < width; ++i) { - *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; - } -} - -// Convert biplanar NV21 to packed YUV24 -void NV21ToYUV24Row_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_yuv24[0] = src_vu[0]; // V - dst_yuv24[1] = src_vu[1]; // U - dst_yuv24[2] = src_y[0]; // Y0 - dst_yuv24[3] = src_vu[0]; // V - dst_yuv24[4] = src_vu[1]; // U - dst_yuv24[5] = src_y[1]; // Y1 - src_y += 2; - src_vu += 2; - dst_yuv24 += 6; // Advance 2 pixels. - } - if (width & 1) { - dst_yuv24[0] = src_vu[0]; // V - dst_yuv24[1] = src_vu[1]; // U - dst_yuv24[2] = src_y[0]; // Y0 - } -} - -// Filter 2 rows of AYUV UV's (444) into UV (420). -// AYUV is VUYA in memory. UV for NV12 is UV order in memory. -void AYUVToUVRow_C(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width) { - // Output a row of UV values, filtering 2x2 rows of AYUV. 
- int x; - for (x = 0; x < width - 1; x += 2) { - dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + - src_ayuv[src_stride_ayuv + 5] + 2) >> - 2; - dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + - src_ayuv[src_stride_ayuv + 4] + 2) >> - 2; - src_ayuv += 8; - dst_uv += 2; - } - if (width & 1) { - dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; - dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; - } -} - -// Filter 2 rows of AYUV UV's (444) into VU (420). -void AYUVToVURow_C(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width) { - // Output a row of VU values, filtering 2x2 rows of AYUV. - int x; - for (x = 0; x < width - 1; x += 2) { - dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + - src_ayuv[src_stride_ayuv + 4] + 2) >> - 2; - dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + - src_ayuv[src_stride_ayuv + 5] + 2) >> - 2; - src_ayuv += 8; - dst_vu += 2; - } - if (width & 1) { - dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; - dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; - } -} - -// Copy row of AYUV Y's into Y -void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - // Output a row of Y values. - int x; - for (x = 0; x < width; ++x) { - dst_y[x] = src_ayuv[2]; // v,u,y,a - src_ayuv += 4; - } -} - -// Convert UV plane of NV12 to VU of NV21. -void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t u = src_uv[0]; - uint8_t v = src_uv[1]; - dst_vu[0] = v; - dst_vu[1] = u; - src_uv += 2; - dst_vu += 2; - } -} - -void HalfMergeUVRow_C(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] + - src_u[src_stride_u + 1] + 2) >> - 2; - dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] + - src_v[src_stride_v + 1] + 2) >> - 2; - src_u += 2; - src_v += 2; - dst_uv += 2; - } - if (width & 1) { - dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1; - dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_gcc.cc b/thirdparty/libyuv/source/row_gcc.cc deleted file mode 100644 index 43e4c71..0000000 --- a/thirdparty/libyuv/source/row_gcc.cc +++ /dev/null @@ -1,9195 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) - -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) - -// Constants for ARGB -static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, - 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u}; - -// JPeg full range. 
-static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, - 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u}; - -static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, - 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u}; -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) - -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) - -static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0}; - -static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, - 127, -84, -43, 0, 127, -84, -43, 0}; - -static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, - -18, -94, 112, 0, -18, -94, 112, 0}; - -static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, - -20, -107, 127, 0, -20, -107, 127, 0}; - -// Constants for BGRA -static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, - 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; - -static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, - 0, -38, -74, 112, 0, -38, -74, 112}; - -static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, - 0, 112, -94, -18, 0, 112, -94, -18}; - -// Constants for ABGR -static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, - 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u}; - -static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, - -38, -74, 112, 0, -38, -74, 112, 0}; - -static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, - 112, -94, -18, 0, 112, -94, -18, 0}; - -// Constants for RGBA. -static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u, - 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u}; - -static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, - 0, 112, -74, -38, 0, 112, -74, -38}; - -static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, - 0, -18, -94, 112, 0, -18, -94, 112}; - -static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, - 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u}; - -static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, - 0x8080u, 0x8080u, 0x8080u, 0x8080u}; - -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) - -#ifdef HAS_RGB24TOARGBROW_SSSE3 - -// Shuffle table for converting RGB24 to ARGB. -static const uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; - -// Shuffle table for converting RAW to ARGB. -static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, - 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; - -// Shuffle table for converting RAW to RGBA. -static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u, - 14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u}; - -// Shuffle table for converting RAW to RGB24. First 8. -static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Middle 8. -static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Last 8. -static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RGB24. 
-static const uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RAW. -static const uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -static const uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; - -// YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, - 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, - 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - -// YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, - 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, - 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; - -// UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, - 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, - 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; - -// UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, - 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, - 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; - -// NV21 shuf 8 VU to 16 UV. -static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, -}; -#endif // HAS_RGB24TOARGBROW_SSSE3 - -#ifdef HAS_J400TOARGBROW_SSE2 -void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_J400TOARGBROW_SSE2 - -#ifdef HAS_RGB24TOARGBROW_SSSE3 -void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRGB24ToARGB) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - 
"palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToARGB) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -// Same code as RAWToARGB with different shuffler and A in low bits -void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff - "psrld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgba), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGBA) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, - uint8_t* dst_rgb24, - int width) { - asm volatile( - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - "movdqa %5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x4(%0),%%xmm1 \n" - "movdqu 0x8(%0),%%xmm2 \n" - "lea 0x18(%0),%0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGB24_0), // %3 - "m"(kShuffleMaskRAWToRGB24_1), // %4 - "m"(kShuffleMaskRAWToRGB24_2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xa,%%xmm4 \n" - "psrlw $0x5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - 
"sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,0x00(%1,%0,2) \n" - "movdqu %%xmm1,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - - "movdqa %3,%%xmm6 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - - "movdqa %3,%%xmm6 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 
0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#ifdef HAS_ARGBTORGB24ROW_AVX2 -// vpermd for 12+12 to 24 -static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; - -void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm6 \n" - "vmovdqa %4,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 - "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes - "vpermd %%ymm1,%%ymm7,%%ymm1 \n" - "vpermd %%ymm2,%%ymm7,%%ymm2 \n" - "vpermd %%ymm3,%%ymm7,%%ymm3 \n" - "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 - "vpermq $0x4f,%%ymm2,%%ymm4 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 - "vpermq $0x93,%%ymm3,%%ymm3 \n" - "vpor %%ymm3,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24), // %3 - "m"(kPermdRGB24_AVX) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI -// Shuffle table for converting ARGBToRGB24 -static const ulvec8 kPermARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, - 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u, - 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u}; -static const ulvec8 kPermARGBToRGB24_1 = { - 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, - 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, - 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u}; -static const ulvec8 kPermARGBToRGB24_2 = { - 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, - 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, - 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; - -void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vmovdqa %3,%%ymm5 \n" - "vmovdqa %4,%%ymm6 \n" - "vmovdqa %5,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" - "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" - "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub 
$0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kPermARGBToRGB24_0), // %3 - "m"(kPermARGBToRGB24_1), // %4 - "m"(kPermARGBToRGB24_2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7"); -} -#endif - -#ifdef HAS_ARGBTORAWROW_AVX2 -void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm6 \n" - "vmovdqa %4,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 - "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes - "vpermd %%ymm1,%%ymm7,%%ymm1 \n" - "vpermd %%ymm2,%%ymm7,%%ymm2 \n" - "vpermd %%ymm3,%%ymm7,%%ymm3 \n" - "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 - "vpermq $0x4f,%%ymm2,%%ymm4 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 - "vpermq $0x93,%%ymm3,%%ymm3 \n" - "vpor %%ymm3,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW), // %3 - "m"(kPermdRGB24_AVX) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, - uint8_t* dst, - const uint32_t dither4, - int width) { - asm volatile( - "movd %3,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - 
"+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, - uint8_t* dst, - const uint32_t dither4, - int width) { - asm volatile( - "vbroadcastss %3,%%xmm6 \n" - "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" - "vpermq $0xd8,%%ymm6,%%ymm6 \n" - "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" - "vpsrld $0x1b,%%ymm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $0x1a,%%ymm4,%%ymm4 \n" - "vpslld $0x5,%%ymm4,%%ymm4 \n" - "vpslld $0xb,%%ymm3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" - "vpsrld $0x5,%%ymm0,%%ymm2 \n" - "vpsrld $0x3,%%ymm0,%%ymm1 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - "vpand %%ymm4,%%ymm2,%%ymm2 \n" - "vpand %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpor %%ymm2,%%ymm1,%%ymm1 \n" - "vpor %%ymm1,%%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTORGB565DITHERROW_AVX2 - -void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); -} - -void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif // HAS_RGB24TOARGBROW_SSSE3 - -/* - -ARGBToAR30Row: - -Red Blue -With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will -produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats -wanted for the blue channel. The red needs to be shifted 4 left, so multiply by -(1024+4)*16 for red. 
- -Alpha Green -Alpha and Green are already in the high bits so vpand can zero out the other -bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier -could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha -would be a simple multiplier to shift it into position. It wants a gap of 10 -above the green. Green is 10 bits, so there are 6 bits in the low short. 4 -more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits, -and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the -result left 10 to position the A and G channels. -*/ - -// Shuffle table for converting RAW to RGB24. Last 8. -static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, - 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; - -static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u, - 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u}; - -static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028; -static const uint32_t kMaskRB10 = 0x3ff003ff; -static const uint32_t kMaskAG10 = 0xc000ff00; -static const uint32_t kMulAG10 = 64 * 65536 + 1028; - -void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleRB30), // %3 - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleBR30), // %3 reversed shuffler - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#ifdef 
HAS_ARGBTOAR30ROW_AVX2 -void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB - "vbroadcastss %4,%%ymm3 \n" // multipler for RB - "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 - "vbroadcastss %6,%%ymm5 \n" // mask for AG - "vbroadcastss %7,%%ymm6 \n" // multipler for AG - "sub %0,%1 \n" - - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels - "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 - "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 - "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 - "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 - "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 - "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels - "add $0x20,%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleRB30), // %3 - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_ABGRTOAR30ROW_AVX2 -void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB - "vbroadcastss %4,%%ymm3 \n" // multipler for RB - "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 - "vbroadcastss %6,%%ymm5 \n" // mask for AG - "vbroadcastss %7,%%ymm6 \n" // multipler for AG - "sub %0,%1 \n" - - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels - "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 - "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 - "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 - "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 - "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 - "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels - "add $0x20,%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleBR30), // %3 reversed shuffler - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, - 10, 9, 8, 11, 14, 13, 12, 15}; - -static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3, - 6, 6, 5, 5, 4, 4, 7, 7}; -static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, - 14, 14, 13, 13, 12, 12, 15, 15}; - -void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile( - - "movdqa %3,%%xmm2 \n" - "movdqa %4,%%xmm3 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm0 \n" - "pshufb %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x10(%0),%0 \n" - 
"lea 0x20(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToAB64Lo), // %3 - "m"(kShuffleARGBToAB64Hi) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} - -void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrlw $8,%%xmm0 \n" - "psrlw $8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile( - - "movdqa %3,%%xmm2 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrlw $8,%%xmm0 \n" - "psrlw $8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "pshufb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} - -#ifdef HAS_ARGBTOAR64ROW_AVX2 -void ARGBToAR64Row_AVX2(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x40(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif - -#ifdef HAS_ARGBTOAB64ROW_AVX2 -void ARGBToAB64Row_AVX2(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile( - - "vbroadcastf128 %3,%%ymm2 \n" - "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm3,%%ymm0,%%ymm1 \n" - "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x40(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToAB64Lo), // %3 - "m"(kShuffleARGBToAB64Hi) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_AR64TOARGBROW_AVX2 -void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpsrlw $8,%%ymm0,%%ymm0 \n" - "vpsrlw $8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x40(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif - -#ifdef HAS_AB64TOARGBROW_AVX2 -void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile( - - "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpsrlw $8,%%ymm0,%%ymm0 \n" - "vpsrlw $8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" - "vmovdqu 
%%ymm0,(%1) \n" - "lea 0x40(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -// clang-format off - -// TODO(mraptis): Consider passing R, G, B multipliers as parameter. -// round parameter is register containing value to add before shift. -#define RGBTOY(round) \ - "1: \n" \ - "movdqu (%0),%%xmm0 \n" \ - "movdqu 0x10(%0),%%xmm1 \n" \ - "movdqu 0x20(%0),%%xmm2 \n" \ - "movdqu 0x30(%0),%%xmm3 \n" \ - "psubb %%xmm5,%%xmm0 \n" \ - "psubb %%xmm5,%%xmm1 \n" \ - "psubb %%xmm5,%%xmm2 \n" \ - "psubb %%xmm5,%%xmm3 \n" \ - "movdqu %%xmm4,%%xmm6 \n" \ - "pmaddubsw %%xmm0,%%xmm6 \n" \ - "movdqu %%xmm4,%%xmm0 \n" \ - "pmaddubsw %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm4,%%xmm1 \n" \ - "pmaddubsw %%xmm2,%%xmm1 \n" \ - "movdqu %%xmm4,%%xmm2 \n" \ - "pmaddubsw %%xmm3,%%xmm2 \n" \ - "lea 0x40(%0),%0 \n" \ - "phaddw %%xmm0,%%xmm6 \n" \ - "phaddw %%xmm2,%%xmm1 \n" \ - "prefetcht0 1280(%0) \n" \ - "paddw %%" #round ",%%xmm6 \n" \ - "paddw %%" #round ",%%xmm1 \n" \ - "psrlw $0x8,%%xmm6 \n" \ - "psrlw $0x8,%%xmm1 \n" \ - "packuswb %%xmm1,%%xmm6 \n" \ - "movdqu %%xmm6,(%1) \n" \ - "lea 0x10(%1),%1 \n" \ - "sub $0x10,%2 \n" \ - "jg 1b \n" - -#define RGBTOY_AVX2(round) \ - "1: \n" \ - "vmovdqu (%0),%%ymm0 \n" \ - "vmovdqu 0x20(%0),%%ymm1 \n" \ - "vmovdqu 0x40(%0),%%ymm2 \n" \ - "vmovdqu 0x60(%0),%%ymm3 \n" \ - "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \ - "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \ - "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \ - "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \ - "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \ - "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \ - "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \ - "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \ - "lea 0x80(%0),%0 \n" \ - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ - "prefetcht0 1280(%0) \n" \ - "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ - "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ - "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \ - "vmovdqu %%ymm0,(%1) \n" \ - "lea 0x20(%1),%1 \n" \ - "sub $0x20,%2 \n" \ - "jg 1b \n" \ - "vzeroupper \n" - -// clang-format on - -#ifdef HAS_ARGBTOYROW_SSSE3 -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. -void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOYROW_SSSE3 - -#ifdef HAS_ARGBTOYJROW_SSSE3 -// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. -// Same as ARGBToYRow but different coefficients, no add 16. -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN RGBTOY(xmm5) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kSub128) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBTOYJROW_SSSE3 - -#ifdef HAS_RGBATOYJROW_SSSE3 -// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. 
-// Same as ARGBToYRow but different coefficients, no add 16. -void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN RGBTOY(xmm5) - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToYJ), // %3 - "m"(kSub128) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_RGBATOYJROW_SSSE3 - -#ifdef HAS_ARGBTOYROW_AVX2 -// vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; - -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm7) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOYROW_AVX2 - -#ifdef HAS_ABGRTOYROW_AVX2 -// Convert 32 ABGR pixels (128 bytes) to 32 Y values. -void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm7) - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ABGRTOYROW_AVX2 - -#ifdef HAS_ARGBTOYJROW_AVX2 -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm5) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kSub128), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOYJROW_AVX2 - -#ifdef HAS_RGBATOYJROW_AVX2 -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
-void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2( - ymm5) "vzeroupper \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToYJ), // %3 - "m"(kSub128), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_RGBATOYJROW_AVX2 - -#ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToV), // %5 - "m"(kARGBToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} -#endif // HAS_ARGBTOUVROW_SSSE3 - -#ifdef HAS_ARGBTOUVROW_AVX2 -// vpshufb for vphaddw + vpackuswb packed to shorts. 
-static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; -void ARGBToUVRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUV128), // %5 - "m"(kARGBToV), // %6 - "m"(kARGBToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOUVROW_AVX2 - -#ifdef HAS_ABGRTOUVROW_AVX2 -void ABGRToUVRow_AVX2(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kAddUV128), // %5 - "m"(kABGRToV), // 
%6 - "m"(kABGRToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ABGRTOUVROW_AVX2 - -#ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kSub128), // %5 - "m"(kARGBToVJ), // %6 - "m"(kARGBToUJ), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOUVJROW_AVX2 - -#ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : 
"r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToVJ), // %5 - "m"(kARGBToUJ), // %6 - "m"(kSub128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} -#endif // HAS_ARGBTOUVJROW_SSSE3 - -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea 0x40(%0),%0 \n" - "movdqu %%xmm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "m"(kARGBToV), // %4 - "m"(kARGBToU), // %5 - "m"(kAddUV128) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); -} -#endif // HAS_ARGBTOUV444ROW_SSSE3 - -void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kBGRAToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : 
"r"((intptr_t)(src_stride_bgra)), // %4 - "m"(kBGRAToV), // %5 - "m"(kBGRAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kABGRToV), // %5 - "m"(kABGRToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - 
"movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)), // %4 - "m"(kRGBAToV), // %5 - "m"(kRGBAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) - -// Read 8 UV from 444 -#define READYUV444 \ - "movq (%[u_buf]),%%xmm3 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" - -// Read 4 UV from 422, upsample to 8 UV -#define READYUV422 \ - "movd (%[u_buf]),%%xmm3 \n" \ - "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" - -// Read 4 UV from 422 10 bit, upsample to 8 UV -// TODO(fbarchard): Consider shufb to replace pack/unpack -// TODO(fbarchard): Consider pmulhuw to replace psraw -// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. -#define READYUV210 \ - "movq (%[u_buf]),%%xmm3 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklwd %%xmm1,%%xmm3 \n" \ - "psraw $2,%%xmm3 \n" \ - "packuswb %%xmm3,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $6,%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -#define READYUVA210 \ - "movq (%[u_buf]),%%xmm3 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklwd %%xmm1,%%xmm3 \n" \ - "psraw $2,%%xmm3 \n" \ - "packuswb %%xmm3,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $6,%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" \ - "movdqu (%[a_buf]),%%xmm5 \n" \ - "psraw $2,%%xmm5 \n" \ - "packuswb %%xmm5,%%xmm5 \n" \ - "lea 0x10(%[a_buf]),%[a_buf] \n" - -// Read 8 UV from 444 10 bit -#define READYUV410 \ - "movdqu (%[u_buf]),%%xmm3 \n" \ - "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "psraw $2,%%xmm3 \n" \ - "psraw $2,%%xmm2 \n" \ - "movdqa %%xmm3,%%xmm1 \n" \ - "punpcklwd %%xmm2,%%xmm3 \n" \ - "punpckhwd %%xmm2,%%xmm1 \n" \ - "packuswb %%xmm1,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $6,%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from 444 10 bit. With 8 Alpha. 
-#define READYUVA410 \ - "movdqu (%[u_buf]),%%xmm3 \n" \ - "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "psraw $2,%%xmm3 \n" \ - "psraw $2,%%xmm2 \n" \ - "movdqa %%xmm3,%%xmm1 \n" \ - "punpcklwd %%xmm2,%%xmm3 \n" \ - "punpckhwd %%xmm2,%%xmm1 \n" \ - "packuswb %%xmm1,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x6,%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" \ - "movdqu (%[a_buf]),%%xmm5 \n" \ - "psraw $2,%%xmm5 \n" \ - "packuswb %%xmm5,%%xmm5 \n" \ - "lea 0x10(%[a_buf]),%[a_buf] \n" - -// Read 4 UV from 422 12 bit, upsample to 8 UV -#define READYUV212 \ - "movq (%[u_buf]),%%xmm3 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklwd %%xmm1,%%xmm3 \n" \ - "psraw $0x4,%%xmm3 \n" \ - "packuswb %%xmm3,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x4,%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - "movd (%[u_buf]),%%xmm3 \n" \ - "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" \ - "movq (%[a_buf]),%%xmm5 \n" \ - "lea 0x8(%[a_buf]),%[a_buf] \n" - -// Read 8 UV from 444. With 8 Alpha. -#define READYUVA444 \ - "movq (%[u_buf]),%%xmm3 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" \ - "movq (%[a_buf]),%%xmm5 \n" \ - "lea 0x8(%[a_buf]),%[a_buf] \n" - -// Read 4 UV from NV12, upsample to 8 UV -#define READNV12 \ - "movq (%[uv_buf]),%%xmm3 \n" \ - "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" - -// Read 4 VU from NV21, upsample to 8 UV -#define READNV21 \ - "movq (%[vu_buf]),%%xmm3 \n" \ - "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ - "pshufb %[kShuffleNV21], %%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" - -// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. -#define READYUY2 \ - "movdqu (%[yuy2_buf]),%%xmm4 \n" \ - "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ - "movdqu (%[yuy2_buf]),%%xmm3 \n" \ - "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \ - "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" - -// Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 
-#define READUYVY \ - "movdqu (%[uyvy_buf]),%%xmm4 \n" \ - "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ - "movdqu (%[uyvy_buf]),%%xmm3 \n" \ - "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \ - "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" - -// Read 4 UV from P210, upsample to 8 UV -#define READP210 \ - "movdqu (%[uv_buf]),%%xmm3 \n" \ - "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ - "psrlw $0x8,%%xmm3 \n" \ - "packuswb %%xmm3,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from P410 -#define READP410 \ - "movdqu (%[uv_buf]),%%xmm3 \n" \ - "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \ - "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ - "psrlw $0x8,%%xmm3 \n" \ - "psrlw $0x8,%%xmm1 \n" \ - "packuswb %%xmm1,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -#if defined(__x86_64__) -#define YUVTORGB_SETUP(yuvconstants) \ - "pcmpeqb %%xmm13,%%xmm13 \n" \ - "movdqa (%[yuvconstants]),%%xmm8 \n" \ - "pxor %%xmm12,%%xmm12 \n" \ - "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ - "psllw $7,%%xmm13 \n" \ - "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ - "pshufb %%xmm12,%%xmm13 \n" \ - "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm12 \n" - -// Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB16(yuvconstants) \ - "psubb %%xmm13,%%xmm3 \n" \ - "pmulhuw %%xmm11,%%xmm4 \n" \ - "movdqa %%xmm8,%%xmm0 \n" \ - "movdqa %%xmm9,%%xmm1 \n" \ - "movdqa %%xmm10,%%xmm2 \n" \ - "paddw %%xmm12,%%xmm4 \n" \ - "pmaddubsw %%xmm3,%%xmm0 \n" \ - "pmaddubsw %%xmm3,%%xmm1 \n" \ - "pmaddubsw %%xmm3,%%xmm2 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psubsw %%xmm1,%%xmm4 \n" \ - "movdqa %%xmm4,%%xmm1 \n" - -#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", - -#else -#define YUVTORGB_SETUP(yuvconstants) -// Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB16(yuvconstants) \ - "pcmpeqb %%xmm0,%%xmm0 \n" \ - "pxor %%xmm1,%%xmm1 \n" \ - "psllw $7,%%xmm0 \n" \ - "pshufb %%xmm1,%%xmm0 \n" \ - "psubb %%xmm0,%%xmm3 \n" \ - "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \ - "movdqa (%[yuvconstants]),%%xmm0 \n" \ - "movdqa 32(%[yuvconstants]),%%xmm1 \n" \ - "movdqa 64(%[yuvconstants]),%%xmm2 \n" \ - "pmaddubsw %%xmm3,%%xmm0 \n" \ - "pmaddubsw %%xmm3,%%xmm1 \n" \ - "pmaddubsw %%xmm3,%%xmm2 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm3 \n" \ - "paddw %%xmm3,%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psubsw %%xmm1,%%xmm4 \n" \ - "movdqa %%xmm4,%%xmm1 \n" - -#define YUVTORGB_REGS -#endif - -#define YUVTORGB(yuvconstants) \ - YUVTORGB16(yuvconstants) \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" - -// Store 8 ARGB values. -#define STOREARGB \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklbw %%xmm5,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm1 \n" \ - "punpcklwd %%xmm2,%%xmm0 \n" \ - "punpckhwd %%xmm2,%%xmm1 \n" \ - "movdqu %%xmm0,(%[dst_argb]) \n" \ - "movdqu %%xmm1,0x10(%[dst_argb]) \n" \ - "lea 0x20(%[dst_argb]), %[dst_argb] \n" - -// Store 8 RGBA values. -#define STORERGBA \ - "pcmpeqb %%xmm5,%%xmm5 \n" \ - "punpcklbw %%xmm2,%%xmm1 \n" \ - "punpcklbw %%xmm0,%%xmm5 \n" \ - "movdqa %%xmm5,%%xmm0 \n" \ - "punpcklwd %%xmm1,%%xmm5 \n" \ - "punpckhwd %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm5,(%[dst_rgba]) \n" \ - "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ - "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" - -// Store 8 AR30 values. 
-#define STOREAR30 \ - "psraw $0x4,%%xmm0 \n" \ - "psraw $0x4,%%xmm1 \n" \ - "psraw $0x4,%%xmm2 \n" \ - "pminsw %%xmm7,%%xmm0 \n" \ - "pminsw %%xmm7,%%xmm1 \n" \ - "pminsw %%xmm7,%%xmm2 \n" \ - "pmaxsw %%xmm6,%%xmm0 \n" \ - "pmaxsw %%xmm6,%%xmm1 \n" \ - "pmaxsw %%xmm6,%%xmm2 \n" \ - "psllw $0x4,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "punpcklwd %%xmm2,%%xmm0 \n" \ - "punpckhwd %%xmm2,%%xmm3 \n" \ - "movdqa %%xmm1,%%xmm2 \n" \ - "punpcklwd %%xmm5,%%xmm1 \n" \ - "punpckhwd %%xmm5,%%xmm2 \n" \ - "pslld $0xa,%%xmm1 \n" \ - "pslld $0xa,%%xmm2 \n" \ - "por %%xmm1,%%xmm0 \n" \ - "por %%xmm2,%%xmm3 \n" \ - "movdqu %%xmm0,(%[dst_ar30]) \n" \ - "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ - "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" - -void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV444 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -#ifdef HAS_I444ALPHATOARGBROW_SSSE3 -void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUVA444 - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_I444ALPHATOARGBROW_SSSE3 - -void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_rgb24]) \n" - "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" - "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - 
[kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), - [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); -} - -void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -// 10 bit YUV to ARGB -void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV210 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -// 12 bit YUV to ARGB -void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV212 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -// 10 bit YUV to AR30 -void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const 
struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READYUV210 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -// 12 bit YUV to AR30 -void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READYUV212 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -// 10 bit YUV to ARGB -void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV410 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -#ifdef HAS_I210ALPHATOARGBROW_SSSE3 -// 10 bit YUVA to ARGB -void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - - LABELALIGN "1: \n" READYUVA210 - YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); -} -#endif - -#ifdef HAS_I410ALPHATOARGBROW_SSSE3 -// 10 bit YUVA to ARGB -void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - 
const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUVA410 - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); - // clang-format on -} -#endif - -// 10 bit YUV to AR30 -void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READYUV410 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUVA422 - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_I422ALPHATOARGBROW_SSSE3 - -void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READNV12 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} - -void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm 
volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READNV21 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [vu_buf]"+r"(vu_buf), // %[vu_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} - -void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUY2 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleYUY2Y]"m"(kShuffleYUY2Y), - [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} - -void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READUYVY - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleUYVYY]"m"(kShuffleUYVYY), - [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} - -void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN "1: \n" READP210 - YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[u_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); -} - -void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN "1: \n" READP410 - YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[u_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); -} - -void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - 
"pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READP210 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READP410 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - STORERGBA - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -#endif // HAS_I422TOARGBROW_SSSE3 - -// Read 16 UV from 444 -#define READYUV444_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 \ - "vmovq (%[u_buf]),%%xmm3 \n" \ - "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from 210, upsample to 16 UV -// TODO(fbarchard): Consider vshufb to replace pack/unpack -// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. 
-#define READYUV210_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpsraw $2,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha. -#define READYUVA210_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpsraw $2,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" \ - "vmovdqu (%[a_buf]),%%ymm5 \n" \ - "vpsraw $2,%%ymm5,%%ymm5 \n" \ - "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ - "lea 0x20(%[a_buf]),%[a_buf] \n" - -// Read 16 UV from 410 -#define READYUV410_AVX2 \ - "vmovdqu (%[u_buf]),%%ymm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ - "lea 0x20(%[u_buf]),%[u_buf] \n" \ - "vpsraw $2,%%ymm3,%%ymm3 \n" \ - "vpsraw $2,%%ymm2,%%ymm2 \n" \ - "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ - "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from 212 12 bit, upsample to 16 UV -#define READYUV212_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpsraw $0x4,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $0x4,%%ymm4,%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" - -// Read 16 UV from 410. With 16 Alpha. -#define READYUVA410_AVX2 \ - "vmovdqu (%[u_buf]),%%ymm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ - "lea 0x20(%[u_buf]),%[u_buf] \n" \ - "vpsraw $2,%%ymm3,%%ymm3 \n" \ - "vpsraw $2,%%ymm2,%%ymm2 \n" \ - "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ - "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" \ - "vmovdqu (%[a_buf]),%%ymm5 \n" \ - "vpsraw $2,%%ymm5,%%ymm5 \n" \ - "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ - "lea 0x20(%[a_buf]),%[a_buf] \n" - -// Read 16 UV from 444. With 16 Alpha. -#define READYUVA444_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" \ - "vmovdqu (%[a_buf]),%%xmm5 \n" \ - "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ - "lea 0x10(%[a_buf]),%[a_buf] \n" - -// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 
-#define READYUVA422_AVX2 \ - "vmovq (%[u_buf]),%%xmm3 \n" \ - "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" \ - "vmovdqu (%[a_buf]),%%xmm5 \n" \ - "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ - "lea 0x10(%[a_buf]),%[a_buf] \n" - -// Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 \ - "vmovdqu (%[uv_buf]),%%xmm3 \n" \ - "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 8 VU from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - "vmovdqu (%[vu_buf]),%%xmm3 \n" \ - "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 4 UV from P210, upsample to 8 UV -#define READP210_AVX2 \ - "vmovdqu (%[uv_buf]),%%ymm3 \n" \ - "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from P410 -#define READP410_AVX2 \ - "vmovdqu (%[uv_buf]),%%ymm3 \n" \ - "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \ - "lea 0x40(%[uv_buf]),%[uv_buf] \n" \ - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" - -// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 \ - "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ - "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ - "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \ - "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \ - "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" - -// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 \ - "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ - "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ - "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \ - "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \ - "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" - -#if defined(__x86_64__) -#define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \ - "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ - "vpsllw $7,%%xmm13,%%xmm13 \n" \ - "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ - "vpbroadcastb %%xmm13,%%ymm13 \n" \ - "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ - "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ - "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" - -#define YUVTORGB16_AVX2(yuvconstants) \ - "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \ - "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \ - "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \ - "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \ - "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \ - "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" - -#define YUVTORGB_REGS_AVX2 \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", - -#else // Convert 16 pixels: 16 UV and 16 Y. 
- -#define YUVTORGB_SETUP_AVX2(yuvconstants) -#define YUVTORGB16_AVX2(yuvconstants) \ - "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \ - "vpsllw $7,%%xmm0,%%xmm0 \n" \ - "vpbroadcastb %%xmm0,%%ymm0 \n" \ - "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \ - "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ - "vmovdqa (%[yuvconstants]),%%ymm0 \n" \ - "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \ - "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \ - "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \ - "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \ - "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \ - "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \ - "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" - -#define YUVTORGB_REGS_AVX2 -#endif - -#define YUVTORGB_AVX2(yuvconstants) \ - YUVTORGB16_AVX2(yuvconstants) \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" - -// Store 16 ARGB values. -#define STOREARGB_AVX2 \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ - "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ - "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ - "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ - "vmovdqu %%ymm1,(%[dst_argb]) \n" \ - "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ - "lea 0x40(%[dst_argb]), %[dst_argb] \n" - -// Store 16 AR30 values. -#define STOREAR30_AVX2 \ - "vpsraw $0x4,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x4,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x4,%%ymm2,%%ymm2 \n" \ - "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \ - "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \ - "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \ - "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \ - "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \ - "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \ - "vpsllw $0x4,%%ymm2,%%ymm2 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ - "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \ - "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ - "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \ - "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \ - "vpslld $0xa,%%ymm1,%%ymm1 \n" \ - "vpslld $0xa,%%ymm2,%%ymm2 \n" \ - "vpor %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpor %%ymm2,%%ymm3,%%ymm3 \n" \ - "vmovdqu %%ymm0,(%[dst_ar30]) \n" \ - "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \ - "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" - -#ifdef HAS_I444TOARGBROW_AVX2 -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV444_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I444TOARGBROW_AVX2 - -#if defined(HAS_I422TOARGBROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
-void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV422_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I422TOARGBROW_AVX2 - -#if defined(HAS_I422TOAR30ROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). -void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READYUV422_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_I422TOAR30ROW_AVX2 - -#if defined(HAS_I210TOARGBROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV210_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I210TOARGBROW_AVX2 - -#if defined(HAS_I212TOARGBROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
-void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV212_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I212TOARGBROW_AVX2 - -#if defined(HAS_I210TOAR30ROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). -void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READYUV210_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_I210TOAR30ROW_AVX2 - -#if defined(HAS_I212TOAR30ROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). -void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READYUV212_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_I212TOAR30ROW_AVX2 - -#if defined(HAS_I410TOARGBROW_AVX2) -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
-void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV410_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I410TOARGBROW_AVX2 - -#if defined(HAS_I210ALPHATOARGBROW_AVX2) -// 16 pixels -// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). -void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - - LABELALIGN "1: \n" READYUVA210_AVX2 - YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), // %[a_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif // HAS_I210TOARGBROW_AVX2 - -#if defined(HAS_I410ALPHATOARGBROW_AVX2) -// 16 pixels -// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). -void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - - LABELALIGN "1: \n" READYUVA410_AVX2 - YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), // %[a_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif // HAS_I410TOARGBROW_AVX2 - -#if defined(HAS_I410TOAR30ROW_AVX2) -// 16 pixels -// 16 UV values with 16 Y producing 16 AR30 (64 bytes). 
-void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READYUV410_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_I410TOAR30ROW_AVX2 - -#if defined(HAS_I444ALPHATOARGBROW_AVX2) -// 16 pixels -// 16 UV values with 16 Y and 16 A producing 16 ARGB. -void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUVA444_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_I444ALPHATOARGBROW_AVX2 - -#if defined(HAS_I422ALPHATOARGBROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUVA422_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_I422ALPHATOARGBROW_AVX2 - -#if defined(HAS_I422TORGBAROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 
-void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV422_AVX2 - YUVTORGB_AVX2(yuvconstants) - - // Step 3: Weave into RGBA - "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" - "vmovdqu %%ymm0,(%[dst_argb]) \n" - "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" - "lea 0x40(%[dst_argb]),%[dst_argb] \n" - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I422TORGBAROW_AVX2 - -#if defined(HAS_NV12TOARGBROW_AVX2) -// 16 pixels. -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READNV12_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_NV12TOARGBROW_AVX2 - -#if defined(HAS_NV21TOARGBROW_AVX2) -// 16 pixels. -// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READNV21_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [vu_buf]"+r"(vu_buf), // %[vu_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_NV21TOARGBROW_AVX2 - -#if defined(HAS_YUY2TOARGBROW_AVX2) -// 16 pixels. -// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 
-void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUY2_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleYUY2Y]"m"(kShuffleYUY2Y), - [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_YUY2TOARGBROW_AVX2 - -#if defined(HAS_UYVYTOARGBROW_AVX2) -// 16 pixels. -// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READUYVY_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleUYVYY]"m"(kShuffleUYVYY), - [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_UYVYTOARGBROW_AVX2 - -#if defined(HAS_P210TOARGBROW_AVX2) -// 16 pixels. -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READP210_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_P210TOARGBROW_AVX2 - -#if defined(HAS_P410TOARGBROW_AVX2) -// 16 pixels. -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
-void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READP410_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_P410TOARGBROW_AVX2 - -#if defined(HAS_P210TOAR30ROW_AVX2) -// 16 pixels -// 16 UV values with 16 Y producing 16 AR30 (64 bytes). -void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READP210_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_P210TOAR30ROW_AVX2 - -#if defined(HAS_P410TOAR30ROW_AVX2) -// 16 pixels -// 16 UV values with 16 Y producing 16 AR30 (64 bytes). -void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READP410_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_P410TOAR30ROW_AVX2 - -#ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 - "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 - "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 - "pslld $0x18,%%xmm4 \n" - - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "paddsw %%xmm3,%%xmm0 \n" - "psraw $6, %%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - - // Step 2: Weave into ARGB - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "por %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : "r"(yuvconstants) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif // HAS_I400TOARGBROW_SSE2 - -#ifdef HAS_I400TOARGBROW_AVX2 -// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). -// note: vpunpcklbw mutates and vpackuswb unmutates. -void I400ToARGBRow_AVX2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 - "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 - "vpslld $0x18,%%ymm4,%%ymm4 \n" - - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 - "vmovdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" - "vpsraw $0x6,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : "r"(yuvconstants) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif // HAS_I400TOARGBROW_AVX2 - -#ifdef HAS_MIRRORROW_SSSE3 -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "movdqa %3,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} -#endif // HAS_MIRRORROW_SSSE3 - -#ifdef HAS_MIRRORROW_AVX2 -void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "vbroadcastf128 %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} -#endif // HAS_MIRRORROW_AVX2 - -#ifdef HAS_MIRRORUVROW_SSSE3 -// Shuffle table for reversing the UV. 
-static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, - 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; - -void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "movdqa %3,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,2),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} -#endif // HAS_MIRRORUVROW_SSSE3 - -#ifdef HAS_MIRRORUVROW_AVX2 -void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "vbroadcastf128 %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} -#endif // HAS_MIRRORUVROW_AVX2 - -#ifdef HAS_MIRRORSPLITUVROW_SSSE3 -// Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - "movdqa %4,%%xmm1 \n" - "lea -0x10(%0,%3,2),%0 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorSplitUV) // %4 - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_MIRRORSPLITUVROW_SSSE3 - -#ifdef HAS_RGB24MIRRORROW_SSSE3 - -// Shuffle first 5 pixels to last 5 mirrored. first byte zero -static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, - 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u}; - -// Shuffle last 5 pixels to first 5 mirrored. 
last byte zero -static const uvec8 kShuffleMirrorRGB1 = { - 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; - -// Shuffle 5 pixels at a time (15 bytes) -void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width) { - intptr_t temp_width = (intptr_t)(width); - src_rgb24 += width * 3 - 48; - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // first 5 - "movdqu 15(%0),%%xmm1 \n" // next 5 - "movdqu 30(%0),%%xmm2 \n" // next 5 - "movdqu 32(%0),%%xmm3 \n" // last 1 special - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm4,%%xmm2 \n" - "pshufb %%xmm5,%%xmm3 \n" - "lea -0x30(%0),%0 \n" - "movdqu %%xmm0,32(%1) \n" // last 5 - "movdqu %%xmm1,17(%1) \n" // next 5 - "movdqu %%xmm2,2(%1) \n" // next 5 - "movlpd %%xmm3,0(%1) \n" // first 1 - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorRGB0), // %3 - "m"(kShuffleMirrorRGB1) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_RGB24MIRRORROW_SSSE3 - -#ifdef HAS_ARGBMIRRORROW_SSE2 - -void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "lea -0x10(%0,%2,4),%0 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", "xmm0"); -} -#endif // HAS_ARGBMIRRORROW_SSE2 - -#ifdef HAS_ARGBMIRRORROW_AVX2 -// Shuffle table for reversing the bytes. 
-static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "vmovdqu %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} -#endif // HAS_ARGBMIRRORROW_AVX2 - -#ifdef HAS_SPLITUVROW_AVX2 -void SplitUVRow_AVX2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm2 \n" - "vpsrlw $0x8,%%ymm1,%%ymm3 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm2,0x00(%1,%2,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SPLITUVROW_AVX2 - -#ifdef HAS_SPLITUVROW_SSE2 -void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SPLITUVROW_SSE2 - -#ifdef HAS_MERGEUVROW_AVX2 -void MergeUVRow_AVX2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile( - - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEUVROW_AVX2 - -#ifdef HAS_MERGEUVROW_SSE2 -void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile( - - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 
0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEUVROW_SSE2 - -#ifdef HAS_MERGEUVROW_16_AVX2 -void MergeUVRow_16_AVX2(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width) { - depth = 16 - depth; - // clang-format off - asm volatile ( - "vmovd %4,%%xmm3 \n" - "sub %0,%1 \n" - - // 16 pixels per loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu (%0,%1,1),%%ymm1 \n" - "add $0x20,%0 \n" - - "vpsllw %%xmm3,%%ymm0,%%ymm0 \n" - "vpsllw %%xmm3,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates - "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "add $0x40,%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(depth) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); - // clang-format on -} -#endif // HAS_MERGEUVROW_AVX2 - -#ifdef HAS_SPLITUVROW_16_AVX2 -const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15}; -void SplitUVRow_16_AVX2(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width) { - depth = 16 - depth; - // clang-format off - asm volatile ( - "vmovd %4,%%xmm3 \n" - "vbroadcastf128 %5,%%ymm4 \n" - "sub %1,%2 \n" - - // 16 pixels per loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "add $0x40,%0 \n" - - "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x0,%%ymm1,0x10(%1) \n" - "vextractf128 $0x1,%%ymm0,(%1,%2) \n" - "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" - "add $0x20,%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(depth), // %4 - "m"(kSplitUVShuffle16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); - // clang-format on -} -#endif // HAS_SPLITUVROW_16_AVX2 - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 128 = 9 bits -// 64 = 10 bits -// 16 = 12 bits -// 1 = 16 bits -#ifdef HAS_MULTIPLYROW_16_AVX2 -void MultiplyRow_16_AVX2(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" - - // 32 pixels per loop. 
- LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%0,%1) \n" - "vmovdqu %%ymm1,0x20(%0,%1) \n" - "add $0x40,%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm3"); - // clang-format on -} -#endif // HAS_MULTIPLYROW_16_AVX2 - -// Use scale to convert msb formats to lsb, depending how many bits there are: -// 512 = 9 bits -// 1024 = 10 bits -// 4096 = 12 bits -// 65536 = 16 bits -#ifdef HAS_DIVIDEROW_16_AVX2 -void DivideRow_16_AVX2(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" - - // 32 pixels per loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%0,%1) \n" - "vmovdqu %%ymm1,0x20(%0,%1) \n" - "add $0x40,%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width), // %2 - "+r"(scale) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm3"); - // clang-format on -} -#endif // HAS_MULTIPLYROW_16_AVX2 - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits -// 16384 = 10 bits -// 4096 = 12 bits -// 256 = 16 bits -void Convert16To8Row_SSSE3(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - - // 32 pixels per loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "add $0x20,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on -} - -#ifdef HAS_CONVERT16TO8ROW_AVX2 -void Convert16To8Row_AVX2(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm2 \n" - "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - - // 32 pixels per loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "add $0x40,%0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "add $0x20,%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on -} -#endif // HAS_CONVERT16TO8ROW_AVX2 - -// Use scale to convert to lsb formats depending how many bits there are: -// 512 = 9 bits -// 1024 = 10 bits -// 4096 = 12 bits -// TODO(fbarchard): reduce to SSE2 -void Convert8To16Row_SSE2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - - // 32 pixels per loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "add $0x10,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "add $0x20,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on -} - -#ifdef HAS_CONVERT8TO16ROW_AVX2 -void Convert8To16Row_AVX2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm2 \n" - "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - - // 32 pixels per loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "add $0x40,%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on -} -#endif // HAS_CONVERT8TO16ROW_AVX2 - -#ifdef HAS_SPLITRGBROW_SSSE3 - -// Shuffle table for converting RGB to Planar. -static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, - 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, - 2u, 5u, 8u, 11u, 14u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 1u, - 4u, 7u, 10u, 13u}; - -static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, - 3u, 6u, 9u, 12u, 15u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 2u, - 5u, 8u, 11u, 14u}; - -static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, - 4u, 7u, 10u, 13u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 0u, 3u, - 6u, 9u, 12u, 15u}; - -void SplitRGBRow_SSSE3(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - 
"movdqu %%xmm0,(%3) \n" - "lea 0x10(%3),%3 \n" - "lea 0x30(%0),%0 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskRGBToR0), // %5 - "m"(kShuffleMaskRGBToR1), // %6 - "m"(kShuffleMaskRGBToR2), // %7 - "m"(kShuffleMaskRGBToG0), // %8 - "m"(kShuffleMaskRGBToG1), // %9 - "m"(kShuffleMaskRGBToG2), // %10 - "m"(kShuffleMaskRGBToB0), // %11 - "m"(kShuffleMaskRGBToB1), // %12 - "m"(kShuffleMaskRGBToB2) // %13 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_SPLITRGBROW_SSSE3 - -#ifdef HAS_MERGERGBROW_SSSE3 - -// Shuffle table for converting RGB to Planar. -static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, - 2u, 128u, 128u, 3u, 128u, 128u, - 4u, 128u, 128u, 5u}; -static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, - 128u, 2u, 128u, 128u, 3u, 128u, - 128u, 4u, 128u, 128u}; -static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, - 128u, 128u, 2u, 128u, 128u, 3u, - 128u, 128u, 4u, 128u}; - -static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, - 7u, 128u, 128u, 8u, 128u, 128u, - 9u, 128u, 128u, 10u}; -static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, - 128u, 7u, 128u, 128u, 8u, 128u, - 128u, 9u, 128u, 128u}; -static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, - 128u, 128u, 8u, 128u, 128u, 9u, - 128u, 128u, 10u, 128u}; - -static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, - 12u, 128u, 128u, 13u, 128u, 128u, - 14u, 128u, 128u, 15u}; -static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, - 128u, 13u, 128u, 128u, 14u, 128u, - 128u, 15u, 128u, 128u}; -static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, - 128u, 128u, 13u, 128u, 128u, 14u, - 128u, 128u, 15u, 128u}; - -void MergeRGBRow_SSSE3(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,16(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,32(%3) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "lea 0x10(%2),%2 \n" - "lea 0x30(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskRToRGB0), // %5 - "m"(kShuffleMaskGToRGB0), // %6 - "m"(kShuffleMaskBToRGB0), // %7 - "m"(kShuffleMaskRToRGB1), // %8 - "m"(kShuffleMaskGToRGB1), // %9 - "m"(kShuffleMaskBToRGB1), // %10 - "m"(kShuffleMaskRToRGB2), // %11 - "m"(kShuffleMaskGToRGB2), // %12 - "m"(kShuffleMaskBToRGB2) // %13 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGERGBROW_SSSE3 - -#ifdef HAS_MERGEARGBROW_SSE2 -void MergeARGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, 
- uint8_t* dst_argb, - int width) { - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - - LABELALIGN - "1: \n" - - "movq (%0,%2),%%xmm0 \n" // B - "movq (%0),%%xmm1 \n" // R - "movq (%0,%1),%%xmm2 \n" // G - "punpcklbw %%xmm1,%%xmm0 \n" // BR - "movq (%0,%3),%%xmm1 \n" // A - "punpcklbw %%xmm1,%%xmm2 \n" // GA - "movdqa %%xmm0,%%xmm1 \n" // BR - "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) - "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) - "movdqu %%xmm0,(%4) \n" - "movdqu %%xmm1,16(%4) \n" - - "lea 8(%0),%0 \n" - "lea 32(%4),%4 \n" - "sub $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_MERGEXRGBROW_SSE2 -void MergeXRGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - - "movq (%2),%%xmm0 \n" // B - "movq (%0),%%xmm1 \n" // R - "movq (%1),%%xmm2 \n" // G - "punpcklbw %%xmm1,%%xmm0 \n" // BR - "pcmpeqd %%xmm1,%%xmm1 \n" // A(255) - "punpcklbw %%xmm1,%%xmm2 \n" // GA - "movdqa %%xmm0,%%xmm1 \n" // BR - "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) - "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,16(%3) \n" - - "lea 8(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 32(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEARGBROW_SSE2 - -#ifdef HAS_MERGEARGBROW_AVX2 -void MergeARGBRow_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - - LABELALIGN - "1: \n" - - "vmovdqu (%0,%2),%%xmm0 \n" // B - "vmovdqu (%0,%1),%%xmm1 \n" // R - "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G - "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A - "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" - "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" - "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" - "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%4) \n" // First 8 - "vmovdqu %%ymm1,32(%4) \n" // Next 8 - - "lea 16(%0),%0 \n" - "lea 64(%4),%4 \n" - "sub $0x10,%5 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_MERGEXRGBROW_AVX2 -void MergeXRGBRow_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - - "vmovdqu (%2),%%xmm0 \n" // B - "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255) - "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // R - "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G - "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" - "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" - "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" - "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%3) \n" // First 
8 - "vmovdqu %%ymm1,32(%3) \n" // Next 8 - - "lea 16(%0),%0 \n" - "lea 16(%1),%1 \n" - "lea 16(%2),%2 \n" - "lea 64(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEARGBROW_AVX2 - -#ifdef HAS_SPLITARGBROW_SSE2 -void SplitARGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - - "sub %1,%2 \n" - "sub %1,%3 \n" - "sub %1,%4 \n" - - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 - "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B - "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%1,%3) \n" // B - "movhps %%xmm0,(%1,%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - "movhps %%xmm2,(%1,%4) \n" // A - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "sub $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 - "+rm"(width) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_SPLITXRGBROW_SSE2 -void SplitXRGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 - "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B - "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%3) \n" // B - "movhps %%xmm0,(%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 8(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15}; -#ifdef HAS_SPLITARGBROW_SSSE3 -void SplitARGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - - "movdqa %6,%%xmm3 \n" - "sub %1,%2 \n" - "sub %1,%3 \n" - "sub %1,%4 \n" - - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" 
// 10-1F - "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) - "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%1,%3) \n" // B - "movhps %%xmm0,(%1,%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - "movhps %%xmm2,(%1,%4) \n" // A - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "subl $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 -#if defined(__i386__) - "+m"(width) // %5 -#else - "+rm"(width) // %5 -#endif - : "m"(kShuffleMaskARGBSplit) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} -#endif - -#ifdef HAS_SPLITXRGBROW_SSSE3 -void SplitXRGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - - "movdqa %5,%%xmm3 \n" - - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) - "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%3) \n" // B - "movhps %%xmm0,(%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 8(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskARGBSplit) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} -#endif - -#ifdef HAS_SPLITARGBROW_AVX2 -static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7}; -void SplitARGBRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - - "sub %1,%2 \n" - "sub %1,%3 \n" - "sub %1,%4 \n" - "vmovdqa %7,%%ymm3 \n" - "vbroadcastf128 %6,%%ymm4 \n" - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 00-0F - "vmovdqu 16(%0),%%xmm1 \n" // 10-1F - "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F - "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F - "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" - "vpermd %%ymm0,%%ymm3,%%ymm0 \n" - "vpermd %%ymm1,%%ymm3,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR - "vmovdqu %%xmm0,(%1,%3) \n" // B - "vextracti128 $1,%%ymm0,(%1) \n" // R - "vmovdqu %%xmm2,(%1,%2) \n" // G - "vextracti128 $1,%%ymm2,(%1,%4) \n" // A - "lea 64(%0),%0 \n" - "lea 16(%1),%1 \n" - "subl $0x10,%5 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 -#if defined(__i386__) - "+m"(width) // %5 -#else - "+rm"(width) // %5 -#endif - : "m"(kShuffleMaskARGBSplit), // %6 - "m"(kShuffleMaskARGBPermute) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SPLITXRGBROW_AVX2 -void SplitXRGBRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - - "vmovdqa %6,%%ymm3 \n" - "vbroadcastf128 %5,%%ymm4 \n" - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 00-0F - "vmovdqu 16(%0),%%xmm1 \n" // 10-1F - "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F - "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F - 
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" - "vpermd %%ymm0,%%ymm3,%%ymm0 \n" - "vpermd %%ymm1,%%ymm3,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR - "vmovdqu %%xmm0,(%3) \n" // B - "vextracti128 $1,%%ymm0,(%1) \n" // R - "vmovdqu %%xmm2,(%2) \n" // G - - "lea 64(%0),%0 \n" - "lea 16(%1),%1 \n" - "lea 16(%2),%2 \n" - "lea 16(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskARGBSplit), // %5 - "m"(kShuffleMaskARGBPermute) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_MERGEXR30ROW_AVX2 -void MergeXR30Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width) { - int shift = depth - 10; - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrlw $6,%%ymm6,%%ymm6 \n" - "vmovd %5,%%xmm4 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu (%0,%1),%%ymm1 \n" - "vmovdqu (%0,%2),%%ymm2 \n" - "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n" - "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n" - "vpminuw %%ymm0,%%ymm6,%%ymm0 \n" - "vpminuw %%ymm1,%%ymm6,%%ymm1 \n" - "vpminuw %%ymm2,%%ymm6,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit - "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB - "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n" - "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG - "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" - "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit - "vpslld $0xa,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine - "vpor %%ymm2,%%ymm3,%%ymm3 \n" - "vmovdqu %%ymm0,(%3) \n" - "vmovdqu %%ymm3,0x20(%3) \n" - "lea 0x20(%0),%0 \n" - "lea 0x40(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 -#if defined(__i386__) - : "m"(shift) // %5 -#else - : "rm"(shift) // %5 -#endif - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_MERGEAR64ROW_AVX2 -static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7}; -void MergeAR64Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - mask = (mask << 16) + mask; - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "vmovdqa %8,%%ymm5 \n" - "vmovd %6,%%xmm6 \n" - "vbroadcastss %7,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // R - "vmovdqu (%0,%1),%%ymm1 \n" // G - "vmovdqu (%0,%2),%%ymm2 \n" // B - "vmovdqu (%0,%3),%%ymm3 \n" // A - "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" - "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" - "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" - "vpminuw %%ymm3,%%ymm7,%%ymm3 \n" - "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" - "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" - "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" - "vpsllw %%xmm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm5,%%ymm0 \n" - "vpermd %%ymm1,%%ymm5,%%ymm1 \n" - "vpermd %%ymm2,%%ymm5,%%ymm2 \n" - "vpermd %%ymm3,%%ymm5,%%ymm3 \n" - 
"vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) - "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) - "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) - "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) - "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) - "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) - "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) - "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) - "vmovdqu %%ymm3,(%4) \n" - "vmovdqu %%ymm2,0x20(%4) \n" - "vmovdqu %%ymm4,0x40(%4) \n" - "vmovdqu %%ymm1,0x60(%4) \n" - "lea 0x20(%0),%0 \n" - "lea 0x80(%4),%4 \n" - "subl $0x10,%5 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_ar64), // %4 -#if defined(__i386__) - "+m"(width) // %5 -#else - "+rm"(width) // %5 -#endif - : "m"(shift), // %6 - "m"(mask), // %7 - "m"(MergeAR64Permute) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_MERGEXR64ROW_AVX2 -void MergeXR64Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - mask = (mask << 16) + mask; - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "vmovdqa %7,%%ymm5 \n" - "vmovd %5,%%xmm6 \n" - "vbroadcastss %6,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // R - "vmovdqu (%0,%1),%%ymm1 \n" // G - "vmovdqu (%0,%2),%%ymm2 \n" // B - "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" - "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" - "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" - "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" - "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" - "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" - "vpermd %%ymm0,%%ymm5,%%ymm0 \n" - "vpermd %%ymm1,%%ymm5,%%ymm1 \n" - "vpermd %%ymm2,%%ymm5,%%ymm2 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff) - "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) - "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) - "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) - "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) - "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) - "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) - "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) - "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) - "vmovdqu %%ymm3,(%3) \n" - "vmovdqu %%ymm2,0x20(%3) \n" - "vmovdqu %%ymm4,0x40(%3) \n" - "vmovdqu %%ymm1,0x60(%3) \n" - "lea 0x20(%0),%0 \n" - "lea 0x80(%3),%3 \n" - "subl $0x10,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar64), // %3 - "+r"(width) // %4 - : "m"(shift), // %5 - "m"(mask), // %6 - "m"(MergeAR64Permute) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_MERGEARGB16TO8ROW_AVX2 -static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11, - 4, 12, 5, 13, 6, 14, 7, 15}; -void MergeARGB16To8Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width) { - int shift = depth - 8; - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "vbroadcastf128 %7,%%ymm5 \n" - "vmovd %6,%%xmm6 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // R - "vmovdqu (%0,%1),%%ymm1 \n" // G - "vmovdqu (%0,%2),%%ymm2 \n" // B - "vmovdqu (%0,%3),%%ymm3 \n" // A - "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" - "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" - "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n" - "vpackuswb 
%%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) - "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) - "vmovdqu %%ymm2,(%4) \n" - "vmovdqu %%ymm0,0x20(%4) \n" - "lea 0x20(%0),%0 \n" - "lea 0x40(%4),%4 \n" - "subl $0x10,%5 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 -#if defined(__i386__) - "+m"(width) // %5 -#else - "+rm"(width) // %5 -#endif - : "m"(shift), // %6 - "m"(MergeARGB16To8Shuffle) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 -void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width) { - int shift = depth - 8; - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "vbroadcastf128 %6,%%ymm5 \n" - "vmovd %5,%%xmm6 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" - "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff) - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // R - "vmovdqu (%0,%1),%%ymm1 \n" // G - "vmovdqu (%0,%2),%%ymm2 \n" // B - "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" - "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) - "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) - "vmovdqu %%ymm2,(%3) \n" - "vmovdqu %%ymm0,0x20(%3) \n" - "lea 0x20(%0),%0 \n" - "lea 0x40(%3),%3 \n" - "subl $0x10,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : "m"(shift), // %5 - "m"(MergeARGB16To8Shuffle) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_COPYROW_SSE2 -void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" - - LABELALIGN - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 9f \n" - - LABELALIGN - "2: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 2b \n" - - LABELALIGN "9: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_COPYROW_SSE2 - -#ifdef HAS_COPYROW_AVX -void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_COPYROW_AVX - -#ifdef 
HAS_COPYROW_ERMS -// Multiple of 1. -void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { - size_t width_tmp = (size_t)(width); - asm volatile( - - "rep movsb \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc"); -} -#endif // HAS_COPYROW_ERMS - -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -// width in pixels -void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBCOPYALPHAROW_SSE2 - -#ifdef HAS_ARGBCOPYALPHAROW_AVX2 -// width in pixels -void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "vmovdqu 0x20(%0),%%ymm2 \n" - "lea 0x40(%0),%0 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_ARGBCOPYALPHAROW_AVX2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -// width in pixels -void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0), %%xmm0 \n" - "movdqu 0x10(%0), %%xmm1 \n" - "lea 0x20(%0), %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1), %1 \n" - "sub $0x8, %2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 -static const uvec8 kShuffleAlphaShort_AVX2 = { - 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, - 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; - -void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile( - "vmovdqa %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0), %%ymm0 \n" - "vmovdqu 0x20(%0), %%ymm1 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu 0x40(%0), %%ymm2 \n" - "vmovdqu 0x60(%0), %%ymm3 \n" - "lea 0x80(%0), %0 \n" - "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. 
- "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20, %2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : "m"(kPermdARGBToY_AVX), // %3 - "m"(kShuffleAlphaShort_AVX2) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 - -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -// width in pixels -void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm2 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 - -#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 -// width in pixels -void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - - LABELALIGN - "1: \n" - "vpmovzxbd (%0),%%ymm1 \n" - "vpmovzxbd 0x8(%0),%%ymm2 \n" - "lea 0x10(%0),%0 \n" - "vpslld $0x18,%%ymm1,%%ymm1 \n" - "vpslld $0x18,%%ymm2,%%ymm2 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 - -#ifdef HAS_SETROW_X86 -void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { - size_t width_tmp = (size_t)(width >> 2); - const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
- asm volatile( - - "rep stosl \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); -} - -void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { - size_t width_tmp = (size_t)(width); - asm volatile( - - "rep stosb \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); -} - -void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { - size_t width_tmp = (size_t)(width); - asm volatile( - - "rep stosl \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); -} -#endif // HAS_SETROW_X86 - -#ifdef HAS_YUY2TOYROW_SSE2 -void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - 
"sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_YUY2TOYROW_SSE2 - -#ifdef HAS_YUY2TOYROW_AVX2 -void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - 
"vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_YUY2TOYROW_AVX2 - -#ifdef HAS_ARGBBLENDROW_SSSE3 -// Shuffle table for isolating alpha. 
-static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; - -// Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" - - // 1 pixel loop. - "91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" - "99: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : "m"(kShuffleAlpha) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBBLENDROW_SSSE3 - -#ifdef HAS_BLENDPLANEROW_SSSE3 -// Blend 8 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "mov $0x807f807f,%%eax \n" - "movd %%eax,%%xmm7 \n" - "pshufd $0x0,%%xmm7,%%xmm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%2),%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm0 \n" - "movq (%0,%2,1),%%xmm1 \n" - "movq (%1,%2,1),%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "psubb %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "paddw %%xmm7,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%3,%2,1) \n" - "lea 0x8(%2),%2 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - ::"memory", - "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); -} -#endif // HAS_BLENDPLANEROW_SSSE3 - -#ifdef HAS_BLENDPLANEROW_AVX2 -// Blend 32 pixels at a time. 
-// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_AVX2(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsllw $0x8,%%ymm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm6 \n" - "vbroadcastss %%xmm6,%%ymm6 \n" - "mov $0x807f807f,%%eax \n" - "vmovd %%eax,%%xmm7 \n" - "vbroadcastss %%xmm7,%%ymm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" - - // 32 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%2),%%ymm0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm5,%%ymm3,%%ymm3 \n" - "vpxor %%ymm5,%%ymm0,%%ymm0 \n" - "vmovdqu (%0,%2,1),%%ymm1 \n" - "vmovdqu (%1,%2,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" - "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%3,%2,1) \n" - "lea 0x20(%2),%2 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - ::"memory", - "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_BLENDPLANEROW_AVX2 - -#ifdef HAS_ARGBATTENUATEROW_SSSE3 -// Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, - 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; -static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; -// Attenuate 4 pixels at a time. -void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "pand %%xmm3,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha0), // %3 - "m"(kShuffleAlpha1) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBATTENUATEROW_SSSE3 - -#ifdef HAS_ARGBATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, - 128u, 128u, 14u, 15u, 14u, 15u, - 14u, 15u, 128u, 128u}; -// Attenuate 8 pixels at a time. -void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpslld $0x18,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" - - // 8 pixel loop. 
- LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBATTENUATEROW_AVX2 - -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -// Unattenuate 4 pixels at a time. -void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uintptr_t alpha; - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movzb 0x03(%0),%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x07(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBUNATTENUATEROW_SSE2 - -#ifdef HAS_ARGBUNATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; -// Unattenuate 8 pixels at a time. -void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uintptr_t alpha; - asm volatile( - "sub %0,%1 \n" - "vbroadcastf128 %5,%%ymm5 \n" - - // 8 pixel loop. 
- LABELALIGN - "1: \n" - // replace VPGATHER - "movzb 0x03(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x07(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "movzb 0x13(%0),%3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x17(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x1b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x1f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" - "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" - "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" - "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" - // end of VPGATHER - - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" - "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8), // %4 - "m"(kUnattenShuffleAlpha_AVX2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBUNATTENUATEROW_AVX2 - -#ifdef HAS_ARGBGRAYROW_SSSE3 -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psubb %%xmm5,%%xmm0 \n" - "psubb %%xmm5,%%xmm1 \n" - "movdqu %%xmm4,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "movdqu %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "phaddw %%xmm0,%%xmm6 \n" - "paddw %%xmm5,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm6,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm6,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kSub128) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBGRAYROW_SSSE3 - -#ifdef HAS_ARGBSEPIAROW_SSSE3 -// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 -// Constant for ARGB color to sepia tone -static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, - 17, 68, 35, 0, 17, 68, 35, 0}; - -static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, - 22, 88, 45, 0, 22, 88, 45, 0}; - -static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, - 24, 98, 50, 0, 24, 98, 50, 0}; - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
-void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "m"(kARGBToSepiaB), // %2 - "m"(kARGBToSepiaG), // %3 - "m"(kARGBToSepiaR) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBSEPIAROW_SSSE3 - -#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - asm volatile( - "movdqu (%3),%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm6,0x10(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 - -#ifdef HAS_ARGBQUANTIZEROW_SSE2 -// Quantize 4 ARGB pixels (16 bytes). 
-void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - asm volatile( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu (%0),%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBQUANTIZEROW_SSE2 - -#ifdef HAS_ARGBSHADEROW_SSE2 -// Shade 4 pixels at a time by specified value. -void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - asm volatile( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_ARGBSHADEROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_SSE2 -// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - - "pxor %%xmm5,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_ARGBMULTIPLYROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_AVX2 -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu (%1),%%ymm3 \n" - "lea 0x20(%1),%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_ARGBMULTIPLYROW_AVX2 - -#ifdef HAS_ARGBADDROW_SSE2 -// Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBADDROW_SSE2 - -#ifdef HAS_ARGBADDROW_AVX2 -// Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpaddusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0"); -} -#endif // HAS_ARGBADDROW_AVX2 - -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -// Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBSUBTRACTROW_SSE2 - -#ifdef HAS_ARGBSUBTRACTROW_AVX2 -// Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpsubusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0"); -} -#endif // HAS_ARGBSUBTRACTROW_AVX2 - -#ifdef HAS_SOBELXROW_SSE2 -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x2(%0),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "movq 0x02(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x00(%0,%2,1),%%xmm2 \n" - "movq 0x02(%0,%2,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%3,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELXROW_SSE2 - -#ifdef HAS_SOBELYROW_SSE2 -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x1(%0),%%xmm1 \n" - "movq 0x01(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x2(%0),%%xmm2 \n" - "movq 0x02(%0,%1,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%2,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELYROW_SSE2 - -#ifdef HAS_SOBELROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - - // 8 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "movdqu %%xmm3,0x20(%2) \n" - "movdqu %%xmm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELROW_SSE2 - -#ifdef HAS_SOBELTOPLANEROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into a plane. -void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_SOBELTOPLANEROW_SSE2 - -#ifdef HAS_SOBELXYROW_SSE2 -// Mixes Sobel X, Sobel Y and Sobel into ARGB. -// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6,(%2) \n" - "movdqu %%xmm4,0x10(%2) \n" - "movdqu %%xmm7,0x20(%2) \n" - "movdqu %%xmm1,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_SOBELXYROW_SSE2 - -#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 -// Creates a table of cumulative sums where each value is a sum of all values -// above and to the left of the value, inclusive of the value. -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" - - // 4 pixel loop. 
- LABELALIGN - "40: \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu 0x10(%2),%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu 0x20(%2),%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu 0x30(%2),%%xmm5 \n" - "lea 0x40(%2),%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "movdqu %%xmm4,0x20(%1) \n" - "movdqu %%xmm5,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop. - LABELALIGN - "10: \n" - "movd (%0),%%xmm2 \n" - "lea 0x4(%0),%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "lea 0x10(%2),%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - - "19: \n" - : "+r"(row), // %0 - "+r"(cumsum), // %1 - "+r"(previous_cumsum), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 - -#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count) { - asm volatile( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" - - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" - - // 4 pixel small loop. 
- LABELALIGN - "4: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" - - // 4 pixel loop - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 \n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop - LABELALIGN - "10: \n" - "movdqu (%0),%%xmm0 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - "19: \n" - : "+r"(topleft), // %0 - "+r"(botleft), // %1 - "+r"(dst), // %2 - "+rm"(count) // %3 - : "r"((intptr_t)(width)), // %4 - "rm"(area) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 - -#ifdef HAS_ARGBAFFINEROW_SSE2 -// Copy ARGB pixels from source image with slope to a row of destination. 
-LIBYUV_API -void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* src_dudv, - int width) { - intptr_t src_argb_stride_temp = src_argb_stride; - intptr_t temp; - asm volatile( - "movq (%3),%%xmm2 \n" - "movq 0x08(%3),%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" - - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" - - // 4 pixel loop - LABELALIGN - "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1,(%2) \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0,0x08(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" - - // 1 pixel loop - LABELALIGN - "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x04(%2),%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" - "19: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_stride_temp), // %1 - "+r"(dst_argb), // %2 - "+r"(src_dudv), // %3 - "+rm"(width), // %4 - "=&r"(temp) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBAFFINEROW_SSE2 - -#ifdef HAS_INTERPOLATEROW_SSSE3 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. 
- LABELALIGN - "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_INTERPOLATEROW_SSSE3 - -#ifdef HAS_INTERPOLATEROW_AVX2 -// Bilinear filter 32x2 -> 32x1 -void InterpolateRow_AVX2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - asm volatile( - "cmp $0x0,%3 \n" - "je 100f \n" - "sub %1,%0 \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" - "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" - "vbroadcastss %%xmm4,%%ymm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "vmovdqu (%1),%%ymm0 \n" - "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "vmovdqu (%1),%%ymm0 \n" - "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "rep movsb \n" - "jmp 999f \n" - - "99: \n" - "vzeroupper \n" - "999: \n" - : "+D"(dst_ptr), // %0 - "+S"(src_ptr), // %1 - "+cm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); -} -#endif // HAS_INTERPOLATEROW_AVX2 - -#ifdef HAS_ARGBSHUFFLEROW_SSSE3 -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile( - - "movdqu (%3),%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_ARGBSHUFFLEROW_SSSE3 - -#ifdef HAS_ARGBSHUFFLEROW_AVX2 -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
-void ARGBShuffleRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile( - - "vbroadcastf128 (%3),%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_ARGBSHUFFLEROW_AVX2 - -#ifdef HAS_I422TOYUY2ROW_SSE2 -void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile( - - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "add $0x10,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOYUY2ROW_SSE2 - -#ifdef HAS_I422TOUYVYROW_SSE2 -void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile( - - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "add $0x10,%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOUYVYROW_SSE2 - -#ifdef HAS_I422TOYUY2ROW_AVX2 -void I422ToYUY2Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile( - - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOYUY2ROW_AVX2 - -#ifdef HAS_I422TOUYVYROW_AVX2 -void I422ToUYVYRow_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile( - - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 
\n" - "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" - "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOUYVYROW_AVX2 - -#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - asm volatile( - - "pxor %%xmm3,%%xmm3 \n" - - // 2 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps 0x10(%3),%%xmm0 \n" - "mulps 0x10(%3),%%xmm4 \n" - "addps (%3),%%xmm0 \n" - "addps (%3),%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps 0x20(%3),%%xmm2 \n" - "mulps 0x20(%3),%%xmm6 \n" - "mulps 0x30(%3),%%xmm1 \n" - "mulps 0x30(%3),%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBPOLYNOMIALROW_SSE2 - -#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - asm volatile( - "vbroadcastf128 (%3),%%ymm4 \n" - "vbroadcastf128 0x10(%3),%%ymm5 \n" - "vbroadcastf128 0x20(%3),%%ymm6 \n" - "vbroadcastf128 0x30(%3),%%ymm7 \n" - - // 2 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels - "lea 0x8(%0),%0 \n" - "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats - "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X - "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X - "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X - "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X - "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * - // X - "vcvttps2dq %%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" - "vmovq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBPOLYNOMIALROW_AVX2 - -#ifdef HAS_HALFFLOATROW_SSE2 -static float kScaleBias = 1.9259299444e-34f; -void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - scale *= kScaleBias; - asm volatile( - "movd %3,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" - - // 16 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" // 8 shorts - "add $0x10,%0 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 - "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats - "punpckhwd %%xmm5,%%xmm3 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "psrld $0xd,%%xmm2 \n" - "psrld $0xd,%%xmm3 \n" - "packssdw %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,-0x10(%0,%1,1) \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(scale) // %3 - : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_HALFFLOATROW_SSE2 - -#ifdef HAS_HALFFLOATROW_AVX2 -void HalfFloatRow_AVX2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - scale *= kScaleBias; - asm volatile( - "vbroadcastss %3, %%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" - - // 16 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm2 \n" // 16 shorts - "add $0x20,%0 \n" - "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates - "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vmulps %%ymm3,%%ymm4,%%ymm3 \n" - "vmulps %%ymm2,%%ymm4,%%ymm2 \n" - "vpsrld $0xd,%%ymm3,%%ymm3 \n" - "vpsrld $0xd,%%ymm2,%%ymm2 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates - "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" - "sub $0x10,%2 \n" - "jg 1b \n" - - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 -#if defined(__x86_64__) - : "x"(scale) // %3 -#else - : "m"(scale) // %3 -#endif - : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_HALFFLOATROW_AVX2 - -#ifdef HAS_HALFFLOATROW_F16C -void HalfFloatRow_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - asm volatile( - "vbroadcastss %3, %%ymm4 \n" - "sub %0,%1 \n" - - // 16 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints - "vpmovzxwd 0x10(%0),%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vmulps %%ymm2,%%ymm4,%%ymm2 \n" - "vmulps %%ymm3,%%ymm4,%%ymm3 \n" - "vcvtps2ph $3, %%ymm2, %%xmm2 \n" - "vcvtps2ph $3, %%ymm3, %%xmm3 \n" - "vmovdqu %%xmm2,0x00(%0,%1,1) \n" - "vmovdqu %%xmm3,0x10(%0,%1,1) \n" - "add $0x20,%0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 -#if defined(__x86_64__) - : "x"(scale) // %3 -#else - : "m"(scale) // %3 -#endif - : "memory", "cc", "xmm2", "xmm3", "xmm4"); -} -#endif // HAS_HALFFLOATROW_F16C - -#ifdef HAS_HALFFLOATROW_F16C -void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { - asm volatile( - "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints - "vpmovzxwd 0x10(%0),%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vcvtps2ph $3, %%ymm2, %%xmm2 \n" - "vcvtps2ph $3, %%ymm3, %%xmm3 \n" - "vmovdqu %%xmm2,0x00(%0,%1,1) \n" - "vmovdqu %%xmm3,0x10(%0,%1,1) \n" - "add $0x20,%0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm2", "xmm3"); -} -#endif // HAS_HALFFLOATROW_F16C - -#ifdef HAS_ARGBCOLORTABLEROW_X86 -// Tranform ARGB pixels with color table. -void ARGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - uintptr_t pixel_temp; - asm volatile( - // 1 pixel loop. 
- LABELALIGN - "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "movzb -0x1(%0),%1 \n" - "movzb 0x03(%3,%1,4),%1 \n" - "mov %b1,-0x1(%0) \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); -} -#endif // HAS_ARGBCOLORTABLEROW_X86 - -#ifdef HAS_RGBCOLORTABLEROW_X86 -// Tranform RGB pixels with color table. -void RGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - uintptr_t pixel_temp; - asm volatile( - // 1 pixel loop. - LABELALIGN - "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); -} -#endif // HAS_RGBCOLORTABLEROW_X86 - -#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 -// Tranform RGB pixels with luma table. -void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff) { - uintptr_t pixel_temp; - uintptr_t table_temp; - asm volatile( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%2),%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb (%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,(%3) \n" - "movzb 0x1(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x1(%3) \n" - "movzb 0x2(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x2(%3) \n" - "movzb 0x3(%2),%0 \n" - "mov %b0,0x3(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x4(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x4(%3) \n" - "movzb 0x5(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x5(%3) \n" - "movzb 0x6(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x6(%3) \n" - "movzb 0x7(%2),%0 \n" - "mov %b0,0x7(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x8(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x8(%3) \n" - "movzb 0x9(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x9(%3) \n" - "movzb 0xa(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xa(%3) \n" - "movzb 0xb(%2),%0 \n" - "mov %b0,0xb(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - - "movzb 0xc(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xc(%3) \n" - "movzb 0xd(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xd(%3) \n" - "movzb 0xe(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xe(%3) \n" - "movzb 0xf(%2),%0 \n" - "mov %b0,0xf(%3) \n" - "lea 0x10(%2),%2 \n" - "lea 0x10(%3),%3 \n" - "sub $0x4,%4 \n" - "jg 1b \n" - : "=&d"(pixel_temp), // %0 - "=&a"(table_temp), // %1 - "+r"(src_argb), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : "r"(luma), // %5 - "rm"(lumacoeff) // %6 - : "memory", 
"cc", "xmm0", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 - -#ifdef HAS_NV21TOYUV24ROW_AVX2 - -// begin NV21ToYUV24Row_C avx2 constants -static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, - 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, - 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00}; - -static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80}; - -static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, - 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00}; - -static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, - 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05, - 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, - 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05}; - -static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, - 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, - 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, - 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80}; - -static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, - 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, - 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, - 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f}; - -static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, - 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80, - 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, - 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80}; - -static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, - 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, - 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, - 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a}; - -static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, - 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, - 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, - 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80}; - -// NV21ToYUV24Row_AVX2 -void NV21ToYUV24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - uint8_t* src_y_ptr; - uint64_t src_offset = 0; - uint64_t width64; - - width64 = width; - src_y_ptr = (uint8_t*)src_y; - - asm volatile( - "vmovdqu %5, %%ymm0 \n" // init blend value - "vmovdqu %6, %%ymm1 \n" // init blend value - "vmovdqu %7, %%ymm2 \n" // init blend value - // "sub $0x20, %3 \n" //sub 32 from - // width for final loop - - LABELALIGN - "1: \n" // label 1 - "vmovdqu (%0,%4), %%ymm3 \n" // src_y - "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 - "vmovdqu (%1), %%ymm5 \n" // src_uv - "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf - "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for - // shuf - "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for - // shuf - "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf - "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for - // shuf - "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 - "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 - "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 - "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 - "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const - "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results - "vmovdqu 
%%ymm12, 0x20(%2) \n" // store dst_yuv+20h - "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results - "add $0x20, %4 \n" // add to src buffer - // ptr - "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert - "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert - "vmovdqu %%ymm4, (%2) \n" // store dst_yuv - "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h - "add $0x60,%2 \n" // add to dst buffer - // ptr - // "cmp %3, %4 \n" //(width64 - - // 32 bytes) and src_offset - "sub $0x20,%3 \n" // 32 pixels per loop - "jg 1b \n" - "vzeroupper \n" // sse-avx2 - // transistions - - : "+r"(src_y), //%0 - "+r"(src_vu), //%1 - "+r"(dst_yuv24), //%2 - "+r"(width64), //%3 - "+r"(src_offset) //%4 - : "m"(kBLEND0), //%5 - "m"(kBLEND1), //%6 - "m"(kBLEND2), //%7 - "m"(kSHUF0), //%8 - "m"(kSHUF1), //%9 - "m"(kSHUF2), //%10 - "m"(kSHUF3), //%11 - "m"(kSHUF4), //%12 - "m"(kSHUF5) //%13 - : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", - "xmm13", "xmm14", "xmm15"); -} -#endif // HAS_NV21TOYUV24ROW_AVX2 - -#ifdef HAS_SWAPUVROW_SSSE3 - -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, - 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; - -// Convert UV plane of NV12 to VU of NV21. -void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( - - "movdqu %3,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_SWAPUVROW_SSSE3 - -#ifdef HAS_SWAPUVROW_AVX2 -void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( - - "vbroadcastf128 %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_SWAPUVROW_AVX2 - -void HalfMergeUVRow_SSSE3(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // load 16 U values - "movdqu (%1),%%xmm1 \n" // load 16 V values - "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row - "movdqu 0(%1,%5,1),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // half size - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x10(%1),%1 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" // store 8 UV pixels - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" // 16 src pixels per loop - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // 
%2 - "+r"(width) // %3 - : "r"((intptr_t)(src_stride_u)), // %4 - "r"((intptr_t)(src_stride_v)) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void HalfMergeUVRow_AVX2(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // load 32 U values - "vmovdqu (%1),%%ymm1 \n" // load 32 V values - "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row - "vmovdqu 0(%1,%5,1),%%ymm3 \n" - "lea 0x20(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea 0x20(%1),%1 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels - "lea 0x20(%2),%2 \n" - "sub $0x20,%3 \n" // 32 src pixels per loop - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(src_stride_u)), // %4 - "r"((intptr_t)(src_stride_v)) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { - asm volatile( - "pxor %%xmm1,%%xmm1 \n" - - LABELALIGN - "1: \n" - "movd (%0),%%xmm0 \n" // load float - "maxss %%xmm1, %%xmm0 \n" // clamp to zero - "add 4, %0 \n" - "movd %%xmm0, (%1) \n" // store float - "add 4, %1 \n" - "sub $0x4,%2 \n" // 1 float per loop - "jg 1b \n" - : "+r"(src_x), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -#endif // defined(__x86_64__) || defined(__i386__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_mmi.cc b/thirdparty/libyuv/source/row_mmi.cc deleted file mode 100644 index 362fd1c..0000000 --- a/thirdparty/libyuv/source/row_mmi.cc +++ /dev/null @@ -1,7842 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "libyuv/row.h" - -#include // For memcpy and memset. - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. 
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - const uint64_t mask = 0xff000000ULL; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [mask] "f"(mask) - : "memory"); -} - -void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - uint64_t src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xff000000ULL; - const uint64_t mask2 = 0xc6; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], %[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], %[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width) - : "memory"); -} - -void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 0x6c; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t" - "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t" - "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - 
"pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[src1], %[src1], %[zero] \n\t" - "pextrh %[ftmp2], %[ftmp0], %[three] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[one] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[two] \n\t" - "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pextrh %[ftmp2], %[src1], %[zero] \n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t" - - "daddiu %[src_raw], %[src_raw], 0x0c \n\t" - "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]) - : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03) - : "memory"); -} - -void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[5]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[c1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]) - : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), - [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - uint64_t c4 = 0x0001000100010001; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 
0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psrlh %[a], %[src1], %[seven] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "xor %[a], %[a], %[c1] \n\t" - "paddb %[a], %[a], %[c4] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psrlh %[a], %[src1], %[four] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "psllh %[src0], %[a], %[four] \n\t" - "or %[a], %[src0], %[a] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08), - [four] "f"(0x04) - : "memory"); -} - -void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* 
dst_rgb, int width) { - uint64_t src; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width) - : "memory"); -} - -void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 0x18; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[ftmp2], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - - "pextrh %[src0], %[ftmp1], %[two] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t" - "pshufh %[ftmp1], %[ftmp1], %[one] \n\t" - - "pextrh %[src0], %[ftmp2], %[two] \n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[one] \n\t" - "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[zero] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pinsrh_0 %[src1], %[src1], %[src0] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02) - : "memory"); -} - -void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[two] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - - 
"psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[eleven] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05), - [eleven] "f"(0x0b) - : "memory"); -} - -// dither4 is a row of 4 values from 4x4 dither matrix. -// The 4x4 matrix contains values to increase RGB. When converting to -// fewer bits (565) this provides an ordered dither. -// The order in the 4x4 matrix in first byte is upper left. -// The 4 values are passed as an int, then referenced as an array, so -// endian will not affect order of the original matrix. But the dither4 -// will containing the first pixel in the lower byte for little endian -// or the upper byte for big endian. -void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - uint64_t c0 = 0x00ff00ff00ff00ff; - - __asm__ volatile( - "punpcklbh %[dither], %[dither], %[zero] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - - "paddh %[b], %[b], %[dither] \n\t" - "paddh %[g], %[g], %[dither] \n\t" - "paddh %[r], %[r], %[dither] \n\t" - "pcmpgth %[src0], %[b], %[c0] \n\t" - "or %[src0], %[src0], %[b] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "pcmpgth %[src0], %[g], %[c0] \n\t" - "or %[src0], %[src0], %[g] \n\t" - "and %[g], %[src0], %[c0] \n\t" - "pcmpgth %[src0], %[r], %[c0] \n\t" - "or %[src0], %[src0], %[r] \n\t" - "and %[r], %[src0], %[c0] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[two] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[eleven] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02), - [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b) - : "memory"); -} - -void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - 
"punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[three] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - "psrlh %[a], %[a], %[seven] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[ten] \n\t" - "psllh %[a], %[a], %[fifteen] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05), - [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f) - : "memory"); -} - -void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[four] \n\t" - "psrlh %[g], %[g], %[four] \n\t" - "psrlh %[r], %[r], %[four] \n\t" - "psrlh %[a], %[a], %[four] \n\t" - - "psllh %[g], %[g], %[four] \n\t" - "psllh %[r], %[r], %[eight] \n\t" - "psllh %[a], %[a], %[twelve] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08), - [twelve] "f"(0x0c) - : "memory"); -} - -void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001004200810019; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], 
%[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ARGBToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] 
\n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 
0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], 
%[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0019008100420001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - 
"paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void BGRAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], 
%[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], 
%[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] 
"=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001001900810042; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] 
"=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ABGRToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002F00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], 
%[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], 
%[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0042008100190001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], 
%[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RGBAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] 
\n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" - "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" - "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" 
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" - "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" - "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh 
%[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001004200810019; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" - 
"gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x18 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RGB24ToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - 
"paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], 
%[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] 
\n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001001900810042; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] 
\n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x18 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RAWToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - 
"paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] 
\n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd 
%[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest, dest0, dest1, dest2, dest3; - uint64_t tmp0, tmp1; - const uint64_t shift = 0x08; - const uint64_t value = 0x80; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x0001004D0096001DULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest2], %[dest2], %[shift] 
\n\t" - - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest3], %[dest3], %[shift] \n\t" - - "packsswh %[tmp0], %[dest0], %[dest1] \n\t" - "packsswh %[tmp1], %[dest2], %[dest3] \n\t" - "packushb %[dest], %[tmp0], %[tmp1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), - [tmp1] "=&f"(tmp1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), - [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), - [width] "r"(width) - : "memory"); -} - -void ARGBToUVJRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0015002a003f0002; - const uint64_t mask_v = 0x0002003f0035000a; - - __asm__ volatile( - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], 
%[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], 
%[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 
0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 
%[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] 
\n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08), - [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x1080108010801080; - uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 
0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04) - : "memory"); -} - -void RGB565ToUVRow_MMI(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[13]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest0_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "psrlh %[src0], %[src0], 
%[three] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest1_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] 
\n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest2_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest2_v], %[src0], %[c2] \n\t" - "psllh %[dest2_v], %[dest2_v], %[three] \n\t" - "or %[dest2_v], %[src1], %[dest2_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest3_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest3_v], %[src0], %[c2] \n\t" - "psllh %[dest3_v], %[dest3_v], %[three] \n\t" - "or %[dest3_v], %[src1], %[dest3_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh 
%[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t" - "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [one] "f"(0x01) - : "memory"); -} - -void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest0_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], 
%[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest1_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw 
%[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest2_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh 
%[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest3_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[dest0_u], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t" - "packushb %[dest0_v], %[dest1_u], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t" - "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), - [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), - [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [two] "f"(0x02), [one] "f"(0x01) - : "memory"); -} - -void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int 
width) { - uint64_t ftmp[13]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest0_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest0_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest1_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest1_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - 
"or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest2_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest2_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest3_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - 
"and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest3_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t" - "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_argb4444] "r"(src_argb4444), - [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), - [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u), - [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04), - [two] "f"(0x02) - : "memory"); -} - -void ARGBToUV444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 
%[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], 
%[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), - [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), - [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), - [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), - [dest3_v] "=&f"(ftmp[11]) - : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10), - [eight] "f"(0x08) - : "memory"); -} - -void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi; - uint64_t tmp0, tmp1; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x01; - const uint64_t mask2 = 0x0080004D0096001DULL; - const uint64_t mask3 = 0xFF000000FF000000ULL; - const uint64_t mask4 = ~mask3; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "and %[src37], %[src], %[mask3] \n\t" - - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t" - "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t" - "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t" - - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t" - "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t" - "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t" - "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask4] \n\t" - "or %[dest], %[dest], %[src37] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0), - [tmp1] "=&f"(tmp1), 
[src] "=&f"(src), [dest] "=&f"(dest), - [src37] "=&f"(src37) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4) - : "memory"); -} - -// Convert a row of image to Sepia tone. -void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) { - uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2; - uint64_t tmp0, tmp1; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x002300440011ULL; - const uint64_t mask2 = 0x002D00580016ULL; - const uint64_t mask3 = 0x003200620018ULL; - const uint64_t mask4 = 0xFF000000FF000000ULL; - const uint64_t shift = 0x07; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "and %[dest37], %[dest], %[mask4] \n\t" - - "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t" - "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t" - "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t" - "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t" - "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" - "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" - "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - - "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t" - "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t" - "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t" - "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t" - "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" - "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" - "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "or %[dest], %[dest], %[dest37] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), - [dest] "=&f"(dest) - : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), - [mask4] "f"(mask4), [shift] "f"(shift) - : "memory"); -} - -// Apply color matrix to a row of image. Matrix is signed. -// TODO(fbarchard): Consider adding rounding (+32). 
-void ARGBColorMatrixRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2, - dest3; - uint64_t matrix, matrix_hi, matrix_lo; - uint64_t tmp0, tmp1; - const uint64_t shift0 = 0x06; - const uint64_t shift1 = 0x08; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - - "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psraw %[dest0], %[dest0], %[shift0] \n\t" - - "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psraw %[dest1], %[dest1], %[shift0] \n\t" - - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - - "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest2], %[tmp0], %[tmp1] \n\t" - "psraw %[dest2], %[dest2], %[shift0] \n\t" - - "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest3], %[tmp0], %[tmp1] \n\t" - "psraw %[dest3], %[dest3], %[shift0] \n\t" - - "packsswh %[tmp0], %[dest0], %[dest1] \n\t" - "packsswh %[tmp1], %[dest2], %[dest3] \n\t" - 
"packushb %[dest], %[tmp0], %[tmp1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest), - [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi), - [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix) - : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb), - [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0), - [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1) - : "memory"); -} - -void ARGBShadeRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[src] \n\t" - "punpckhbh %[src_hi], %[src], %[src] \n\t" - - "punpcklbh %[value], %[value], %[value] \n\t" - - "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src), - [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [value] "f"(value), [shift] "f"(shift) - : "memory"); -} - -void ARGBMultiplyRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo; - uint64_t dest, dest_lo, dest_hi; - const uint64_t mask = 0x0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[src0] \n\t" - "punpckhbh %[src0_hi], %[src0], %[src0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask] \n\t" - - "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t" - "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), - [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) - : "memory"); -} - -void 
ARGBAddRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "paddusb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); -} - -void ARGBSubtractRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "psubusb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); -} - -// Sobel functions which mimics SSSE3. 
-void SobelXRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - uint64_t y00 = 0, y10 = 0, y20 = 0; - uint64_t y02 = 0, y12 = 0, y22 = 0; - uint64_t zero = 0x0; - uint64_t sobel = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] - "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2] - "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i] - "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2] - "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" - - "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i] - "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t" - "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2] - "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y20], %[y20], %[zero] \n\t" - - "punpcklbh %[y02], %[y02], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - "punpcklbh %[y22], %[y22], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y10] \n\t" // a+b - "paddh %[y20], %[y20], %[y10] \n\t" // c+b - "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c - - "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub - "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub - "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub - - "pmaxsh %[y10], %[y00], %[y02] \n\t" - "pminsh %[y20], %[y00], %[y02] \n\t" - "psubh %[sobel], %[y10], %[y20] \n\t" // Abs - - "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" - "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" - "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" - "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" - "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" - - "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t" - "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t" - "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t" - "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y20], %[y20], %[zero] \n\t" - - "punpcklbh %[y02], %[y02], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - "punpcklbh %[y22], %[y22], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y10] \n\t" - "paddh %[y20], %[y20], %[y10] \n\t" - "paddh %[y00], %[y00], %[y20] \n\t" - - "paddh %[y02], %[y02], %[y12] \n\t" - "paddh %[y22], %[y22], %[y12] \n\t" - "paddh %[y02], %[y02], %[y22] \n\t" - - "pmaxsh %[y10], %[y00], %[y02] \n\t" - "pminsh %[y20], %[y00], %[y02] \n\t" - "psubh %[y00], %[y10], %[y20] \n\t" - - "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t" - "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t" - - "daddiu %[src_y0], %[src_y0], 8 \n\t" - "daddiu %[src_y1], %[src_y1], 8 \n\t" - "daddiu %[src_y2], %[src_y2], 8 \n\t" - "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10), - [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22) - : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2), - [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero) - : "memory"); -} - -void SobelYRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - uint64_t y00 = 0, y01 = 0, y02 = 0; - uint64_t y10 = 0, y11 
= 0, y12 = 0; - uint64_t zero = 0x0; - uint64_t sobel = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] - "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" - "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1] - "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2] - "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i] - "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" - "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1] - "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2] - "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y01], %[y01], %[zero] \n\t" - "punpcklbh %[y02], %[y02], %[zero] \n\t" - - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y11], %[y11], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y01] \n\t" // a+b - "paddh %[y02], %[y02], %[y01] \n\t" // c+b - "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c - - "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub - "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub - "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub - - "pmaxsh %[y02], %[y00], %[y10] \n\t" - "pminsh %[y12], %[y00], %[y10] \n\t" - "psubh %[sobel], %[y02], %[y12] \n\t" // Abs - - "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" - "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" - "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t" - "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" - "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" - "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" - "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t" - "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" - "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y01], %[y01], %[zero] \n\t" - "punpcklbh %[y02], %[y02], %[zero] \n\t" - - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y11], %[y11], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y01] \n\t" - "paddh %[y02], %[y02], %[y01] \n\t" - "paddh %[y00], %[y00], %[y02] \n\t" - - "paddh %[y10], %[y10], %[y11] \n\t" - "paddh %[y12], %[y12], %[y11] \n\t" - "paddh %[y10], %[y10], %[y12] \n\t" - - "pmaxsh %[y02], %[y00], %[y10] \n\t" - "pminsh %[y12], %[y00], %[y10] \n\t" - "psubh %[y00], %[y02], %[y12] \n\t" - - "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t" - "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t" - - "daddiu %[src_y0], %[src_y0], 8 \n\t" - "daddiu %[src_y1], %[src_y1], 8 \n\t" - "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01), - [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12) - : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), - [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero) - : "memory"); -} - -void SobelRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - double temp[3]; - uint64_t c1 = 0xff000000ff000000; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i] - "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] - "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t" - // s7 s6 s5 
s4 s3 s2 s1 s0 = a+b - "paddusb %[t2] , %[t0], %[t1] \n\t" - - // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0 - "punpcklbh %[t0], %[t2], %[t2] \n\t" - - // s1 s1 s0 s0->s1 s2 s1 s1 s0 s0 s0 s0 - "punpcklbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - // 255 s1 s1 s1 s55 s0 s0 s0 - "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t" - - // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2 - "punpckhbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - // 255 s3 s3 s3 255 s2 s2 s2 - "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t" - - // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4 - "punpckhbh %[t0], %[t2], %[t2] \n\t" - - // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4 - "punpcklbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t" - - // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6 - "punpckhbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t" - - "daddiu %[dst_argb], %[dst_argb], 32 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) - : "memory"); -} - -void SobelToPlaneRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - uint64_t tr = 0; - uint64_t tb = 0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t" - "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t" - "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i] - "paddusb %[tr], %[tr], %[tb] \n\t" // g - "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t" - - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [tr] "=&f"(tr), [tb] "=&f"(tb) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_y] "r"(dst_y), [width] "r"(width) - : "memory"); -} - -void SobelXYRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - uint64_t temp[3]; - uint64_t result = 0; - uint64_t gb = 0; - uint64_t cr = 0; - uint64_t c1 = 0xffffffffffffffff; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t" - "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] - "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t" - "paddusb %[tg] , %[tr], %[tb] \n\t" // g - - // g3 b3 g2 b2 g1 b1 g0 b0 - "punpcklbh %[gb], %[tb], %[tg] \n\t" - // c3 r3 r2 r2 c1 r1 c0 r0 - "punpcklbh %[cr], %[tr], %[c1] \n\t" - // c1 r1 g1 b1 c0 r0 g0 b0 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t" - // c3 r3 g3 b3 c2 r2 g2 b2 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t" - - // g7 b7 g6 b6 g5 b5 g4 b4 - "punpckhbh %[gb], %[tb], %[tg] \n\t" - // c7 r7 c6 r6 c5 r5 c4 r4 - "punpckhbh %[cr], %[tr], %[c1] \n\t" - // c5 r5 g5 b5 c4 r4 g4 
b4 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t" - // c7 r7 g7 b7 c6 r6 g6 b6 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t" - - "daddiu %[dst_argb], %[dst_argb], 32 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]), - [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) - : "memory"); -} - -void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { - // Copy a Y to RGB. - uint64_t src, dest; - const uint64_t mask0 = 0x00ffffff00ffffffULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src], %[src], %[src] \n\t" - "punpcklhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -// TODO - respect YuvConstants -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, - const struct YuvConstants*, int width) { - uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x55; - const uint64_t mask2 = 0xAA; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = 0x4A354A354A354A35ULL; - const uint64_t mask5 = 0x0488048804880488ULL; - const uint64_t shift0 = 0x08; - const uint64_t shift1 = 0x06; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - - "pshufh %[src], %[src_lo], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_lo], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh 
%[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), - [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0), - [shift1] "f"(shift1), [width] "r"(width) - : "memory"); -} - -void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x1b; - - src += width - 1; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[source], 0(%[src_ptr]) \n\t" - "gsldrc1 %[source], -7(%[src_ptr]) \n\t" - "punpcklbh %[src0], %[source], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask1] \n\t" - "punpckhbh %[src1], %[source], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "packushb %[dest], %[src1], %[src0] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - 
"daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void MirrorSplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src0, src1, dest0, dest1; - const uint64_t mask0 = 0x00ff00ff00ff00ffULL; - const uint64_t mask1 = 0x1b; - const uint64_t shift = 0x08; - - src_uv += (width - 1) << 1; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 1(%[src_ptr]) \n\t" - "gsldrc1 %[src0], -6(%[src_ptr]) \n\t" - "gsldlc1 %[src1], -7(%[src_ptr]) \n\t" - "gsldrc1 %[src1], -14(%[src_ptr]) \n\t" - - "and %[dest0], %[src0], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "and %[dest1], %[src1], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t" - - "psrlh %[dest0], %[src0], %[shift] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "psrlh %[dest1], %[src1], %[shift] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x10 \n\t" - "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t" - "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v), - [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [shift] "f"(shift) - : "memory"); -} - -void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - src += (width - 1) * 4; - uint64_t temp = 0x0; - uint64_t shuff = 0x4e; // 01 00 11 10 - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[temp], 3(%[src]) \n\t" - "gsldrc1 %[temp], -4(%[src]) \n\t" - "pshufh %[temp], %[temp], %[shuff] \n\t" - "gssdrc1 %[temp], 0x0(%[dst]) \n\t" - "gssdlc1 %[temp], 0x7(%[dst]) \n\t" - - "daddiu %[src], %[src], -0x08 \n\t" - "daddiu %[dst], %[dst], 0x08 \n\t" - "daddiu %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [temp] "=&f"(temp) - : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff) - : "memory"); -} - -void SplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t" - - "and %[t2], %[t0], %[c0] \n\t" - "and %[t3], %[t1], %[c0] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t" - - "psrlh %[t2], %[t0], %[shift] \n\t" - "psrlh %[t3], %[t1], %[shift] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t" - - "daddiu %[src_uv], %[src_uv], 16 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [t3] "=&f"(temp[3]) - : [src_uv] 
"r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -void MergeUVRow_MMI(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - uint64_t temp[3]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_u]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_u]) \n\t" - "gsldrc1 %[t1], 0x0(%[src_v]) \n\t" - "gsldlc1 %[t1], 0x7(%[src_v]) \n\t" - "punpcklbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t" - "punpckhbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t" - - "daddiu %[src_u], %[src_u], 8 \n\t" - "daddiu %[src_v], %[src_v], 8 \n\t" - "daddiu %[dst_uv], %[dst_uv], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) - : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v), - [width] "r"(width) - : "memory"); -} - -void SplitRGBRow_MMI(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - uint64_t src[4]; - uint64_t dest_hi, dest_lo, dest; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[src1] \n\t" - "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src2], %[src3] \n\t" - - "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t" - "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t" - "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t" - "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t" - "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t" - "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]), - [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g), - [dstb_ptr] "r"(dst_b), [width] "r"(width) - : "memory"); -} - -void MergeRGBRow_MMI(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - uint64_t srcr, srcg, srcb, dest; - uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo; - const uint64_t temp = 0x0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t" - "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t" - "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t" - "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t" - "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t" - "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t" - - "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t" - "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t" - "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t" - "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t" - - "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhwd 
%[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t" - "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t" - - "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t" - "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t" - "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb), - [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi), - [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi), - [srcbz_lo] "=&f"(srcbz_lo) - : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b), - [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp) - : "memory"); -} - -// Filter 2 rows of YUY2 UV's (422) into U and V (420). -void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 
0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 UV's (422) into U and V (422). -void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 Y's (422) into Y (420/422). -void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0) - : "memory"); -} - -// Filter 2 rows of UYVY UV's (422) into U and V (420). 
-void UYVYToUVRow_MMI(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY UV's (422) into U and V (422). -void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. 
- uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - // Output a row of Y values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t shift = 0x08; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "dsrl %[t0], %[t0], %[shift] \n\t" - "dsrl %[t1], %[t1], %[shift] \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Blend src_argb over src_argb1 and store to dst_argb. -// dst_argb may be src_argb or src_argb1. -// This code mimics the SSSE3 version for better testability. 
-void ARGBBlendRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi, - dest_lo; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_lo] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t" - - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_hi] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[mask4] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void BlendPlaneRow_MMI(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - uint64_t source0, source1, dest, alph; - uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi, - dest_lo; - uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t" - "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t" - "psubusb %[alpha_r], %[mask1], %[alpha] \n\t" - "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t" - "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t" - "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t" - "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t" - - "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t" - "pmullh %[dest], %[src1_lo], 
%[alpha_rlo] \n\t" - "paddush %[dest_lo], %[dest_lo], %[dest] \n\t" - "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - - "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t" - "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t" - "paddush %[dest_hi], %[dest_hi], %[dest] \n\t" - "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi), - [alpha_r] "=&f"(alpha_rev) - : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha), - [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -// Multiply source RGB by alpha and store to destination. -// This code mimics the SSSE3 version for better testability. -void ARGBAttenuateRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha; - const uint64_t mask0 = 0xFF; - const uint64_t mask1 = 0xFF000000FF000000ULL; - const uint64_t mask2 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[src] \n\t" - "punpckhbh %[src_hi], %[src], %[src] \n\t" - - "pshufh %[alpha], %[src_lo], %[mask0] \n\t" - "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "pshufh %[alpha], %[src_hi], %[mask0] \n\t" - "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask2] \n\t" - "and %[src], %[src], %[mask1] \n\t" - "or %[dest], %[dest], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift), - [width] "r"(width) - : "memory"); -} - -void ComputeCumulativeSumRow_MMI(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - int64_t row_sum[2] = {0, 0}; - uint64_t src, dest0, dest1, presrc0, presrc1, dest; - const uint64_t mask = 0x0; - - __asm__ volatile( - "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t" - "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t" - - "1: \n\t" - "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t" - "gslwrc1 %[src], 
0x00(%[row_ptr]) \n\t" - - "punpcklbh %[src], %[src], %[mask] \n\t" - "punpcklhw %[dest0], %[src], %[mask] \n\t" - "punpckhhw %[dest1], %[src], %[mask] \n\t" - - "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t" - "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t" - - "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t" - "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t" - - "paddw %[dest0], %[row_sum0], %[presrc0] \n\t" - "paddw %[dest1], %[row_sum1], %[presrc1] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t" - "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]), - [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0), - [presrc1] "=&f"(presrc1) - : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum), - [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask) - : "memory"); -} - -// C version 2x2 -> 2x1. -void InterpolateRow_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - if (source_y_fraction == 0) { - __asm__ volatile( - "1: \n\t" - "ld $t0, 0x0(%[src_ptr]) \n\t" - "sd $t0, 0x0(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : - : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width) - : "memory"); - return; - } - if (source_y_fraction == 128) { - uint64_t uv = 0x0; - uint64_t uv_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t" - "daddu $t0, %[src_ptr], %[stride] \n\t" - "gsldrc1 %[uv_stride], 0x0($t0) \n\t" - "gsldlc1 %[uv_stride], 0x7($t0) \n\t" - - "pavgb %[uv], %[uv], %[uv_stride] \n\t" - "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [stride] "r"((int64_t)src_stride) - : "memory"); - return; - } - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint64_t temp; - uint64_t data[4]; - uint64_t zero = 0x0; - uint64_t c0 = 0x0080008000800080; - uint64_t fy0 = 0x0100010001000100; - uint64_t shift = 0x8; - __asm__ volatile( - "pshufh %[fy1], %[fy1], %[zero] \n\t" - "psubh %[fy0], %[fy0], %[fy1] \n\t" - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t" - "punpcklbh %[d0], %[t0], %[zero] \n\t" - "punpckhbh %[d1], %[t0], %[zero] \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t" - "punpcklbh %[d2], %[t0], %[zero] \n\t" - "punpckhbh %[d3], %[t0], %[zero] \n\t" - - "pmullh %[d0], %[d0], %[fy0] \n\t" - "pmullh %[d2], %[d2], %[fy1] \n\t" - "paddh %[d0], %[d0], %[d2] \n\t" - "paddh %[d0], %[d0], %[c0] \n\t" - "psrlh %[d0], %[d0], %[shift] \n\t" - - "pmullh %[d1], %[d1], %[fy0] \n\t" - "pmullh %[d3], %[d3], %[fy1] \n\t" - "paddh %[d1], %[d1], %[d3] 
\n\t" - "paddh %[d1], %[d1], %[c0] \n\t" - "psrlh %[d1], %[d1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d1] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), - [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1), - [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0), - [shift] "f"(shift), [zero] "f"(zero) - : "memory"); -} - -// Use first 4 shuffler values to reorder ARGB channels. -void ARGBShuffleRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) | - ((shuffler[2] & 0x03) << 4) | - ((shuffler[3] & 0x03) << 6); - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[src], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "punpckhbh %[dest1], %[src], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest], %[dest0], %[dest1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I422ToYUY2Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - "daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void I422ToUYVYRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // 
r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - "daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, dest; - const uint64_t mask0 = 0xff000000ff000000ULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "and %[src], %[src], %[mask0] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[src], %[dest] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - uint64_t src, dest0, dest1, dest_lo, dest_hi, dest; - const uint64_t mask = 0xff000000ff000000ULL; - const uint64_t shift = 0x18; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void ARGBCopyYToAlphaRow_MMI(const uint8_t* 
src, uint8_t* dst, int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00ffffff00ffffffULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "punpckhbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I444ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - __asm__ volatile ( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - "punpcklbh %[u], %[u], %[zero] \n\t"//u - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - "punpcklbh %[v], %[v], %[zero] 
\n\t"//v - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// Also used for 420 -void I422ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] 
\n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t"//v - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// 10 bit YUV to ARGB -void I210ToARGBRow_MMI(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "psllh %[y], %[y], %[six] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklhw %[u], %[u], %[u] \n\t" - "psrah %[u], %[u], %[two] \n\t" - "punpcklhw %[v], %[v], %[v] \n\t" - "psrah %[v], %[v], %[two] \n\t" - "pminsh %[u], %[u], %[mask1] \n\t" - "pminsh %[v], %[v], %[mask1] \n\t" - - "paddsh %[b_vec0], %[y], 
%[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask), [two]"f"(0x02), - [mask1]"f"(0x00ff00ff00ff00ff) - : "memory" - ); -} - -void I422AlphaToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v,a; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" - "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" 
- "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[a] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), [a]"=&f"(a), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [a_ptr]"r"(src_a), [zero]"f"(0x00), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -void I422ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] 
\n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - - "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" - "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" - "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" - "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" - "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" - "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" - "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" - "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" - "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(mask), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); -} - -void I422ToARGB4444Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: 
\n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "and %[g_vec], %[g_vec], %[mask1] \n\t" - "psrlw %[g_vec], %[g_vec], %[four] \n\t" - "psrlw %[r_vec], %[g_vec], %[four] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[g_vec], %[g_vec], %[r_vec] \n\t" - - "and %[b_vec], %[b_vec], %[mask1] \n\t" - "psrlw %[b_vec], %[b_vec], %[four] \n\t" - "psrlw %[r_vec], %[b_vec], %[four] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[b_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), - [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), - [alpha]"f"(-1) - : "memory" - ); -} - -void I422ToARGB1555Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], 
%[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlw %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "or %[g_vec], %[g_vec], %[mask3] \n\t" - - "psrlw %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "or %[b_vec], %[b_vec], %[mask3] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - 
[v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [mask3]"f"(0x800000008000), - [lmove5]"f"(0x5) - : "memory" - ); -} - -void I422ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw 
%[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7), - [lmove5]"f"(0x5) - : "memory" - ); -} - -void NV12ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) 
\n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); -} - -void NV21ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - 
[ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); -} - -void NV12ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - 
[r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [lmove1]"f"(0x18), - [one]"f"(0x1), [rmove1]"f"(0x8) - : "memory" - ); -} - -void NV21ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : 
[y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); -} - -void NV12ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw 
%[r_vec], %[r_vec], %[y] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7) - : "memory" - ); -} - -void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" - "psrlh %[temp], %[y], %[eight] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[y], %[y], %[temp] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], 
%[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[temp], %[y], %[temp] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[y], %[y], %[eight] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), 
[g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void I422ToRGBARow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" - "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" - "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), 
[rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [alpha]"f"(-1) - : "memory" - ); -} - -void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { - __asm__ volatile ( - "punpcklwd %[v32], %[v32], %[v32] \n\t" - "1: \n\t" - "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" - - "daddi %[width], %[width], -0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "bnez %[width], 1b \n\t" - : [v32]"+&f"(v32) - : [dst_ptr]"r"(dst_argb), [width]"r"(width) - : "memory" - ); -} -// clang-format on - -// 10 bit YUV to ARGB -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_msa.cc b/thirdparty/libyuv/source/row_msa.cc deleted file mode 100644 index c0b13b0..0000000 --- a/thirdparty/libyuv/source/row_msa.cc +++ /dev/null @@ -1,3620 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "libyuv/row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "libyuv/macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define ALPHA_VAL (-1) - -// Fill YUV -> RGB conversion constants into vectors -#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \ - { \ - ub = __msa_fill_w(yuvconst->kUVToB[0]); \ - vr = __msa_fill_w(yuvconst->kUVToR[1]); \ - ug = __msa_fill_w(yuvconst->kUVToG[0]); \ - vg = __msa_fill_w(yuvconst->kUVToG[1]); \ - bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \ - bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \ - br = __msa_fill_w(yuvconst->kUVBiasR[0]); \ - yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ - } - -// Load YUV 422 pixel data -#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ - { \ - uint64_t y_m; \ - uint32_t u_m, v_m; \ - v4i32 zero_m = {0}; \ - y_m = LD(psrc_y); \ - u_m = LW(psrc_u); \ - v_m = LW(psrc_v); \ - out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \ - out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \ - out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \ - } - -// Clip input vector elements between 0 to 255 -#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \ - { \ - v4i32 max_m = __msa_ldi_w(0xFF); \ - \ - in0 = __msa_maxi_s_w(in0, 0); \ - in1 = __msa_maxi_s_w(in1, 0); \ - in2 = __msa_maxi_s_w(in2, 0); \ - in3 = __msa_maxi_s_w(in3, 0); \ - in4 = __msa_maxi_s_w(in4, 0); \ - in5 = __msa_maxi_s_w(in5, 0); \ - in0 = __msa_min_s_w(max_m, in0); \ - in1 = __msa_min_s_w(max_m, in1); \ - in2 = __msa_min_s_w(max_m, in2); \ - in3 = __msa_min_s_w(max_m, in3); \ - in4 = __msa_min_s_w(max_m, in4); \ - in5 = __msa_min_s_w(max_m, in5); \ - } - -// Convert 8 pixels of YUV 420 to RGB. 
-#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ - { \ - v8i16 vec0_m, vec1_m; \ - v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ - v4i32 reg5_m, reg6_m, reg7_m; \ - v16i8 zero_m = {0}; \ - \ - vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ - vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ - reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ - reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ - reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ - reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ - reg0_m *= yg; \ - reg1_m *= yg; \ - reg2_m *= ubvr; \ - reg3_m *= ubvr; \ - reg0_m = __msa_srai_w(reg0_m, 16); \ - reg1_m = __msa_srai_w(reg1_m, 16); \ - reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ - reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ - reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ - reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ - reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ - reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ - reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ - reg5_m = reg0_m - reg5_m; \ - reg6_m = reg1_m - reg6_m; \ - reg2_m = reg0_m - reg2_m; \ - reg3_m = reg1_m - reg3_m; \ - reg7_m = reg0_m - reg7_m; \ - reg4_m = reg1_m - reg4_m; \ - reg5_m += bb; \ - reg6_m += bb; \ - reg7_m += bg; \ - reg4_m += bg; \ - reg2_m += br; \ - reg3_m += br; \ - reg5_m = __msa_srai_w(reg5_m, 6); \ - reg6_m = __msa_srai_w(reg6_m, 6); \ - reg7_m = __msa_srai_w(reg7_m, 6); \ - reg4_m = __msa_srai_w(reg4_m, 6); \ - reg2_m = __msa_srai_w(reg2_m, 6); \ - reg3_m = __msa_srai_w(reg3_m, 6); \ - CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ - out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ - out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ - out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ - } - -// Pack and Store 8 ARGB values. -#define STOREARGB(in0, in1, in2, in3, pdst_argb) \ - { \ - v8i16 vec0_m, vec1_m; \ - v16u8 dst0_m, dst1_m; \ - vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ - vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ - dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ - dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ - ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ - } - -// Takes ARGB input and calculates Y. 
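// [Editor's sketch, not part of the original file] Scalar view of what the
// ARGBTOY macro below computes for one pixel, using the coefficients that
// ARGBToYRow_MSA later passes in (25 for B, 129 for G, 66 for R, bias 0x1080,
// shift 8); other callers supply different constants. ScalarArgbToY is an
// illustrative name, not an upstream helper.
static inline uint8_t ScalarArgbToY(uint8_t b, uint8_t g, uint8_t r) {
  // 0x1080 folds in the +16 luma offset plus rounding before the >> 8.
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}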
-#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \ - y_out) \ - { \ - v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \ - v8u16 reg0_m, reg1_m; \ - \ - vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \ - vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \ - vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \ - vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \ - reg0_m = __msa_dotp_u_h(vec0_m, const0); \ - reg1_m = __msa_dotp_u_h(vec1_m, const0); \ - reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \ - reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \ - reg0_m += const2; \ - reg1_m += const2; \ - reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \ - reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \ - y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ - } - -// Loads current and next row of ARGB input and averages it to calculate U and V -#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \ - { \ - v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ - v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ - v8u16 reg8_m, reg9_m; \ - \ - src0_m = (v16u8)__msa_ld_b((void*)s, 0); \ - src1_m = (v16u8)__msa_ld_b((void*)s, 16); \ - src2_m = (v16u8)__msa_ld_b((void*)s, 32); \ - src3_m = (v16u8)__msa_ld_b((void*)s, 48); \ - src4_m = (v16u8)__msa_ld_b((void*)t, 0); \ - src5_m = (v16u8)__msa_ld_b((void*)t, 16); \ - src6_m = (v16u8)__msa_ld_b((void*)t, 32); \ - src7_m = (v16u8)__msa_ld_b((void*)t, 48); \ - vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ - vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ - vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ - vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ - vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ - vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ - vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ - vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ - reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \ - reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \ - reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \ - reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \ - reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \ - reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \ - reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \ - reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \ - reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ - reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ - reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg8_m += const_0x0101; \ - reg9_m += const_0x0101; \ - reg0_m += const_0x0101; \ - reg1_m += const_0x0101; \ - argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \ - argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \ - argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \ - argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \ - } - -#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ - shf0, shf1, shf2, shf3, shift, u_out, v_out) \ - { \ - v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - 
v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ - \ - vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ - vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ - vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ - vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ - vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ - vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ - vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ - vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ - reg0_m = __msa_dotp_u_w(vec0_m, const0); \ - reg1_m = __msa_dotp_u_w(vec1_m, const0); \ - reg2_m = __msa_dotp_u_w(vec4_m, const0); \ - reg3_m = __msa_dotp_u_w(vec5_m, const0); \ - reg0_m += const1; \ - reg1_m += const1; \ - reg2_m += const1; \ - reg3_m += const1; \ - reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \ - reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \ - reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \ - reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \ - reg0_m = __msa_srl_w(reg0_m, shift); \ - reg1_m = __msa_srl_w(reg1_m, shift); \ - reg2_m = __msa_srl_w(reg2_m, shift); \ - reg3_m = __msa_srl_w(reg3_m, shift); \ - u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ - v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ - } - -// Takes ARGB input and calculates U and V. -#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ - shf0, shf1, shf2, shf3, v_out, u_out) \ - { \ - v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ - \ - vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ - vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ - vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ - vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ - vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ - vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ - vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ - vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ - reg0_m = __msa_dotp_u_w(vec0_m, const1); \ - reg1_m = __msa_dotp_u_w(vec1_m, const1); \ - reg2_m = __msa_dotp_u_w(vec4_m, const1); \ - reg3_m = __msa_dotp_u_w(vec5_m, const1); \ - reg0_m += (v4u32)const3; \ - reg1_m += (v4u32)const3; \ - reg2_m += (v4u32)const3; \ - reg3_m += (v4u32)const3; \ - reg0_m -= __msa_dotp_u_w(vec2_m, const0); \ - reg1_m -= __msa_dotp_u_w(vec3_m, const0); \ - reg2_m -= __msa_dotp_u_w(vec6_m, const2); \ - reg3_m -= __msa_dotp_u_w(vec7_m, const2); \ - u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ - v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ - u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \ - v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \ - } - -// Load I444 pixel data -#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ - { \ - uint64_t y_m, u_m, v_m; \ - v2i64 zero_m = {0}; \ - y_m = LD(psrc_y); \ - u_m = LD(psrc_u); \ - v_m = LD(psrc_v); \ - out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ - out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ - out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ - } - -void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { - int x; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; - src += width - 64; - - for (x = 0; x < width; x += 
64) { - LD_UB4(src, 16, src3, src2, src1, src0); - VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); - VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); - ST_UB4(dst0, dst1, dst2, dst3, dst, 16); - dst += 64; - src -= 64; - } -} - -void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - int x; - v8u16 src, dst; - v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0}; - src_uv += (width - 8) << 1; - for (x = 0; x < width; x += 8) { - src = LD_UH(src_uv); - dst = __msa_vshf_h(shuffler, src, src); - ST_UH(dst, dst_uv); - src_uv -= 16; - dst_uv += 16; - } -} - -void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { - int x; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; - src += width * 4 - 64; - - for (x = 0; x < width; x += 16) { - LD_UB4(src, 16, src3, src2, src1, src0); - VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); - VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); - ST_UB4(dst0, dst1, dst2, dst3, dst, 16); - dst += 64; - src -= 64; - } -} - -void I422ToYUY2Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - int x; - v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; - v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; - - for (x = 0; x < width; x += 32) { - src_u0 = LD_UB(src_u); - src_v0 = LD_UB(src_v); - LD_UB2(src_y, 16, src_y0, src_y1); - ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); - ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); - ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); - ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); - src_u += 16; - src_v += 16; - src_y += 32; - dst_yuy2 += 64; - } -} - -void I422ToUYVYRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - int x; - v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; - v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; - - for (x = 0; x < width; x += 32) { - src_u0 = LD_UB(src_u); - src_v0 = LD_UB(src_v); - LD_UB2(src_y, 16, src_y0, src_y1); - ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); - ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); - ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); - ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); - src_u += 16; - src_v += 16; - src_y += 32; - dst_uyvy += 64; - } -} - -void I422ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, dst_argb); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb += 32; - } -} - -void I422ToRGBARow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - 
uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - STOREARGB(alpha, vec0, vec1, vec2, dst_argb); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb += 32; - } -} - -void I422AlphaToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int64_t data_a; - v16u8 src0, src1, src2, src3; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v4i32 zero = {0}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - data_a = LD(src_a); - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); - STOREARGB(vec0, vec1, vec2, src3, dst_argb); - src_y += 8; - src_u += 4; - src_v += 4; - src_a += 8; - dst_argb += 32; - } -} - -void I422ToRGB24Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int32_t width) { - int x; - int64_t data_u, data_v; - v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 reg0, reg1, reg2, reg3; - v2i64 zero = {0}; - v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; - v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; - v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, - 11, 29, 12, 13, 30, 14, 15, 31}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); - data_u = LD(src_u); - data_v = LD(src_v); - src1 = (v16u8)__msa_insert_d(zero, 0, data_u); - src2 = (v16u8)__msa_insert_d(zero, 0, data_v); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); - src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec3, vec4, vec5); - reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); - reg2 = 
(v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); - reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); - reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); - dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); - ST_UB2(dst0, dst1, dst_argb, 16); - ST_UB(dst2, (dst_argb + 32)); - src_y += 16; - src_u += 8; - src_v += 8; - dst_argb += 48; - } -} - -// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. -void I422ToRGB565Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec2, vec1); - vec0 = __msa_srai_h(vec0, 3); - vec1 = __msa_srai_h(vec1, 3); - vec2 = __msa_srai_h(vec2, 2); - vec1 = __msa_slli_h(vec1, 11); - vec2 = __msa_slli_h(vec2, 5); - vec0 |= vec1; - dst0 = (v16u8)(vec2 | vec0); - ST_UB(dst0, dst_rgb565); - src_y += 8; - src_u += 4; - src_v += 4; - dst_rgb565 += 16; - } -} - -// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. -void I422ToARGB4444Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0; - v8i16 vec0, vec1, vec2; - v8u16 reg0, reg1, reg2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - reg0 = (v8u16)__msa_srai_h(vec0, 4); - reg1 = (v8u16)__msa_srai_h(vec1, 4); - reg2 = (v8u16)__msa_srai_h(vec2, 4); - reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); - reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); - reg1 |= const_0xF000; - reg0 |= reg2; - dst0 = (v16u8)(reg1 | reg0); - ST_UB(dst0, dst_argb4444); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb4444 += 16; - } -} - -void I422ToARGB1555Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0; - v8i16 vec0, vec1, vec2; - v8u16 reg0, reg1, reg2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - 
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - reg0 = (v8u16)__msa_srai_h(vec0, 3); - reg1 = (v8u16)__msa_srai_h(vec1, 3); - reg2 = (v8u16)__msa_srai_h(vec2, 3); - reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); - reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); - reg1 |= const_0x8000; - reg0 |= reg2; - dst0 = (v16u8)(reg1 | reg0); - ST_UB(dst0, dst_argb1555); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb1555 += 16; - } -} - -void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_yuy2, 16, src0, src1, src2, src3); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_y, 16); - src_yuy2 += 64; - dst_y += 32; - } -} - -void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_yuy2, 16, src0, src1, src2, src3); - LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); - src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); - src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); - vec0 = __msa_aver_u_b(src0, src2); - vec1 = __msa_aver_u_b(src1, src3); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_yuy2 += 64; - src_yuy2_next += 64; - dst_u += 16; - dst_v += 16; - } -} - -void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_yuy2, 16, src0, src1, src2, src3); - src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_yuy2 += 64; - dst_u += 16; - dst_v += 16; - } -} - -void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_uyvy, 16, src0, src1, src2, src3); - dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_y, 16); - src_uyvy += 64; - dst_y += 32; - } -} - -void UYVYToUVRow_MSA(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_uyvy, 16, src0, src1, src2, src3); - LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); - src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - src1 = 
(v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); - src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); - vec0 = __msa_aver_u_b(src0, src2); - vec1 = __msa_aver_u_b(src1, src3); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_uyvy += 64; - src_uyvy_next += 64; - dst_u += 16; - dst_v += 16; - } -} - -void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_uyvy, 16, src0, src1, src2, src3); - src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_uyvy += 64; - dst_u += 16; - dst_v += 16; - } -} - -void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v16i8 zero = {0}; - v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); - v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); - v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0); - reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1); - reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2); - reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3); - reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0); - reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1); - reg0 *= const_0x19; - reg1 *= const_0x19; - reg2 *= const_0x81; - reg3 *= const_0x81; - reg4 *= const_0x42; - reg5 *= const_0x42; - reg0 += reg2; - reg1 += reg3; - reg0 += reg4; - reg1 += reg5; - reg0 += const_0x1080; - reg1 += const_0x1080; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void ARGBToUVRow_MSA(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* src_argb_next = src_argb + src_stride_argb; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; - v16u8 dst0, dst1; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - src2 
= (v16u8)__msa_ld_b((v16u8*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); - vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); - vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); - vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); - vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); - reg0 = __msa_hadd_u_h(vec8, vec8); - reg1 = __msa_hadd_u_h(vec9, vec9); - reg2 = __msa_hadd_u_h(vec4, vec4); - reg3 = __msa_hadd_u_h(vec5, vec5); - reg4 = __msa_hadd_u_h(vec0, vec0); - reg5 = __msa_hadd_u_h(vec1, vec1); - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); - vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); - vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); - vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); - vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); - reg0 += __msa_hadd_u_h(vec8, vec8); - reg1 += __msa_hadd_u_h(vec9, vec9); - reg2 += __msa_hadd_u_h(vec4, vec4); - reg3 += __msa_hadd_u_h(vec5, vec5); - reg4 += __msa_hadd_u_h(vec0, vec0); - reg5 += __msa_hadd_u_h(vec1, vec1); - reg0 += const_0x0001; - reg1 += const_0x0001; - reg2 += const_0x0001; - reg3 += const_0x0001; - reg4 += const_0x0001; - reg5 += const_0x0001; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1); - reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1); - reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1); - reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1); - reg6 = reg0 * const_0x70; - reg7 = reg1 * const_0x70; - reg8 = reg2 * const_0x4A; - reg9 = reg3 * const_0x4A; - reg6 += const_0x8080; - reg7 += const_0x8080; - reg8 += reg4 * const_0x26; - reg9 += reg5 * const_0x26; - reg0 *= const_0x12; - reg1 *= const_0x12; - reg2 *= const_0x5E; - reg3 *= const_0x5E; - reg4 *= const_0x70; - reg5 *= const_0x70; - 
reg2 += reg0; - reg3 += reg1; - reg4 += const_0x8080; - reg5 += const_0x8080; - reg6 -= reg8; - reg7 -= reg9; - reg4 -= reg2; - reg5 -= reg3; - reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8); - reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8); - reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8); - reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6); - dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_argb += 128; - src_argb_next += 128; - dst_u += 16; - dst_v += 16; - } -} - -void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2; - v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; - v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14, - 16, 17, 18, 20, 21, 22, 24, 25}; - v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20, - 21, 22, 24, 25, 26, 28, 29, 30}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_rgb, 16); - ST_UB(dst2, (dst_rgb + 32)); - src_argb += 64; - dst_rgb += 48; - } -} - -void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2; - v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; - v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12, - 18, 17, 16, 22, 21, 20, 26, 25}; - v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22, - 21, 20, 26, 25, 24, 30, 29, 28}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_rgb, 16); - ST_UB(dst2, (dst_rgb + 32)); - src_argb += 64; - dst_rgb += 48; - } -} - -void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - v16u8 src0, src1, dst0; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); - vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3); - vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5); - vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3); - vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3); - vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5); - vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); - vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); - vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1); - vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); - vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2); - vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2); - vec0 = __msa_binsli_b(vec0, vec1, 2); - vec1 = __msa_binsli_b(vec2, vec3, 4); - vec4 = __msa_binsli_b(vec4, vec5, 2); - vec5 = __msa_binsli_b(vec6, vec7, 4); - vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, 
(v16i8)vec0); - vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4); - dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - int x; - v16u8 src0, src1, dst0; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); - vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2); - vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3); - vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); - vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); - vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1); - vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3); - vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2); - vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3); - vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); - vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1); - vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1); - vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2); - vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2); - vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3); - vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3); - vec0 = __msa_binsli_b(vec0, vec1, 2); - vec5 = __msa_binsli_b(vec5, vec6, 2); - vec1 = __msa_binsli_b(vec2, vec3, 5); - vec6 = __msa_binsli_b(vec7, vec8, 5); - vec1 = __msa_binsli_b(vec1, vec4, 0); - vec6 = __msa_binsli_b(vec6, vec9, 0); - vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5); - dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - int x; - v16u8 src0, src1; - v16u8 vec0, vec1; - v16u8 dst0; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4); - vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4); - src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1); - src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1); - vec0 = __msa_binsli_b(vec0, src0, 3); - vec1 = __msa_binsli_b(vec1, src1, 3); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBToUV444Row_MSA(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int32_t width) { - int32_t x; - v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 vec8, vec9, vec10, vec11; - v8u16 const_112 = (v8u16)__msa_ldi_h(112); - v8u16 const_74 = (v8u16)__msa_ldi_h(74); - v8u16 const_38 = (v8u16)__msa_ldi_h(38); - v8u16 const_94 = (v8u16)__msa_ldi_h(94); - v8u16 const_18 = (v8u16)__msa_ldi_h(18); - v8u16 const_32896 = (v8u16)__msa_fill_h(32896); - v16i8 zero = {0}; - - for (x = width; x > 0; x -= 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - src0 = 
(v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); - src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0); - vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); - vec10 = vec0 * const_18; - vec11 = vec1 * const_18; - vec8 = vec2 * const_94; - vec9 = vec3 * const_94; - vec6 = vec4 * const_112; - vec7 = vec5 * const_112; - vec0 *= const_112; - vec1 *= const_112; - vec2 *= const_74; - vec3 *= const_74; - vec4 *= const_38; - vec5 *= const_38; - vec8 += vec10; - vec9 += vec11; - vec6 += const_32896; - vec7 += const_32896; - vec0 += const_32896; - vec1 += const_32896; - vec2 += vec4; - vec3 += vec5; - vec0 -= vec2; - vec1 -= vec3; - vec6 -= vec8; - vec7 -= vec9; - vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); - vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8); - vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_argb += 64; - dst_u += 16; - dst_v += 16; - } -} - -void ARGBMultiplyRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, dst0; - v8u16 vec0, vec1, vec2, vec3; - v4u32 reg0, reg1, reg2, reg3; - v8i16 zero = {0}; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); - reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); - reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); - reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); - reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); - reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); - reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); - reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); - reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); - reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16); - reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16); - reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16); - reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_argb); - src_argb += 16; - src_argb1 += 16; - dst_argb += 16; - } -} - -void ARGBAddRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); - dst0 = __msa_adds_u_b(src0, src2); - dst1 = __msa_adds_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBSubtractRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < 
width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); - dst0 = __msa_subs_u_b(src0, src2); - dst1 = __msa_subs_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBAttenuateRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, dst0, dst1; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v8i16 zero = {0}; - v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1); - vec4 = (v8u16)__msa_fill_h(vec0[3]); - vec5 = (v8u16)__msa_fill_h(vec0[7]); - vec6 = (v8u16)__msa_fill_h(vec1[3]); - vec7 = (v8u16)__msa_fill_h(vec1[7]); - vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); - vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - vec6 = (v8u16)__msa_fill_h(vec2[3]); - vec7 = (v8u16)__msa_fill_h(vec2[7]); - vec8 = (v8u16)__msa_fill_h(vec3[3]); - vec9 = (v8u16)__msa_fill_h(vec3[7]); - vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); - reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4); - reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4); - reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5); - reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5); - reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6); - reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6); - reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7); - reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); - reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); - reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); - reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); - reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); - reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); - reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); - reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); - reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); - reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); - reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); - reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); - reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); - reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); - reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); - reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); - reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); - vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - dst0 = __msa_bmnz_v(dst0, src0, mask); - dst1 = __msa_bmnz_v(dst1, src1, mask); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - uint32_t dither4, - int width) { - int x; - v16u8 src0, src1, dst0, vec0, vec1; - v8i16 vec_d0; - v8i16 reg0, reg1, reg2; - v16i8 zero = {0}; - v8i16 max = 
__msa_ldi_h(0xFF); - - vec_d0 = (v8i16)__msa_fill_w(dither4); - vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); - reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); - reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); - reg0 += vec_d0; - reg1 += vec_d0; - reg2 += vec_d0; - reg0 = __msa_maxi_s_h((v8i16)reg0, 0); - reg1 = __msa_maxi_s_h((v8i16)reg1, 0); - reg2 = __msa_maxi_s_h((v8i16)reg2, 0); - reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); - reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1); - reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); - reg0 = __msa_srai_h(reg0, 3); - reg2 = __msa_srai_h(reg2, 3); - reg1 = __msa_srai_h(reg1, 2); - reg2 = __msa_slli_h(reg2, 11); - reg1 = __msa_slli_h(reg1, 5); - reg0 |= reg1; - dst0 = (v16u8)(reg0 | reg2); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBShuffleRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - int x; - v16u8 src0, src1, dst0, dst1; - v16i8 vec0; - v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; - int32_t val = LW((int32_t*)shuffler); - - vec0 = (v16i8)__msa_fill_w(val); - shuffler_vec += vec0; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBShadeRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - int x; - v16u8 src0, dst0; - v8u16 vec0, vec1; - v4u32 reg0, reg1, reg2, reg3, rgba_scale; - v8i16 zero = {0}; - - rgba_scale[0] = value; - rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); - rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); - reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); - reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); - reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); - reg0 *= rgba_scale; - reg1 *= rgba_scale; - reg2 *= rgba_scale; - reg3 *= rgba_scale; - reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); - reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); - reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); - reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_argb); - src_argb += 16; - dst_argb += 16; - } -} - -void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - int x; - v16u8 src0, src1, vec0, vec1, dst0, dst1; - v8u16 reg0; - v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D); - v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); - vec1 = 
(v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); - reg0 = __msa_dotp_u_h(vec0, const_0x961D); - reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D); - reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8); - vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); - vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) { - int x; - v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; - v8u16 reg0, reg1, reg2; - v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); - v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); - v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); - v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); - v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); - v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); - v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); - vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); - vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); - vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); - reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); - reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); - reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218); - reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); - reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); - reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); - reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); - reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); - vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); - vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); - vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); - vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); - vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); - ST_UB2(dst0, dst1, dst_argb, 16); - dst_argb += 32; - } -} - -void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1; - v8u16 vec0, vec1, vec2, vec3; - v16u8 dst0, dst1, dst2, dst3; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16); - vec0 = (v8u16)__msa_andi_b(src0, 0x0F); - vec1 = (v8u16)__msa_andi_b(src1, 0x0F); - vec2 = (v8u16)__msa_andi_b(src0, 0xF0); - vec3 = (v8u16)__msa_andi_b(src1, 0xF0); - vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4); - vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4); - vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4); - vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); - dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_argb4444 += 32; - dst_argb += 64; - } -} - -void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - int x; - v8u16 src0, src1; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 reg0, reg1, reg2, reg3, reg4, 
reg5, reg6; - v16u8 dst0, dst1, dst2, dst3; - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_h((void*)src_argb1555, 0); - src1 = (v8u16)__msa_ld_h((void*)src_argb1555, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); - vec2 = src0 & const_0x1F; - vec3 = src1 & const_0x1F; - src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); - reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3); - reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3); - reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3); - reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2); - reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2); - reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2); - reg3 = -reg3; - reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4); - reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4); - reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5); - reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5); - dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1); - dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_argb1555 += 32; - dst_argb += 64; - } -} - -void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - int x; - v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); - v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_h((void*)src_rgb565, 0); - src1 = (v8u16)__msa_ld_h((void*)src_rgb565, 16); - vec0 = src0 & const_0x1F; - vec1 = src0 & const_0x7E0; - vec2 = src0 & const_0xF800; - vec3 = src1 & const_0x1F; - vec4 = src1 & const_0x7E0; - vec5 = src1 & const_0xF800; - reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); - reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); - reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); - reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); - reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); - reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); - reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); - reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); - reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); - reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); - reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); - reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); - res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); - res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); - res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); - res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); - dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); - ST_UB4(dst0, dst1, dst2, 
dst3, dst_argb, 16); - src_rgb565 += 32; - dst_argb += 64; - } -} - -void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2; - v16u8 vec0, vec1, vec2; - v16u8 dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_rgb24, 0); - src1 = (v16u8)__msa_ld_b((void*)src_rgb24, 16); - src2 = (v16u8)__msa_ld_b((void*)src_rgb24, 32); - vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); - vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); - vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); - dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); - dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); - dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_rgb24 += 48; - dst_argb += 64; - } -} - -void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - int x; - v16u8 src0, src1, src2; - v16u8 vec0, vec1, vec2; - v16u8 dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); - src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); - src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); - vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); - vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); - vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); - dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); - dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); - dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_raw += 48; - dst_argb += 64; - } -} - -void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - int x; - v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v16u8 dst0; - v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); - v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); - v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0); - src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - vec2 = src0 & const_0x1F; - vec3 = src1 & const_0x1F; - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); - reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); - reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); - reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); - reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); - reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); - reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); - reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); - reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); - reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); - reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); - reg5 
|= (v8u16)__msa_srai_h((v8i16)vec5, 2); - reg0 *= const_0x19; - reg1 *= const_0x19; - reg2 *= const_0x81; - reg3 *= const_0x81; - reg4 *= const_0x42; - reg5 *= const_0x42; - reg0 += reg2; - reg1 += reg3; - reg0 += reg4; - reg1 += reg5; - reg0 += const_0x1080; - reg1 += const_0x1080; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - ST_UB(dst0, dst_y); - src_argb1555 += 32; - dst_y += 16; - } -} - -void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - int x; - v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v4u32 res0, res1, res2, res3; - v16u8 dst0; - v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); - v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); - v8i16 const_0x1080 = __msa_fill_h(0x1080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); - v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0); - src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16); - vec0 = src0 & const_0x1F; - vec1 = src0 & const_0x7E0; - vec2 = src0 & const_0xF800; - vec3 = src1 & const_0x1F; - vec4 = src1 & const_0x7E0; - vec5 = src1 & const_0xF800; - reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); - reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); - reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); - reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); - reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); - reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); - reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); - reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); - reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); - reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); - reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); - reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); - vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0); - vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); - vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); - vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); - vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); - vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); - vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); - res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); - res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); - res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019); - res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); - res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); - res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); - res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); - res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); - res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); - res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); - res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); - res3 = (v4u32)__msa_srai_w((v4i32)res3, 8); - vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); - vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_y); - src_rgb565 += 32; - dst_y += 16; - } -} - -void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; - v8u16 vec0, vec1, vec2, vec3; - v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); - v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); - v8u16 const_0x1080 = 
(v8u16)__msa_fill_h(0x1080); - v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; - v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, - 18, 19, 20, 21, 21, 22, 23, 24}; - v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; - v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); - reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); - reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); - vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); - vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119); - vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); - vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); - vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); - vec0 += const_0x1080; - vec1 += const_0x1080; - vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_y); - src_argb += 48; - dst_y += 16; - } -} - -void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; - v8u16 vec0, vec1, vec2, vec3; - v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); - v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; - v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, - 18, 19, 20, 21, 21, 22, 23, 24}; - v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; - v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); - reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); - reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); - vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); - vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); - vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); - vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); - vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); - vec0 += const_0x1080; - vec1 += const_0x1080; - vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_y); - src_argb += 48; - dst_y += 16; - } -} - -void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint16_t* s = (const uint16_t*)src_argb1555; - const uint16_t* t = (const 
uint16_t*)(src_argb1555 + src_stride_argb1555); - int64_t res0, res1; - v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)s, 0); - src1 = (v8u16)__msa_ld_b((void*)s, 16); - src2 = (v8u16)__msa_ld_b((void*)t, 0); - src3 = (v8u16)__msa_ld_b((void*)t, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - vec0 += src2 & const_0x1F; - vec1 += src3 & const_0x1F; - vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); - vec2 = src0 & const_0x1F; - vec3 = src1 & const_0x1F; - vec2 += src2 & const_0x1F; - vec3 += src3 & const_0x1F; - vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - vec4 += src2 & const_0x1F; - vec5 += src3 & const_0x1F; - vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); - vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); - vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1); - vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); - vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); - vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6); - reg0 = vec6 * const_0x70; - reg1 = vec0 * const_0x4A; - reg2 = vec2 * const_0x70; - reg3 = vec0 * const_0x5E; - reg0 += const_0x8080; - reg1 += vec2 * const_0x26; - reg2 += const_0x8080; - reg3 += vec6 * const_0x12; - reg0 -= reg1; - reg2 -= reg3; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); - SD(res0, dst_u); - SD(res1, dst_v); - s += 16; - t += 16; - dst_u += 8; - dst_v += 8; - } -} - -void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint16_t* s = (const uint16_t*)src_rgb565; - const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); - int64_t res0, res1; - v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); - v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)s, 0); - src1 = (v8u16)__msa_ld_b((void*)s, 16); - src2 = (v8u16)__msa_ld_b((void*)t, 
0); - src3 = (v8u16)__msa_ld_b((void*)t, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - vec0 += src2 & const_0x1F; - vec1 += src3 & const_0x1F; - vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); - vec2 = src0 & const_0x3F; - vec3 = src1 & const_0x3F; - vec2 += src2 & const_0x3F; - vec3 += src3 & const_0x3F; - vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - vec4 += src2 & const_0x1F; - vec5 += src3 & const_0x1F; - vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1); - vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); - vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); - vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); - reg0 = vec3 * const_0x70; - reg1 = vec1 * const_0x4A; - reg2 = vec4 * const_0x70; - reg3 = vec1 * const_0x5E; - reg0 += const_32896; - reg1 += vec4 * const_0x26; - reg2 += const_32896; - reg3 += vec3 * const_0x12; - reg0 -= reg1; - reg2 -= reg3; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); - SD(res0, dst_u); - SD(res1, dst_v); - s += 16; - t += 16; - dst_u += 8; - dst_v += 8; - } -} - -void RGB24ToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - int64_t res0, res1; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 inp0, inp1, inp2, inp3, inp4, inp5; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 reg0, reg1, reg2, reg3; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); - v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - inp0 = (v16u8)__msa_ld_b((void*)s, 0); - inp1 = (v16u8)__msa_ld_b((void*)s, 16); - inp2 = (v16u8)__msa_ld_b((void*)s, 32); - inp3 = (v16u8)__msa_ld_b((void*)t, 0); - inp4 = (v16u8)__msa_ld_b((void*)t, 16); - inp5 = (v16u8)__msa_ld_b((void*)t, 32); - src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); - src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); - src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); - src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); - src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); - src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); - src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); - src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); - src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); - src3 = 
(v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); - src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); - src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); - src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); - src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); - vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); - vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); - vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); - vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); - vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); - reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); - reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); - reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); - reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); - reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); - reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); - reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 += const_0x0001; - reg1 += const_0x0001; - reg2 += const_0x0001; - reg3 += const_0x0001; - reg0 = __msa_srai_h((v8i16)reg0, 1); - reg1 = __msa_srai_h((v8i16)reg1, 1); - reg2 = __msa_srai_h((v8i16)reg2, 1); - reg3 = __msa_srai_h((v8i16)reg3, 1); - vec4 = (v8u16)__msa_pckev_h(reg1, reg0); - vec5 = (v8u16)__msa_pckev_h(reg3, reg2); - vec6 = (v8u16)__msa_pckod_h(reg1, reg0); - vec7 = (v8u16)__msa_pckod_h(reg3, reg2); - vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); - vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); - vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); - vec3 = vec0 * const_0x70; - vec4 = vec1 * const_0x4A; - vec5 = vec2 * const_0x26; - vec2 *= const_0x70; - vec1 *= const_0x5E; - vec0 *= const_0x12; - reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); - reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); - reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); - reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); - reg0 += reg1; - reg2 += reg3; - reg0 = __msa_srai_h(reg0, 8); - reg2 = __msa_srai_h(reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); - SD(res0, dst_u); - SD(res1, dst_v); - t += 48; - s += 48; - dst_u += 8; - dst_v += 8; - } -} - -void RAWToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - int64_t res0, res1; - v16u8 inp0, inp1, inp2, inp3, inp4, inp5; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 reg0, reg1, reg2, reg3; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); - v8u16 const_0x5E = 
(v8u16)__msa_fill_h(0x2f); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - inp0 = (v16u8)__msa_ld_b((void*)s, 0); - inp1 = (v16u8)__msa_ld_b((void*)s, 16); - inp2 = (v16u8)__msa_ld_b((void*)s, 32); - inp3 = (v16u8)__msa_ld_b((void*)t, 0); - inp4 = (v16u8)__msa_ld_b((void*)t, 16); - inp5 = (v16u8)__msa_ld_b((void*)t, 32); - src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); - src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); - src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); - src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); - src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); - src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); - src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); - src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); - src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); - src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); - src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); - src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); - src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); - src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); - vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); - vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); - vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); - vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); - vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); - reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); - reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); - reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); - reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); - reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); - reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); - reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 += const_0x0001; - reg1 += const_0x0001; - reg2 += const_0x0001; - reg3 += const_0x0001; - reg0 = __msa_srai_h(reg0, 1); - reg1 = __msa_srai_h(reg1, 1); - reg2 = __msa_srai_h(reg2, 1); - reg3 = __msa_srai_h(reg3, 1); - vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); - vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); - vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); - vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); - vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); - vec3 = vec0 * const_0x70; - vec4 = vec1 * const_0x4A; - vec5 = vec2 * const_0x26; - vec2 *= const_0x70; - vec1 *= const_0x5E; - vec0 *= const_0x12; 
- reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); - reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); - reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); - reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); - reg0 += reg1; - reg2 += reg3; - reg0 = __msa_srai_h(reg0, 8); - reg2 = __msa_srai_h(reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); - SD(res0, dst_u); - SD(res1, dst_v); - t += 48; - s += 48; - dst_u += 8; - dst_v += 8; - } -} - -void NV12ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint64_t val0, val1; - v16u8 src0, src1, res0, res1, dst0, dst1; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 zero = {0}; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - val0 = LD(src_y); - val1 = LD(src_uv); - src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); - src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); - res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); - ST_UB2(dst0, dst1, dst_argb, 16); - src_y += 8; - src_uv += 8; - dst_argb += 32; - } -} - -void NV12ToRGB565Row_MSA(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint64_t val0, val1; - v16u8 src0, src1, dst0; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 zero = {0}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - val0 = LD(src_y); - val1 = LD(src_uv); - src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); - src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - vec0 = vec0 >> 3; - vec1 = (vec1 >> 2) << 5; - vec2 = (vec2 >> 3) << 11; - dst0 = (v16u8)(vec0 | vec1 | vec2); - ST_UB(dst0, dst_rgb565); - src_y += 8; - src_uv += 8; - dst_rgb565 += 16; - } -} - -void NV21ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint64_t val0, val1; - v16u8 src0, src1, res0, res1, dst0, dst1; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v16u8 zero = {0}; - v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) 
{ - val0 = LD(src_y); - val1 = LD(src_vu); - src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); - src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); - res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); - ST_UB2(dst0, dst1, dst_argb, 16); - src_y += 8; - src_vu += 8; - dst_argb += 32; - } -} - -void SobelRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; - v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; - v16i8 const_0x4 = __msa_ldi_b(0x4); - v16i8 mask1 = mask0 + const_0x4; - v16i8 mask2 = mask1 + const_0x4; - v16i8 mask3 = mask2 + const_0x4; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); - vec0 = __msa_adds_u_b(src0, src1); - dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); - dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); - dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); - dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_sobelx += 16; - src_sobely += 16; - dst_argb += 64; - } -} - -void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((void*)src_sobelx, 16); - src2 = (v16u8)__msa_ld_b((void*)src_sobely, 0); - src3 = (v16u8)__msa_ld_b((void*)src_sobely, 16); - dst0 = __msa_adds_u_b(src0, src2); - dst1 = __msa_adds_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_y, 16); - src_sobelx += 32; - src_sobely += 32; - dst_y += 32; - } -} - -void SobelXYRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, vec0, vec1, vec2; - v16u8 reg0, reg1, dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); - vec0 = __msa_adds_u_b(src0, src1); - vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); - vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); - reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); - reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); - dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); - dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); - dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_sobelx += 16; - src_sobely += 16; - dst_argb += 64; - } -} - -void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); - v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D); - v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = 
(v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); - v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); - v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); - v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void ARGBToUVJRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - v8u16 src0, src1, src2, src3, src4, src5, src6, src7; - v8u16 vec0, vec1, vec2, vec3; - v8u16 dst0, dst1, dst2, dst3; - v16u8 zero = {0}; - v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15}; - v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; - v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15}; - v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; - v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f); - v4u32 const_0x00008080 = (v8u16)__msa_fill_w(0x00008080); - v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a); - v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a); - v4i32 shift = __msa_fill_w(0x00000008); - - for (x = 0; x < width; x += 32) { - src1 = __msa_ld_b((void*)s, 0); - src3 = __msa_ld_b((void*)s, 16); - src5 = __msa_ld_b((void*)t, 0); - src7 = __msa_ld_b((void*)t, 16); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - 
src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec0 = __msa_aver_u_h(src4, src5); - vec1 = __msa_aver_u_h(src6, src7); - - src1 = __msa_ld_b((void*)s, 32); - src3 = __msa_ld_b((void*)s, 48); - src5 = __msa_ld_b((void*)t, 32); - src7 = __msa_ld_b((void*)t, 48); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec2 = __msa_aver_u_h(src4, src5); - vec3 = __msa_aver_u_h(src6, src7); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, - const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, - shuffler2, shuffler3, shift, dst0, dst1); - - src1 = __msa_ld_b((void*)s, 64); - src3 = __msa_ld_b((void*)s, 80); - src5 = __msa_ld_b((void*)t, 64); - src7 = __msa_ld_b((void*)t, 80); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec0 = __msa_aver_u_h(src4, src5); - vec1 = __msa_aver_u_h(src6, src7); - - src1 = __msa_ld_b((void*)s, 96); - src3 = __msa_ld_b((void*)s, 112); - src5 = __msa_ld_b((void*)t, 96); - src7 = __msa_ld_b((void*)t, 112); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec2 = __msa_aver_u_h(src4, src5); - vec3 = __msa_aver_u_h(src6, src7); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, - const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, - shuffler2, shuffler3, shift, dst2, dst3); - - dst0 = (v8u16)__msa_pckev_b(dst2, dst0); - dst1 = (v8u16)__msa_pckev_b(dst3, dst1); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - s += 128; - t += 128; - dst_v += 16; - dst_u += 16; - } -} - -void BGRAToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - const uint8_t unused = 0xf; - v8u16 src0, src1, src2, src3; - v16u8 dst0, dst1; - v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused}; - v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15}; - v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused}; - v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14}; - v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); - v8u16 const_0x000038 = 
(v8u16)__msa_fill_w(0x0038); - v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); - v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 16) { - READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); - ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, - const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, - shuffler3, dst0, dst1); - *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); - *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); - s += 64; - t += 64; - dst_u += 8; - dst_v += 8; - } -} - -void ABGRToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - const uint8_t unused = 0xf; - v8u16 src0, src1, src2, src3; - v16u8 dst0, dst1; - v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused}; - v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; - v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused}; - v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; - v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); - v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); - v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); - v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 16) { - READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); - ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, - const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, - shuffler3, dst0, dst1); - *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); - *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); - s += 64; - t += 64; - dst_u += 8; - dst_v += 8; - } -} - -void RGBAToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - const uint8_t unused = 0xf; - v8u16 src0, src1, src2, src3; - v16u8 dst0, dst1; - v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused}; - v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13}; - v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused}; - v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14}; - v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); - v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); - v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); - v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 16) { - READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); - ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, - const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, - shuffler3, dst0, dst1); - *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); - *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); - s += 64; - t += 64; - dst_u += 8; - dst_v += 8; - } -} - -void I444ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0, dst1; - v8u16 vec0, vec1, vec2; - v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8i16 zero = {0}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, 
vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - - for (x = 0; x < width; x += 8) { - READI444(src_y, src_u, src_v, src0, src1, src2); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); - reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); - reg0 *= vec_yg; - reg1 *= vec_yg; - reg0 = __msa_srai_w(reg0, 16); - reg1 = __msa_srai_w(reg1, 16); - reg4 = reg0 + vec_br; - reg5 = reg1 + vec_br; - reg2 = reg0 + vec_bg; - reg3 = reg1 + vec_bg; - reg0 += vec_bb; - reg1 += vec_bb; - vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); - vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); - reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); - reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); - reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); - reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); - reg0 -= reg6 * vec_ub; - reg1 -= reg7 * vec_ub; - reg2 -= reg6 * vec_ug; - reg3 -= reg7 * vec_ug; - reg4 -= reg8 * vec_vr; - reg5 -= reg9 * vec_vr; - reg2 -= reg8 * vec_vg; - reg3 -= reg9 * vec_vg; - reg0 = __msa_srai_w(reg0, 6); - reg1 = __msa_srai_w(reg1, 6); - reg2 = __msa_srai_w(reg2, 6); - reg3 = __msa_srai_w(reg3, 6); - reg4 = __msa_srai_w(reg4, 6); - reg5 = __msa_srai_w(reg5, 6); - CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); - vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); - dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); - dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); - ST_UB2(dst0, dst1, dst_argb, 16); - src_y += 8; - src_u += 8; - src_v += 8; - dst_argb += 32; - } -} - -// TODO - respect YuvConstants -void I400ToARGBRow_MSA(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; -#if defined(__aarch64__) || defined(__arm__) - int ygb = yuvconstants->kUVBiasBGR[3]; - int yg = yuvconstants->kYToRgb[1]; -#else - int ygb = yuvconstants->kYBiasToRgb[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; - v8i16 vec0, vec1; - v4i32 reg0, reg1, reg2, reg3; - v4i32 vec_yg = __msa_fill_w(yg); - v8i16 vec_ygb = __msa_fill_h(ygb); - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8i16 max = __msa_ldi_h(0xFF); - v8i16 zero = {0}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y, 0); - vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - reg0 = (v4i32)__msa_ilvr_h(zero, vec0); - reg1 = (v4i32)__msa_ilvl_h(zero, vec0); - reg2 = (v4i32)__msa_ilvr_h(zero, vec1); - reg3 = (v4i32)__msa_ilvl_h(zero, vec1); - reg0 *= vec_yg; - reg1 *= vec_yg; - reg2 *= vec_yg; - reg3 *= vec_yg; - reg0 = __msa_srai_w(reg0, 16); - reg1 = __msa_srai_w(reg1, 16); - reg2 = __msa_srai_w(reg2, 16); - reg3 = __msa_srai_w(reg3, 16); - vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec0 += vec_ygb; - vec1 += vec_ygb; - vec0 = __msa_srai_h(vec0, 6); - vec1 = __msa_srai_h(vec1, 6); - vec0 = __msa_maxi_s_h(vec0, 0); - vec1 = __msa_maxi_s_h(vec1, 0); - vec0 = __msa_min_s_h(max, vec0); - vec1 = __msa_min_s_h(max, vec1); - res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); - res2 = 
(v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); - res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); - res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); - dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); - dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_y += 16; - dst_argb += 64; - } -} - -void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { - int x; - v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y, 0); - vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); - vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); - dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_y += 16; - dst_argb += 64; - } -} - -void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0); - src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); - src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); - YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, dst_argb); - src_yuy2 += 16; - dst_argb += 32; - } -} - -void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0); - src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); - src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); - YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, dst_argb); - src_uyvy += 16; - dst_argb += 32; - } -} - -void InterpolateRow_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int32_t source_y_fraction) { - int32_t y1_fraction = source_y_fraction; - int32_t y0_fraction = 256 - y1_fraction; - uint16_t y_fractions; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - v8u16 vec0, vec1, 
vec2, vec3, y_frac; - - if (0 == y1_fraction) { - memcpy(dst_ptr, src_ptr, width); - return; - } - - if (128 == y1_fraction) { - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)s, 0); - src1 = (v16u8)__msa_ld_b((void*)s, 16); - src2 = (v16u8)__msa_ld_b((void*)t, 0); - src3 = (v16u8)__msa_ld_b((void*)t, 16); - dst0 = __msa_aver_u_b(src0, src2); - dst1 = __msa_aver_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_ptr, 16); - s += 32; - t += 32; - dst_ptr += 32; - } - return; - } - - y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); - y_frac = (v8u16)__msa_fill_h(y_fractions); - - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)s, 0); - src1 = (v16u8)__msa_ld_b((void*)s, 16); - src2 = (v16u8)__msa_ld_b((void*)t, 0); - src3 = (v16u8)__msa_ld_b((void*)t, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); - vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); - vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); - vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); - vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); - vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); - vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - ST_UB2(dst0, dst1, dst_ptr, 16); - s += 32; - t += 32; - dst_ptr += 32; - } -} - -void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) { - int x; - v4i32 dst0 = __builtin_msa_fill_w(v32); - - for (x = 0; x < width; x += 4) { - ST_UB(dst0, dst_argb); - dst_argb += 16; - } -} - -void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - int x; - v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; - v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; - v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, - 18, 17, 16, 21, 20, 19, 24, 23}; - v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, - 24, 23, 28, 27, 26, 31, 30, 29}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); - src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); - src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); - src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); - src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); - dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); - ST_UB2(dst0, dst1, dst_rgb24, 16); - ST_UB(dst2, (dst_rgb24 + 32)); - src_raw += 48; - dst_rgb24 += 48; - } -} - -void MergeUVRow_MSA(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - int x; - v16u8 src0, src1, dst0, dst1; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_u, 0); - src1 = (v16u8)__msa_ld_b((void*)src_v, 0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); - ST_UB2(dst0, dst1, dst_uv, 16); - src_u += 16; - src_v += 16; - dst_uv += 32; - } -} - -void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - int i; - v16u8 src0, src1, src2, src3, vec0, vec1, dst0; - - for (i = 0; i < width; i += 16) { - src0 = 
(v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_a); - src_argb += 64; - dst_a += 16; - } -} - -void ARGBBlendRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 vec8, vec9, vec10, vec11, vec12, vec13; - v8u16 const_256 = (v8u16)__msa_ldi_h(256); - v16u8 const_255 = (v16u8)__msa_ldi_b(255); - v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); - vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); - vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); - vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); - vec8 = (v8u16)__msa_fill_h(vec0[3]); - vec9 = (v8u16)__msa_fill_h(vec0[7]); - vec10 = (v8u16)__msa_fill_h(vec1[3]); - vec11 = (v8u16)__msa_fill_h(vec1[7]); - vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); - vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); - vec10 = (v8u16)__msa_fill_h(vec2[3]); - vec11 = (v8u16)__msa_fill_h(vec2[7]); - vec12 = (v8u16)__msa_fill_h(vec3[3]); - vec13 = (v8u16)__msa_fill_h(vec3[7]); - vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); - vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); - vec8 = const_256 - vec8; - vec9 = const_256 - vec9; - vec10 = const_256 - vec10; - vec11 = const_256 - vec11; - vec8 *= vec4; - vec9 *= vec5; - vec10 *= vec6; - vec11 *= vec7; - vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); - vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); - vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); - vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8); - dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); - dst0 = (v16u8)__msa_adds_u_b(dst0, dst2); - dst1 = (v16u8)__msa_adds_u_b(dst1, dst3); - dst0 = __msa_bmnz_v(dst0, const_255, mask); - dst1 = __msa_bmnz_v(dst1, const_255, mask); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBQuantizeRow_MSA(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; - v4i32 vec_scale = __msa_fill_w(scale); - v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size); - v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); - v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; - v16i8 zero = {0}; - - 
for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)dst_argb, 48); - vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); - vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); - vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); - vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); - tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); - tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); - tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); - tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); - tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); - tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); - tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); - tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); - tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); - tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); - tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); - tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); - tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); - tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); - tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); - tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); - tmp0 *= vec_scale; - tmp1 *= vec_scale; - tmp2 *= vec_scale; - tmp3 *= vec_scale; - tmp4 *= vec_scale; - tmp5 *= vec_scale; - tmp6 *= vec_scale; - tmp7 *= vec_scale; - tmp8 *= vec_scale; - tmp9 *= vec_scale; - tmp10 *= vec_scale; - tmp11 *= vec_scale; - tmp12 *= vec_scale; - tmp13 *= vec_scale; - tmp14 *= vec_scale; - tmp15 *= vec_scale; - tmp0 >>= 16; - tmp1 >>= 16; - tmp2 >>= 16; - tmp3 >>= 16; - tmp4 >>= 16; - tmp5 >>= 16; - tmp6 >>= 16; - tmp7 >>= 16; - tmp8 >>= 16; - tmp9 >>= 16; - tmp10 >>= 16; - tmp11 >>= 16; - tmp12 >>= 16; - tmp13 >>= 16; - tmp14 >>= 16; - tmp15 >>= 16; - vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); - vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); - vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); - vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); - vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); - vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - dst0 *= vec_int_sz; - dst1 *= vec_int_sz; - dst2 *= vec_int_sz; - dst3 *= vec_int_sz; - dst0 += vec_int_ofst; - dst1 += vec_int_ofst; - dst2 += vec_int_ofst; - dst3 += vec_int_ofst; - dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); - dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); - dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); - dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - dst_argb += 64; - } -} - -void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - int32_t x; - v16i8 src0; - v16u8 src1, src2, dst0, dst1; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 
- v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; - v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; - v16i8 zero = {0}; - v8i16 max = __msa_ldi_h(255); - - src0 = __msa_ld_b((void*)matrix_argb, 0); - vec0 = (v8i16)__msa_ilvr_b(zero, src0); - vec1 = (v8i16)__msa_ilvl_b(zero, src0); - - for (x = 0; x < width; x += 8) { - src1 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); - vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); - vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); - vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); - vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); - vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); - vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); - vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); - vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); - vec10 = vec2 * vec0; - vec11 = vec2 * vec1; - vec12 = vec6 * vec0; - vec13 = vec6 * vec1; - tmp0 = __msa_hadd_s_w(vec10, vec10); - tmp1 = __msa_hadd_s_w(vec11, vec11); - tmp2 = __msa_hadd_s_w(vec12, vec12); - tmp3 = __msa_hadd_s_w(vec13, vec13); - vec14 = vec3 * vec0; - vec15 = vec3 * vec1; - vec16 = vec7 * vec0; - vec17 = vec7 * vec1; - tmp4 = __msa_hadd_s_w(vec14, vec14); - tmp5 = __msa_hadd_s_w(vec15, vec15); - tmp6 = __msa_hadd_s_w(vec16, vec16); - tmp7 = __msa_hadd_s_w(vec17, vec17); - vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); - vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); - tmp0 = __msa_hadd_s_w(vec10, vec10); - tmp1 = __msa_hadd_s_w(vec11, vec11); - tmp2 = __msa_hadd_s_w(vec12, vec12); - tmp3 = __msa_hadd_s_w(vec13, vec13); - tmp0 = __msa_srai_w(tmp0, 6); - tmp1 = __msa_srai_w(tmp1, 6); - tmp2 = __msa_srai_w(tmp2, 6); - tmp3 = __msa_srai_w(tmp3, 6); - vec2 = vec4 * vec0; - vec6 = vec4 * vec1; - vec3 = vec8 * vec0; - vec7 = vec8 * vec1; - tmp8 = __msa_hadd_s_w(vec2, vec2); - tmp9 = __msa_hadd_s_w(vec6, vec6); - tmp10 = __msa_hadd_s_w(vec3, vec3); - tmp11 = __msa_hadd_s_w(vec7, vec7); - vec4 = vec5 * vec0; - vec8 = vec5 * vec1; - vec5 = vec9 * vec0; - vec9 = vec9 * vec1; - tmp12 = __msa_hadd_s_w(vec4, vec4); - tmp13 = __msa_hadd_s_w(vec8, vec8); - tmp14 = __msa_hadd_s_w(vec5, vec5); - tmp15 = __msa_hadd_s_w(vec9, vec9); - vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); - vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); - vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); - vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); - tmp4 = __msa_hadd_s_w(vec14, vec14); - tmp5 = __msa_hadd_s_w(vec15, vec15); - tmp6 = __msa_hadd_s_w(vec16, vec16); - tmp7 = __msa_hadd_s_w(vec17, vec17); - tmp4 = __msa_srai_w(tmp4, 6); - tmp5 = __msa_srai_w(tmp5, 6); - tmp6 = __msa_srai_w(tmp6, 6); - tmp7 = __msa_srai_w(tmp7, 6); - vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); - vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); - vec10 = __msa_maxi_s_h(vec10, 0); - vec11 = __msa_maxi_s_h(vec11, 0); - vec12 = __msa_maxi_s_h(vec12, 0); - vec13 = __msa_maxi_s_h(vec13, 0); - vec10 = __msa_min_s_h(vec10, max); - vec11 = __msa_min_s_h(vec11, max); - vec12 = 
__msa_min_s_h(vec12, max); - vec13 = __msa_min_s_h(vec13, max); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void SplitUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)src_uv, 0); - src1 = (v16u8)__msa_ld_b((void*)src_uv, 16); - src2 = (v16u8)__msa_ld_b((void*)src_uv, 32); - src3 = (v16u8)__msa_ld_b((void*)src_uv, 48); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_u, 16); - ST_UB2(dst2, dst3, dst_v, 16); - src_uv += 64; - dst_u += 32; - dst_v += 32; - } -} - -void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { - int x; - v16u8 dst0 = (v16u8)__msa_fill_b(v8); - - for (x = 0; x < width; x += 16) { - ST_UB(dst0, dst); - dst += 16; - } -} - -void MirrorSplitUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; - v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; - - src_uv += (2 * width); - - for (x = 0; x < width; x += 32) { - src_uv -= 64; - src2 = (v16u8)__msa_ld_b((void*)src_uv, 0); - src3 = (v16u8)__msa_ld_b((void*)src_uv, 16); - src0 = (v16u8)__msa_ld_b((void*)src_uv, 32); - src1 = (v16u8)__msa_ld_b((void*)src_uv, 48); - dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); - dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_v, 16); - ST_UB2(dst2, dst3, dst_u, 16); - dst_u += 32; - dst_v += 32; - } -} - -void SobelXRow_MSA(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int32_t width) { - int x; - v16u8 src0, src1, src2, src3, src4, src5, dst0; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5; - v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; - v16i8 tmp = __msa_ldi_b(8); - v16i8 mask1 = mask0 + tmp; - v8i16 zero = {0}; - v8i16 max = __msa_ldi_h(255); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_y0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_y1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_y1, 16); - src4 = (v16u8)__msa_ld_b((void*)src_y2, 0); - src5 = (v16u8)__msa_ld_b((void*)src_y2, 16); - vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); - vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); - vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); - vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); - vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); - vec5 = 
(v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); - vec0 += vec2; - vec1 += vec3; - vec4 += vec2; - vec5 += vec3; - vec0 += vec4; - vec1 += vec5; - vec0 = __msa_add_a_h(zero, vec0); - vec1 = __msa_add_a_h(zero, vec1); - vec0 = __msa_maxi_s_h(vec0, 0); - vec1 = __msa_maxi_s_h(vec1, 0); - vec0 = __msa_min_s_h(max, vec0); - vec1 = __msa_min_s_h(max, vec1); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_sobelx); - src_y0 += 16; - src_y1 += 16; - src_y2 += 16; - dst_sobelx += 16; - } -} - -void SobelYRow_MSA(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int32_t width) { - int x; - v16u8 src0, src1, dst0; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; - v8i16 zero = {0}; - v8i16 max = __msa_ldi_h(255); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_y1, 0); - vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); - vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); - vec0 -= vec2; - vec1 -= vec3; - vec6[0] = src_y0[16] - src_y1[16]; - vec6[1] = src_y0[17] - src_y1[17]; - vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); - vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); - vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); - vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); - vec0 += vec2; - vec1 += vec3; - vec4 += vec2; - vec5 += vec3; - vec0 += vec4; - vec1 += vec5; - vec0 = __msa_add_a_h(zero, vec0); - vec1 = __msa_add_a_h(zero, vec1); - vec0 = __msa_maxi_s_h(vec0, 0); - vec1 = __msa_maxi_s_h(vec1, 0); - vec0 = __msa_min_s_h(max, vec0); - vec1 = __msa_min_s_h(max, vec1); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_sobely); - src_y0 += 16; - src_y1 += 16; - dst_sobely += 16; - } -} - -void HalfFloatRow_MSA(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - int i; - v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; - v4f32 mult_vec; - v8i16 zero = {0}; - mult_vec[0] = 1.9259299444e-34f * scale; - mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); - - for (i = 0; i < width; i += 32) { - src0 = (v8u16)__msa_ld_h((void*)src, 0); - src1 = (v8u16)__msa_ld_h((void*)src, 16); - src2 = (v8u16)__msa_ld_h((void*)src, 32); - src3 = (v8u16)__msa_ld_h((void*)src, 48); - vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); - vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); - vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); - vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); - vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); - vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); - vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); - vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); - fvec0 = __msa_ffint_u_w(vec0); - fvec1 = __msa_ffint_u_w(vec1); - fvec2 = __msa_ffint_u_w(vec2); - fvec3 = __msa_ffint_u_w(vec3); - fvec4 = __msa_ffint_u_w(vec4); - fvec5 = __msa_ffint_u_w(vec5); - fvec6 = __msa_ffint_u_w(vec6); - fvec7 = __msa_ffint_u_w(vec7); - fvec0 *= mult_vec; - fvec1 *= mult_vec; - fvec2 *= mult_vec; - fvec3 *= mult_vec; - fvec4 *= mult_vec; - fvec5 *= mult_vec; - fvec6 *= mult_vec; - fvec7 *= mult_vec; - vec0 = ((v4u32)fvec0) >> 13; - vec1 = ((v4u32)fvec1) >> 13; - vec2 = ((v4u32)fvec2) >> 13; - vec3 = ((v4u32)fvec3) >> 13; - vec4 = ((v4u32)fvec4) >> 13; - vec5 = ((v4u32)fvec5) >> 13; - vec6 = 
((v4u32)fvec6) >> 13; - vec7 = ((v4u32)fvec7) >> 13; - dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); - dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); - dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); - dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); - ST_UH2(dst0, dst1, dst, 8); - ST_UH2(dst2, dst3, dst + 16, 8); - src += 32; - dst += 32; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/thirdparty/libyuv/source/row_neon.cc b/thirdparty/libyuv/source/row_neon.cc deleted file mode 100644 index ccc4af6..0000000 --- a/thirdparty/libyuv/source/row_neon.cc +++ /dev/null @@ -1,3577 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#include - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -// q0: Y uint16x8_t -// d2: U uint8x8_t -// d3: V uint8x8_t - -// Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.32 {d2[0]}, [%[src_u]]! \n" \ - "vld1.32 {d2[1]}, [%[src_v]]! \n" \ - "vmov.u8 d1, d0 \n" \ - "vmovl.u8 q1, d2 \n" \ - "vzip.u8 d0, d1 \n" \ - "vsli.u16 q1, q1, #8 \n" - -// Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.8 {d2}, [%[src_u]]! \n" \ - "vmovl.u8 q0, d0 \n" \ - "vld1.8 {d3}, [%[src_v]]! \n" \ - "vsli.u16 q0, q0, #8 \n" - -// Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vmov.u8 q1, #128 \n" \ - "vmovl.u8 q0, d0 \n" \ - "vsli.u16 q0, q0, #8 \n" - -// Read 8 Y and 4 UV from NV12 -#define READNV12 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.8 {d2}, [%[src_uv]]! \n" \ - "vmov.u8 d1, d0 \n" \ - "vmov.u8 d3, d2 \n" \ - "vzip.u8 d0, d1 \n" \ - "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \ - "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */ - -// Read 8 Y and 4 VU from NV21 -#define READNV21 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.8 {d2}, [%[src_vu]]! \n" \ - "vmov.u8 d1, d0 \n" \ - "vmov.u8 d3, d2 \n" \ - "vzip.u8 d0, d1 \n" \ - "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \ - "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */ - -// Read 8 YUY2 -#define READYUY2 \ - "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \ - "vmovl.u8 q0, d0 \n" \ - "vmov.u8 d3, d2 \n" \ - "vsli.u16 q0, q0, #8 \n" \ - "vsli.u16 d2, d2, #8 \n" \ - "vsri.u16 d3, d3, #8 \n" - -// Read 8 UYVY -#define READUYVY \ - "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \ - "vmovl.u8 q0, d3 \n" \ - "vmov.u8 d3, d2 \n" \ - "vsli.u16 q0, q0, #8 \n" \ - "vsli.u16 d2, d2, #8 \n" \ - "vsri.u16 d3, d3, #8 \n" - -#define YUVTORGB_SETUP \ - "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \ - "vld1.16 {d31[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d20[], d21[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d22[], d23[]}, [%[kRGBCoeffBias]]! 
\n" \ - "vld1.16 {d24[], d25[]}, [%[kRGBCoeffBias]] \n" - -// q0: B uint16x8_t -// q1: G uint16x8_t -// q2: R uint16x8_t - -// Convert from YUV to 2.14 fixed point RGB -#define YUVTORGB \ - "vmull.u16 q2, d1, d31 \n" \ - "vmull.u8 q8, d3, d29 \n" /* DGV */ \ - "vmull.u16 q0, d0, d31 \n" \ - "vmlal.u8 q8, d2, d28 \n" /* DG */ \ - "vqshrn.u32 d0, q0, #16 \n" \ - "vqshrn.u32 d1, q2, #16 \n" /* Y */ \ - "vmull.u8 q9, d2, d26 \n" /* DB */ \ - "vmull.u8 q2, d3, d27 \n" /* DR */ \ - "vadd.u16 q4, q0, q11 \n" /* G */ \ - "vadd.u16 q2, q0, q2 \n" /* R */ \ - "vadd.u16 q0, q0, q9 \n" /* B */ \ - "vqsub.u16 q1, q4, q8 \n" /* G */ \ - "vqsub.u16 q0, q0, q10 \n" /* B */ \ - "vqsub.u16 q2, q2, q12 \n" /* R */ - -// Convert from 2.14 fixed point RGB To 8 bit RGB -#define RGBTORGB8 \ - "vqshrn.u16 d4, q2, #6 \n" /* R */ \ - "vqshrn.u16 d2, q1, #6 \n" /* G */ \ - "vqshrn.u16 d0, q0, #6 \n" /* B */ - -#define YUVTORGB_REGS \ - "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31" - -#define STORERGBA \ - "vmov.u8 d1, d0 \n" \ - "vmov.u8 d3, d4 \n" \ - "vmov.u8 d0, d6 \n" \ - "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n" - -void I444ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I422ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I444AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 - "vld1.8 {d6}, [%[src_a]]! \n" - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I422AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "vld1.8 {d6}, [%[src_a]]! \n" - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I422ToRGBARow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I422ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTORGB565 \ - "vshll.u8 q2, d4, #8 \n" /* R */ \ - "vshll.u8 q1, d2, #8 \n" /* G */ \ - "vshll.u8 q0, d0, #8 \n" /* B */ \ - "vsri.16 q2, q1, #5 \n" /* RG */ \ - "vsri.16 q2, q0, #11 \n" /* RGB */ - -void I422ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 - "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
- "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTOARGB1555 \ - "vshll.u8 q3, d6, #8 \n" /* A */ \ - "vshll.u8 q2, d4, #8 \n" /* R */ \ - "vshll.u8 q1, d2, #8 \n" /* G */ \ - "vshll.u8 q0, d0, #8 \n" /* B */ \ - "vsri.16 q3, q2, #1 \n" /* AR */ \ - "vsri.16 q3, q1, #6 \n" /* ARG */ \ - "vsri.16 q3, q0, #11 \n" /* ARGB */ - -void I422ToARGB1555Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vmov.u8 d6, #0xff \n" ARGBTOARGB1555 - "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555. - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "q3"); -} - -#define ARGBTOARGB4444 \ - "vshr.u8 d0, d0, #4 \n" /* B */ \ - "vbic.32 d2, d2, d7 \n" /* G */ \ - "vshr.u8 d4, d4, #4 \n" /* R */ \ - "vbic.32 d6, d6, d7 \n" /* A */ \ - "vorr d0, d0, d2 \n" /* BG */ \ - "vorr d1, d4, d6 \n" /* RA */ \ - "vzip.u8 d0, d1 \n" /* BGRA */ - -void I422ToARGB4444Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "vmov.u8 d7, #0x0f \n" // vbic bits to clear - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" ARGBTOARGB4444 - "vst1.8 {q0}, [%[dst_argb4444]]! \n" // store 8 pixels - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "q3"); -} - -void I400ToARGBRow_NEON(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV400 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( - "vmov.u8 d23, #255 \n" - "1: \n" - "vld1.8 {d20}, [%0]! \n" - "vmov d21, d20 \n" - "vmov d22, d20 \n" - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d20", "d21", "d22", "d23"); -} - -void NV12ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void NV21ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV21 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_vu] "+r"(src_vu), // %[src_vu] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void NV12ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -void NV21ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV21 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_vu] "+r"(src_vu), // %[src_vu] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -void NV12ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" ARGBTORGB565 - "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
- "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUY2 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READUYVY YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store U - "vst1.8 {q1}, [%2]! \n" // store V - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load U - "vld1.8 {q1}, [%1]! \n" // load V - "subs %3, %3, #16 \n" // 16 processed per loop - "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV - "bgt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. -void SplitRGBRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB - "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB - "subs %4, %4, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store R - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%3]! 
\n" // store B - "bgt 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "d0", "d1", "d2" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time -void MergeRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - asm volatile( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q2}, [%2]! \n" // load B - "subs %4, %4, #16 \n" // 16 processed per loop - "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB - "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. -void SplitARGBRow_NEON(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB - "subs %5, %5, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%3]! \n" // store B - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%1]! \n" // store R - "vst1.8 {q3}, [%4]! \n" // store A - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time -void MergeARGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q0}, [%2]! \n" // load B - "vld1.8 {q3}, [%3]! \n" // load A - "subs %5, %5, #16 \n" // 16 processed per loop - "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB - "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. -void SplitXRGBRow_NEON(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB - "subs %4, %4, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%3]! \n" // store B - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%1]! \n" // store R - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time -void MergeXRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 q3, #255 \n" // load A(255) - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q0}, [%2]! 
\n" // load B - "subs %4, %4, #16 \n" // 16 processed per loop - "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB - "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void MergeXR30Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width) { - int shift = 10 - depth; - asm volatile( - "vmov.u32 q14, #1023 \n" - "vdup.32 q15, %5 \n" - "1: \n" - "vld1.16 {d4}, [%2]! \n" // B - "vld1.16 {d2}, [%1]! \n" // G - "vld1.16 {d0}, [%0]! \n" // R - "vmovl.u16 q2, d4 \n" // B - "vmovl.u16 q1, d2 \n" // G - "vmovl.u16 q0, d0 \n" // R - "vshl.u32 q2, q2, q15 \n" // 000B - "vshl.u32 q1, q1, q15 \n" - "vshl.u32 q0, q0, q15 \n" - "vmin.u32 q2, q2, q14 \n" - "vmin.u32 q1, q1, q14 \n" - "vmin.u32 q0, q0, q14 \n" - "vsli.u32 q2, q1, #10 \n" // 00GB - "vsli.u32 q2, q0, #20 \n" // 0RGB - "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) - "subs %4, %4, #4 \n" - "vst1.8 {q2}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "q0", "q1", "q2", "q14", "q15"); -} - -void MergeXR30Row_10_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int /* depth */, - int width) { - asm volatile( - "vmov.u32 q14, #1023 \n" - "1: \n" - "vld1.16 {d4}, [%2]! \n" // B - "vld1.16 {d2}, [%1]! \n" // G - "vld1.16 {d0}, [%0]! \n" // R - "vmovl.u16 q2, d4 \n" // 000B - "vmovl.u16 q1, d2 \n" // G - "vmovl.u16 q0, d0 \n" // R - "vmin.u32 q2, q2, q14 \n" - "vmin.u32 q1, q1, q14 \n" - "vmin.u32 q0, q0, q14 \n" - "vsli.u32 q2, q1, #10 \n" // 00GB - "vsli.u32 q2, q0, #20 \n" // 0RGB - "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) - "subs %4, %4, #4 \n" - "vst1.8 {q2}, [%3]! \n" - "bgt 1b \n" - "3: \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "q0", "q1", "q2", "q14"); -} - -void MergeAR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile( - - "vdup.u16 q15, %6 \n" - "vdup.u16 q14, %7 \n" - "1: \n" - "vld1.16 {q2}, [%0]! \n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vld1.16 {q3}, [%3]! \n" // A - "vmin.u16 q2, q2, q14 \n" - "vmin.u16 q1, q1, q14 \n" - "vmin.u16 q0, q0, q14 \n" - "vmin.u16 q3, q3, q14 \n" - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "vshl.u16 q3, q3, q15 \n" - "subs %5, %5, #8 \n" - "vst4.16 {d0, d2, d4, d6}, [%4]! \n" - "vst4.16 {d1, d3, d5, d7}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_ar64), // %4 - "+r"(width) // %5 - : "r"(shift), // %6 - "r"(mask) // %7 - : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); -} - -void MergeXR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile( - - "vmov.u8 q3, #0xff \n" // A (0xffff) - "vdup.u16 q15, %5 \n" - "vdup.u16 q14, %6 \n" - "1: \n" - "vld1.16 {q2}, [%0]! 
\n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vmin.u16 q2, q2, q14 \n" - "vmin.u16 q1, q1, q14 \n" - "vmin.u16 q0, q0, q14 \n" - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "subs %4, %4, #8 \n" - "vst4.16 {d0, d2, d4, d6}, [%3]! \n" - "vst4.16 {d1, d3, d5, d7}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar64), // %3 - "+r"(width) // %4 - : "r"(shift), // %5 - "r"(mask) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); -} - -void MergeARGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width) { - int shift = 8 - depth; - asm volatile( - - "vdup.16 q15, %6 \n" - "1: \n" - "vld1.16 {q2}, [%0]! \n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vld1.16 {q3}, [%3]! \n" // A - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "vshl.u16 q3, q3, q15 \n" - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d1, q1 \n" - "vqmovn.u16 d2, q2 \n" - "vqmovn.u16 d3, q3 \n" - "subs %5, %5, #8 \n" - "vst4.8 {d0, d1, d2, d3}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : "r"(shift) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); -} - -void MergeXRGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width) { - int shift = 8 - depth; - asm volatile( - - "vdup.16 q15, %5 \n" - "vmov.u8 d6, #0xff \n" // A (0xff) - "1: \n" - "vld1.16 {q2}, [%0]! \n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "vqmovn.u16 d5, q2 \n" - "vqmovn.u16 d4, q1 \n" - "vqmovn.u16 d3, q0 \n" - "subs %4, %4, #8 \n" - "vst4.u8 {d3, d4, d5, d6}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "q0", "q1", "q2", "d6", "q15"); -} - -// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. -void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 - "subs %2, %2, #32 \n" // 32 processed per loop - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// SetRow writes 'width' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile( - "vdup.8 q0, %2 \n" // duplicate 16 bytes - "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v8) // %2 - : "cc", "memory", "q0"); -} - -// ARGBSetRow writes 'width' pixels using an 32 bit value repeated. -void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile( - "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1: \n" - "subs %1, %1, #4 \n" // 4 pixels per loop - "vst1.8 {q0}, [%0]! 
\n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v32) // %2 - : "cc", "memory", "q0"); -} - -void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - // Start at end of source row. - "add %0, %0, %2 \n" - "sub %0, %0, #32 \n" // 32 bytes per loop - - "1: \n" - "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32 - "subs %2, #32 \n" // 32 pixels per loop. - "vrev64.8 q0, q2 \n" - "vrev64.8 q1, q1 \n" - "vswp d0, d1 \n" - "vswp d2, d3 \n" - "vst1.8 {q0, q1}, [%1]! \n" // dst += 32 - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(-32) // %3 - : "cc", "memory", "q0", "q1", "q2"); -} - -void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile( - // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %2, lsl #1 \n" - "sub %0, #16 \n" - - "1: \n" - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst2.8 {d0, d1}, [%1]! \n" // dst += 16 - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r12", "q0"); -} - -void MirrorSplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" - - "1: \n" - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst1.8 {d0}, [%1]! \n" // dst += 8 - "vst1.8 {d1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "r12", "q0"); -} - -void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "add %0, %0, %2, lsl #2 \n" - "sub %0, #32 \n" - - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 d0, d0 \n" - "vrev64.8 d1, d1 \n" - "vrev64.8 d2, d2 \n" - "vrev64.8 d3, d3 \n" - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32 - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(-32) // %3 - : "cc", "memory", "d0", "d1", "d2", "d3"); -} - -void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width) { - src_rgb24 += width * 3 - 24; - asm volatile( - "1: \n" - "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 d0, d0 \n" - "vrev64.8 d1, d1 \n" - "vrev64.8 d2, d2 \n" - "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24 - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "r"(-24) // %3 - : "cc", "memory", "d0", "d1", "d2"); -} - -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst4.8 {d1, d2, d3, d4}, [%1]! 
\n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( - "vmov.u8 d0, #255 \n" // Alpha - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgba), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} -void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile( - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of - // RGB24. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3" // Clobber List - ); -} - -#define RGB565TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ - "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ - "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ - -void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -#define ARGB1555TOARGB \ - "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ - "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ - "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ - "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ - "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ - "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ - "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ - "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ - "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ - "vorr.u8 q1, q1, q3 \n" /* R,A */ \ - "vorr.u8 q0, q0, q2 \n" /* B,G */ - -// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. -#define RGB555TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ - "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ - "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ - -void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. 
- ARGB1555TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -#define ARGB4444TOARGB \ - "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ - "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ - "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ - "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ - "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ - "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ - "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ - "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ - -void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -void ARGBToRGB24Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb24, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of - // RGB24. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile( - "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d1}, [%1]! \n" // store 8 U. - "vst1.8 {d3}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. 
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d0}, [%1]! \n" // store 8 U. - "vst1.8 {d2}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "add %1, %0, %1 \n" // stride + src_yuy2 - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V - "vst1.8 {d1}, [%2]! \n" // store 8 U. - "vst1.8 {d3}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(stride_yuy2), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", - "d7" // Clobber List - ); -} - -void UYVYToUVRow_NEON(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "add %1, %0, %1 \n" // stride + src_uyvy - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V - "vst1.8 {d0}, [%2]! \n" // store 8 U. - "vst1.8 {d2}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(stride_uyvy), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", - "d7" // Clobber List - ); -} - -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile( - "vld1.8 {q2}, [%3] \n" // shuffler - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels - "vst1.8 {q1}, [%1]! \n" // store 4. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -void I422ToYUY2Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile( - "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys - "vld1.8 {d1}, [%1]! \n" // load 8 Us - "vld1.8 {d3}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3"); -} - -void I422ToUYVYRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile( - "1: \n" - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys - "vld1.8 {d0}, [%1]! \n" // load 8 Us - "vld1.8 {d2}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. 
- "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3"); -} - -void ARGBToRGB565Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb565, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - "vst1.8 {q2}, [%1]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "d6"); -} - -void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - asm volatile( - "vdup.32 d7, %2 \n" // dither4 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d7 \n" - "vqadd.u8 d2, d2, d7 \n" - "vqadd.u8 d4, d4, d7 \n" // add for dither - ARGBTORGB565 - "vst1.8 {q2}, [%0]! \n" // store 8 RGB565. - "bgt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb1555, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - "vst1.8 {q3}, [%1]! \n" // store 8 ARGB1555. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb4444, - int width) { - asm volatile( - "vmov.u8 d7, #0x0f \n" // bits to clear with - // vbic. - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q3}, [%1]! \n" // store 16 A's. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d1, d24 \n" // B - "vmlal.u8 q2, d2, d25 \n" // G - "vmlal.u8 q2, d3, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - -// 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 - // coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R - "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned - - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B - "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned - - "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V - - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", - "q15"); -} - -// clang-format off -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -#define RGBTOUV(QB, QG, QR) \ - "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ - "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ - "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ - "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ - "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ - "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ - "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ - "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ - "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ - "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ -// clang-format on - -// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 
-void ARGBToUVRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride_argb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -// TODO(fbarchard): Subsample match C code. -void ARGBToUVJRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride_argb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void BGRAToUVRow_NEON(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_bgra - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. - "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. - "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q1, q1, #1 \n" // 2x average - "vrshr.u16 q2, q2, #1 \n" - "vrshr.u16 q3, q3, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q3, q2, q1) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(src_stride_bgra), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void ABGRToUVRow_NEON(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_abgr - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
- "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(src_stride_abgr), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void RGBAToUVRow_NEON(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgba - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. - "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. - "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(src_stride_rgba), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgb24 - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
- "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(src_stride_rgb24), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void RAWToUVRow_NEON(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_raw - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(src_stride_raw), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. 
- "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_stride_rgb565), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_stride_argb1555), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
-void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q0, q4, #1 \n" // 2x average - "vrshr.u16 q1, q5, #1 \n" - "vrshr.u16 q2, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_stride_argb4444), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} - -void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); -} - -void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
- "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); -} - -void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); -} - -static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, - 10, 9, 8, 11, 14, 13, 12, 15}; - -void ARGBToAR64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile( - "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q2}, [%0]! \n" - "vmov.u8 q1, q0 \n" - "vmov.u8 q3, q2 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels - "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void ARGBToAB64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile( - "vld1.8 q4, %3 \n" // shuffler - "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q2}, [%0]! \n" - "vtbl.8 d2, {d0, d1}, d8 \n" - "vtbl.8 d3, {d0, d1}, d9 \n" - "vtbl.8 d6, {d4, d5}, d8 \n" - "vtbl.8 d7, {d4, d5}, d9 \n" - "vmov.u8 q0, q1 \n" - "vmov.u8 q2, q3 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels - "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); -} - -void AR64ToARGBRow_NEON(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - "vld1.16 {q0}, [%0]! \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vld1.16 {q3}, [%0]! \n" - "vshrn.u16 d0, q0, #8 \n" - "vshrn.u16 d1, q1, #8 \n" - "vshrn.u16 d4, q2, #8 \n" - "vshrn.u16 d5, q3, #8 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 4 pixels - "vst1.8 {q2}, [%1]! \n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; - -void AB64ToARGBRow_NEON(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile( - "vld1.8 d8, %3 \n" // shuffler - "1: \n" - "vld1.16 {q0}, [%0]! \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vld1.16 {q3}, [%0]! \n" - "vtbl.8 d0, {d0, d1}, d8 \n" - "vtbl.8 d1, {d2, d3}, d8 \n" - "vtbl.8 d4, {d4, d5}, d8 \n" - "vtbl.8 d5, {d6, d7}, d8 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 4 pixels - "vst1.8 {q2}, [%1]! 
\n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAB64ToARGB) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); -} - -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); -} - -void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); -} - -void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); -} - -void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
- "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); -} - -void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); -} - -void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - asm volatile( - "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q4, d0, d4 \n" // B - "vmlal.u8 q4, d1, d5 \n" // G - "vmlal.u8 q4, d2, d6 \n" // R - "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4"); -} - -void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - asm volatile( - "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q4, d0, d4 \n" // R - "vmlal.u8 q4, d1, d5 \n" // G - "vmlal.u8 q4, d2, d6 \n" // B - "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4"); -} - -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - asm volatile( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #128 \n" - "beq 50f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - "vst1.8 {q0}, [%0]! 
\n" - "bgt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction) // %4 - : - : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); -} - -// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - "subs %3, #8 \n" - "blt 89f \n" - // Blend 8 pixels. - "8: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. - "bge 8b \n" - - "89: \n" - "adds %3, #8-1 \n" - "blt 99f \n" - - // Blend 1 pixels. - "1: \n" - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. - "subs %3, %3, #1 \n" // 1 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. - "bge 1b \n" - - "99: \n" - - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"); -} - -// Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile( - // Attenuate 8 pixels. - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a - "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 - "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 - "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); -} - -// Quantize 8 ARGB pixels (32 bytes). -// dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - asm volatile( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add - - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 
255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" - "vqdmulh.s16 q0, q0, q8 \n" // b * scale - "vqdmulh.s16 q1, q1, q8 \n" // g - "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"); -} - -// Shade 8 pixels at a time by specified value. -// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. -// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - asm volatile( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. - "vshr.u16 q0, q0, #1 \n" // scale / 2. - - // 8 pixel loop. - "1: \n" - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" - "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 - "vqrdmulh.s16 q11, q11, d0[1] \n" // g - "vqrdmulh.s16 q12, q12, d0[2] \n" // r - "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "q0", "q10", "q11", "q12", "q13"); -} - -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -// Similar to ARGBToYJ but stores ARGB. -// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; -void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
-// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 -void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13", - "q14", "q15"); -} - -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// TODO(fbarchard): Was same as Sepia except matrix is provided. This function -// needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - asm volatile( - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - - "1: \n" - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q11, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R - "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); -} - -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -void SobelRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - asm volatile( - // 16 pixel loop. - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. - "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1"); -} - -// Mixes Sobel X, Sobel Y and Sobel into ARGB. -// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -void SobelXYRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1"); -} - -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -void SobelXRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - asm volatile( - "1: \n" - "vld1.8 {d0}, [%0],%5 \n" // top - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%1],%5 \n" // center * 2 - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%2],%5 \n" // bottom - "vld1.8 {d3}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%3]! 
\n" // store 8 sobelx - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2), // %5 - "r"(6) // %6 - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -void SobelYRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - asm volatile( - "1: \n" - "vld1.8 {d0}, [%0],%4 \n" // left - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%0],%4 \n" // center * 2 - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%0],%5 \n" // right - "vld1.8 {d3}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%2]! \n" // store 8 sobely - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1), // %4 - "r"(6) // %5 - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// %y passes a float as a scalar vector for vector * scalar multiply. -// the regoster must be d0 to d15 and indexed with [0] or [1] to access -// the float in the first or second float of the d-reg - -void HalfFloat1Row_NEON(const uint16_t* src, - uint16_t* dst, - float /*unused*/, - int width) { - asm volatile( - - "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(1.9259299444e-34f) // %3 - : "cc", "memory", "q1", "q2", "q3"); -} - -void HalfFloatRow_NEON(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - asm volatile( - - "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale * 1.9259299444e-34f) // %3 - : "cc", "memory", "q1", "q2", "q3"); -} - -void ByteToFloatRow_NEON(const uint8_t* src, - float* dst, - float scale, - int width) { - asm volatile( - - "1: \n" - "vld1.8 {d2}, [%0]! \n" // load 8 bytes - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u8 q1, d2 \n" // 8 shorts - "vmovl.u16 q2, d2 \n" // 8 ints - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // scale - "vmul.f32 q3, q3, %y3 \n" - "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale) // %3 - : "cc", "memory", "q1", "q2", "q3"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
-void GaussCol_NEON(const uint16_t* src0, - const uint16_t* src1, - const uint16_t* src2, - const uint16_t* src3, - const uint16_t* src4, - uint32_t* dst, - int width) { - asm volatile( - "vmov.u16 d6, #4 \n" // constant 4 - "vmov.u16 d7, #6 \n" // constant 6 - - "1: \n" - "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows - "vld1.16 {q2}, [%4]! \n" - "vaddl.u16 q0, d2, d4 \n" // * 1 - "vaddl.u16 q1, d3, d5 \n" // * 1 - "vld1.16 {q2}, [%1]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "vld1.16 {q2}, [%2]! \n" - "vmlal.u16 q0, d4, d7 \n" // * 6 - "vmlal.u16 q1, d5, d7 \n" // * 6 - "vld1.16 {q2}, [%3]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "subs %6, %6, #8 \n" // 8 processed per loop - "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples - "bgt 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(src4), // %4 - "+r"(dst), // %5 - "+r"(width) // %6 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { - const uint32_t* src1 = src + 1; - const uint32_t* src2 = src + 2; - const uint32_t* src3 = src + 3; - asm volatile( - "vmov.u32 q10, #4 \n" // constant 4 - "vmov.u32 q11, #6 \n" // constant 6 - - "1: \n" - "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples - "vld1.32 {q2}, [%0] \n" - "vadd.u32 q0, q0, q1 \n" // * 1 - "vadd.u32 q1, q1, q2 \n" // * 1 - "vld1.32 {q2, q3}, [%2]! \n" - "vmla.u32 q0, q2, q11 \n" // * 6 - "vmla.u32 q1, q3, q11 \n" // * 6 - "vld1.32 {q2, q3}, [%1]! \n" - "vld1.32 {q8, q9}, [%3]! \n" - "vadd.u32 q2, q2, q8 \n" // add rows for * 4 - "vadd.u32 q3, q3, q9 \n" - "vmla.u32 q0, q2, q10 \n" // * 4 - "vmla.u32 q1, q3, q10 \n" // * 4 - "subs %5, %5, #8 \n" // 8 processed per loop - "vqshrn.u32 d0, q0, #8 \n" // round and pack - "vqshrn.u32 d1, q1, #8 \n" - "vst1.u16 {q0}, [%4]! \n" // store 8 samples - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(dst), // %4 - "+r"(width) // %5 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); -} - -// Convert biplanar NV21 to packed YUV24 -void NV21ToYUV24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile( - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load 16 Y values - "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values - "vmov d1, d0 \n" - "vzip.u8 d0, d1 \n" // VV - "vmov d3, d2 \n" - "vzip.u8 d2, d3 \n" // UU - "subs %3, %3, #16 \n" // 16 pixels per loop - "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels - "vst3.8 {d1, d3, d5}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2"); -} - -void AYUVToUVRow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width) { - asm volatile( - "add %1, %0, %1 \n" // src_stride + src_AYUV - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV - // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV - // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV - // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 
- "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average - "vqrshrun.s16 d0, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_stride_ayuv), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -} - -void AYUVToVURow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width) { - asm volatile( - "add %1, %0, %1 \n" // src_stride + src_AYUV - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV - // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV - // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV - // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average - "vqrshrun.s16 d1, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_stride_ayuv), // %1 - "+r"(dst_vu), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -} - -// Copy row of AYUV Y's into Y. -// Similar to ARGBExtractAlphaRow_NEON -void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q2}, [%1]! \n" // store 16 Y's. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Convert UV plane of NV12 to VU of NV21. -void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( - "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values - "vld2.8 {d1, d3}, [%0]! \n" - "vorr.u8 q2, q0, q0 \n" // move U after V - "subs %2, %2, #16 \n" // 16 pixels per loop - "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2"); -} - -void HalfMergeUVRow_NEON(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - const uint8_t* src_u_1 = src_u + src_stride_u; - const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 16 U values - "vld1.8 {q1}, [%2]! \n" // load 16 V values - "vld1.8 {q2}, [%1]! \n" - "vld1.8 {q3}, [%3]! \n" - "vpaddl.u8 q0, q0 \n" // half size - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q1, q3 \n" - "vqrshrn.u16 d0, q0, #2 \n" - "vqrshrn.u16 d1, q1, #2 \n" - "subs %5, %5, #16 \n" // 16 src pixels per loop - "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels - "bgt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_u_1), // %1 - "+r"(src_v), // %2 - "+r"(src_v_1), // %3 - "+r"(dst_uv), // %4 - "+r"(width) // %5 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void SplitUVRow_16_NEON(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width) { - int shift = depth - 16; // Negative for right shift. - asm volatile( - "vdup.16 q2, %4 \n" - "1: \n" - "vld2.16 {q0, q1}, [%0]! 
\n" // load 8 UV - "vshl.u16 q0, q0, q2 \n" - "vshl.u16 q1, q1, q2 \n" - "subs %3, %3, #8 \n" // 8 src pixels per loop - "vst1.16 {q0}, [%1]! \n" // store 8 U pixels - "vst1.16 {q1}, [%2]! \n" // store 8 V pixels - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); -} - -void MergeUVRow_16_NEON(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width) { - int shift = 16 - depth; - asm volatile( - "vdup.16 q2, %4 \n" - "1: \n" - "vld1.16 {q0}, [%0]! \n" // load 8 U - "vld1.16 {q1}, [%1]! \n" // load 8 V - "vshl.u16 q0, q0, q2 \n" - "vshl.u16 q1, q1, q2 \n" - "subs %3, %3, #8 \n" // 8 src pixels per loop - "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels - "bgt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "q0", "q1", "q2"); -} - -void MultiplyRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile( - "vdup.16 q2, %2 \n" - "1: \n" - "vld1.16 {q0}, [%0]! \n" - "vld1.16 {q1}, [%0]! \n" - "vmul.u16 q0, q0, q2 \n" - "vmul.u16 q1, q1, q2 \n" - "vst1.16 {q0}, [%1]! \n" - "vst1.16 {q1}, [%1]! \n" - "subs %3, %3, #16 \n" // 16 src pixels per loop - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(scale), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2"); -} - -void DivideRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile( - "vdup.16 q0, %2 \n" - "1: \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vmovl.u16 q3, d2 \n" - "vmovl.u16 q1, d3 \n" - "vmovl.u16 q4, d4 \n" - "vmovl.u16 q2, d5 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vshl.u32 q1, q1, q0 \n" - "vshl.u32 q2, q2, q0 \n" - "vmovn.u32 d2, q3 \n" - "vmovn.u32 d3, q1 \n" - "vmovn.u32 d4, q4 \n" - "vmovn.u32 d5, q2 \n" - "vst1.16 {q1}, [%1]! \n" - "vst1.16 {q2}, [%1]! \n" - "subs %3, %3, #16 \n" // 16 src pixels per loop - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(scale), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_neon64.cc b/thirdparty/libyuv/source/row_neon64.cc deleted file mode 100644 index ba6ca5d..0000000 --- a/thirdparty/libyuv/source/row_neon64.cc +++ /dev/null @@ -1,3855 +0,0 @@ -/* - * Copyright 2014 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon armv8 64 bit. 
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -// v0.8h: Y -// v1.16b: 8U, 8V - -// Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - "ldr d0, [%[src_y]], #8 \n" \ - "ld1 {v1.s}[0], [%[src_u]], #4 \n" \ - "ld1 {v1.s}[1], [%[src_v]], #4 \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "zip1 v1.16b, v1.16b, v1.16b \n" \ - "prfm pldl1keep, [%[src_u], 128] \n" \ - "prfm pldl1keep, [%[src_v], 128] \n" - -// Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - "ldr d0, [%[src_y]], #8 \n" \ - "ld1 {v1.d}[0], [%[src_u]], #8 \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "ld1 {v1.d}[1], [%[src_v]], #8 \n" \ - "prfm pldl1keep, [%[src_u], 448] \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_v], 448] \n" - -// Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - "ldr d0, [%[src_y]], #8 \n" \ - "movi v1.16b, #128 \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" - -static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6, - 1, 1, 3, 3, 5, 5, 7, 7}; -static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7, - 0, 0, 2, 2, 4, 4, 6, 6}; - -// Read 8 Y and 4 UV from NV12 or NV21 -#define READNV12 \ - "ldr d0, [%[src_y]], #8 \n" \ - "ldr d1, [%[src_uv]], #8 \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "tbl v1.16b, {v1.16b}, v2.16b \n" \ - "prfm pldl1keep, [%[src_uv], 448] \n" - -// Read 8 YUY2 -#define READYUY2 \ - "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_yuy2], 448] \n" \ - "tbl v1.16b, {v1.16b}, v2.16b \n" - -// Read 8 UYVY -#define READUYVY \ - "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \ - "zip1 v0.16b, v4.16b, v4.16b \n" \ - "prfm pldl1keep, [%[src_uyvy], 448] \n" \ - "tbl v1.16b, {v3.16b}, v2.16b \n" - -// UB VR UG VG -// YG BB BG BR -#define YUVTORGB_SETUP \ - "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \ - "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n" - -// v16.8h: B -// v17.8h: G -// v18.8h: R - -// Convert from YUV to 2.14 fixed point RGB -#define YUVTORGB \ - "umull2 v3.4s, v0.8h, v24.8h \n" \ - "umull v6.8h, v1.8b, v30.8b \n" \ - "umull v0.4s, v0.4h, v24.4h \n" \ - "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \ - "uqshrn v0.4h, v0.4s, #16 \n" \ - "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \ - "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \ - "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \ - "add v17.8h, v0.8h, v26.8h \n" /* G */ \ - "add v16.8h, v0.8h, v4.8h \n" /* B */ \ - "add v18.8h, v0.8h, v5.8h \n" /* R */ \ - "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \ - "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \ - "uqsub v18.8h, v18.8h, v27.8h \n" /* R */ - -// Convert from 2.14 fixed point RGB To 8 bit RGB -#define RGBTORGB8 \ - "uqshrn v17.8b, v17.8h, #6 \n" \ - "uqshrn v16.8b, v16.8h, #6 \n" \ - "uqshrn v18.8b, v18.8h, #6 \n" - -#define YUVTORGB_REGS \ - "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \ - "v26", "v27", "v28", "v29", "v30", "v31" - -void I444ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] 
"+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I422ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I444AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" - "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 - "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I422AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" - "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 - "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I422ToRGBARow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v15.8b, #255 \n" /* A */ - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v15"); -} - -void 
I422ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTORGB565 \ - "shll v18.8h, v18.8b, #8 \n" /* R */ \ - "shll v17.8h, v17.8b, #8 \n" /* G */ \ - "shll v16.8h, v16.8b, #8 \n" /* B */ \ - "sri v18.8h, v17.8h, #5 \n" /* RG */ \ - "sri v18.8h, v16.8h, #11 \n" /* RGB */ - -void I422ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565 - "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTOARGB1555 \ - "shll v0.8h, v19.8b, #8 \n" /* A */ \ - "shll v18.8h, v18.8b, #8 \n" /* R */ \ - "shll v17.8h, v17.8b, #8 \n" /* G */ \ - "shll v16.8h, v16.8b, #8 \n" /* B */ \ - "sri v0.8h, v18.8h, #1 \n" /* AR */ \ - "sri v0.8h, v17.8h, #6 \n" /* ARG */ \ - "sri v0.8h, v16.8h, #11 \n" /* ARGB */ - -void I422ToARGB1555Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" ARGBTOARGB1555 - "st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels - // RGB565. - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -#define ARGBTOARGB4444 \ - /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \ - "ushr v16.8b, v16.8b, #4 \n" /* B */ \ - "bic v17.8b, v17.8b, v23.8b \n" /* G */ \ - "ushr v18.8b, v18.8b, #4 \n" /* R */ \ - "bic v19.8b, v19.8b, v23.8b \n" /* A */ \ - "orr v0.8b, v16.8b, v17.8b \n" /* BG */ \ - "orr v1.8b, v18.8b, v19.8b \n" /* RA */ \ - "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ - -void I422ToARGB4444Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v23.16b, #0x0f \n" // bits to clear with - // vbic. 
- "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "movi v19.8b, #255 \n" ARGBTOARGB4444 - "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8 - // pixels - // ARGB4444. - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19", "v23"); -} - -void I400ToARGBRow_NEON(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV400 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( - "movi v23.8b, #255 \n" - "1: \n" - "ld1 {v20.8b}, [%0], #8 \n" - "prfm pldl1keep, [%0, 448] \n" - "orr v21.8b, v20.8b, v20.8b \n" - "orr v22.8b, v20.8b, v20.8b \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v20", "v21", "v22", "v23"); -} - -void NV12ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -void NV21ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_vu), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV21Table) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -void NV12ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st3 
{v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2"); -} - -void NV21ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_vu), // %[src_uv] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV21Table) - : "cc", "memory", YUVTORGB_REGS, "v2"); -} - -void NV12ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" ARGBTORGB565 - "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 - // pixels - // RGB565. - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2"); -} - -void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READYUY2 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READUYVY YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 
-void SplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "subs %w3, %w3, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%1], #16 \n" // store U - "st1 {v1.16b}, [%2], #16 \n" // store V - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load U - "ld1 {v1.16b}, [%1], #16 \n" // load V - "subs %w3, %w3, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV - "b.gt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. -void SplitRGBRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%1], #16 \n" // store R - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%3], #16 \n" // store B - "b.gt 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time -void MergeRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - asm volatile( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load R - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v2.16b}, [%2], #16 \n" // load B - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); -} - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. 
-void SplitARGBRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "subs %w5, %w5, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%3], #16 \n" // store B - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%1], #16 \n" // store R - "st1 {v3.16b}, [%4], #16 \n" // store A - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time -void MergeARGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load R - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v0.16b}, [%2], #16 \n" // load B - "ld1 {v3.16b}, [%3], #16 \n" // load A - "subs %w5, %w5, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "prfm pldl1keep, [%3, 448] \n" - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. -void SplitXRGBRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%3], #16 \n" // store B - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%1], #16 \n" // store R - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time -void MergeXRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v3.16b, #255 \n" // load A(255) - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load R - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v0.16b}, [%2], #16 \n" // load B - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void MergeXR30Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width) { - int shift = 10 - depth; - asm volatile( - "movi v30.16b, #255 \n" - "ushr v30.4s, v30.4s, #22 \n" // 1023 - "dup v31.4s, %w5 \n" - "1: \n" - "ldr d2, [%2], #8 \n" // B - "ldr d1, [%1], #8 \n" // G - "ldr d0, [%0], #8 \n" // R - "ushll 
v2.4s, v2.4h, #0 \n" // B - "ushll v1.4s, v1.4h, #0 \n" // G - "ushll v0.4s, v0.4h, #0 \n" // R - "ushl v2.4s, v2.4s, v31.4s \n" // 000B - "ushl v1.4s, v1.4s, v31.4s \n" // G - "ushl v0.4s, v0.4s, v31.4s \n" // R - "umin v2.4s, v2.4s, v30.4s \n" - "umin v1.4s, v1.4s, v30.4s \n" - "umin v0.4s, v0.4s, v30.4s \n" - "sli v2.4s, v1.4s, #10 \n" // 00GB - "sli v2.4s, v0.4s, #20 \n" // 0RGB - "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) - "subs %w4, %w4, #4 \n" - "str q2, [%3], #16 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "v0", "v1", "v2", "v30", "v31"); -} - -void MergeXR30Row_10_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int /* depth */, - int width) { - asm volatile( - "movi v30.16b, #255 \n" - "ushr v30.4s, v30.4s, #22 \n" // 1023 - "1: \n" - "ldr d2, [%2], #8 \n" // B - "ldr d1, [%1], #8 \n" // G - "ldr d0, [%0], #8 \n" // R - "ushll v2.4s, v2.4h, #0 \n" // 000B - "ushll v1.4s, v1.4h, #0 \n" // G - "ushll v0.4s, v0.4h, #0 \n" // R - "umin v2.4s, v2.4s, v30.4s \n" - "umin v1.4s, v1.4s, v30.4s \n" - "umin v0.4s, v0.4s, v30.4s \n" - "sli v2.4s, v1.4s, #10 \n" // 00GB - "sli v2.4s, v0.4s, #20 \n" // 0RGB - "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) - "subs %w4, %w4, #4 \n" - "str q2, [%3], #16 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "v0", "v1", "v2", "v30"); -} - -void MergeAR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile( - - "dup v30.8h, %w7 \n" - "dup v31.8h, %w6 \n" - "1: \n" - "ldr q2, [%0], #16 \n" // R - "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B - "ldr q3, [%3], #16 \n" // A - "umin v2.8h, v2.8h, v30.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "umin v1.8h, v1.8h, v30.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "umin v0.8h, v0.8h, v30.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "umin v3.8h, v3.8h, v30.8h \n" - "prfm pldl1keep, [%3, 448] \n" - "ushl v2.8h, v2.8h, v31.8h \n" - "ushl v1.8h, v1.8h, v31.8h \n" - "ushl v0.8h, v0.8h, v31.8h \n" - "ushl v3.8h, v3.8h, v31.8h \n" - "subs %w5, %w5, #8 \n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_ar64), // %4 - "+r"(width) // %5 - : "r"(shift), // %6 - "r"(mask) // %7 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -void MergeXR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile( - - "movi v3.16b, #0xff \n" // A (0xffff) - "dup v30.8h, %w6 \n" - "dup v31.8h, %w5 \n" - - "1: \n" - "ldr q2, [%0], #16 \n" // R - "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B - "umin v2.8h, v2.8h, v30.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "umin v1.8h, v1.8h, v30.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "umin v0.8h, v0.8h, v30.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "ushl v2.8h, v2.8h, v31.8h \n" - "ushl v1.8h, v1.8h, v31.8h \n" - "ushl v0.8h, v0.8h, v31.8h \n" - "subs %w4, %w4, #8 \n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // 
%1 - "+r"(src_b), // %2 - "+r"(dst_ar64), // %3 - "+r"(width) // %4 - : "r"(shift), // %5 - "r"(mask) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -void MergeARGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width) { - int shift = 8 - depth; - asm volatile( - - "dup v31.8h, %w6 \n" - "1: \n" - "ldr q2, [%0], #16 \n" // R - "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B - "ldr q3, [%3], #16 \n" // A - "ushl v2.8h, v2.8h, v31.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v31.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "ushl v0.8h, v0.8h, v31.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "ushl v3.8h, v3.8h, v31.8h \n" - "prfm pldl1keep, [%3, 448] \n" - "uqxtn v2.8b, v2.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v0.8b, v0.8h \n" - "uqxtn v3.8b, v3.8h \n" - "subs %w5, %w5, #8 \n" - "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : "r"(shift) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -void MergeXRGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width) { - int shift = 8 - depth; - asm volatile( - - "dup v31.8h, %w5 \n" - "movi v3.8b, #0xff \n" // A (0xff) - "1: \n" - "ldr q2, [%0], #16 \n" // R - "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B - "ushl v2.8h, v2.8h, v31.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v31.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "ushl v0.8h, v0.8h, v31.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "uqxtn v2.8b, v2.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v0.8b, v0.8h \n" - "subs %w4, %w4, #8 \n" - "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -// Copy multiple of 32. -void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "1: \n" - "ldp q0, q1, [%0], #32 \n" - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #32 \n" // 32 processed per loop - "stp q0, q1, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// SetRow writes 'width' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile( - "dup v0.16b, %w2 \n" // duplicate 16 bytes - "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v8) // %2 - : "cc", "memory", "v0"); -} - -void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile( - "dup v0.4s, %w2 \n" // duplicate 4 ints - "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v32) // %2 - : "cc", "memory", "v0"); -} - -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - // Start at end of source row. 
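      // kShuffleMirror holds the byte indices 15..0, so each 'tbl' below
      // reverses the byte order within a 16-byte block; combined with loads
      // that walk backwards from the end of the row, every iteration emits
      // 32 mirrored pixels.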
- "ld1 {v3.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q2, [%0, 16] \n" - "ldr q1, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #32 \n" // 32 pixels per loop. - "tbl v0.16b, {v2.16b}, v3.16b \n" - "tbl v1.16b, {v1.16b}, v3.16b \n" - "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(&kShuffleMirror) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// Shuffle table for reversing the UV. -static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, - 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; - -void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile( - // Start at end of source row. - "ld1 {v4.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw #1 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"(&kShuffleMirrorUV) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -void MirrorSplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - // Start at end of source row. - "ld1 {v4.16b}, [%4] \n" // shuffler - "add %0, %0, %w3, sxtw #1 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w3, %w3, #16 \n" // 16 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "uzp1 v0.16b, v2.16b, v3.16b \n" // U - "uzp2 v1.16b, v2.16b, v3.16b \n" // V - "st1 {v0.16b}, [%1], #16 \n" // dst += 16 - "st1 {v1.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(&kShuffleMirrorUV) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -// Shuffle table for reversing the ARGB. -static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, - 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; - -void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - // Start at end of source row. - "ld1 {v4.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw #2 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #8 \n" // 8 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(&kShuffleMirrorARGB) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width) { - asm volatile( - "ld1 {v3.16b}, [%4] \n" // shuffler - "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #48 \n" - - "1: \n" - "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
- "tbl v0.16b, {v0.16b}, v3.16b \n" - "tbl v1.16b, {v1.16b}, v3.16b \n" - "tbl v2.16b, {v2.16b}, v3.16b \n" - "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48 - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-48), // %3 - "r"(&kShuffleMirror) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v4.8b, #255 \n" // Alpha - "1: \n" - "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of - // RGB24. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( - "movi v5.8b, #255 \n" // Alpha - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "prfm pldl1keep, [%0, 448] \n" - "orr v4.8b, v0.8b, v0.8b \n" // move r - "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( - "movi v0.8b, #255 \n" // Alpha - "1: \n" - "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v2.8b, v4.8b, v4.8b \n" // move g - "prfm pldl1keep, [%0, 448] \n" - "orr v1.8b, v5.8b, v5.8b \n" // move r - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgba), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile( - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "prfm pldl1keep, [%0, 448] \n" - "orr v4.8b, v0.8b, v0.8b \n" // move r - "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -#define RGB565TOARGB \ - "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ - "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ - "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ - "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ - "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ - "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ - "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ - "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ - "dup v2.2D, v0.D[1] \n" /* R */ - -void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List - ); -} - -#define ARGB1555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ - \ - "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ - "xtn2 v3.16b, v2.8h \n" \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ - "dup v1.2D, v0.D[1] \n" \ - "dup v3.2D, v2.D[1] \n" - -// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. -#define RGB555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ - "dup v1.2D, v0.D[1] \n" /* G */ - -void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b -// clobbers v3 -#define ARGB4444TOARGB \ - "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ - "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ - "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ - "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ - "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ - "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ - "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ - "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ - "dup v0.2D, v2.D[1] \n" \ - "dup v1.2D, v3.D[1] \n" - -void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void ARGBToRGB24Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb24, - int width) { - asm volatile( - "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "prfm pldl1keep, [%0, 448] \n" - "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of - // RGB24 - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile( - "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g - "prfm pldl1keep, [%0, 448] \n" - "orr v5.8b, v1.8b, v1.8b \n" // mov b - "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v1.8b}, [%1], #8 \n" // store 8 U. - "st1 {v3.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 U. - "st1 {v2.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U - "prfm pldl1keep, [%0, 448] \n" - "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V - "st1 {v1.8b}, [%2], #8 \n" // store 8 U. - "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 
- "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(src_yuy2b), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7" // Clobber List - ); -} - -void UYVYToUVRow_NEON(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U - "prfm pldl1keep, [%0, 448] \n" - "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V - "st1 {v0.8b}, [%2], #8 \n" // store 8 U. - "st1 {v2.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(src_uyvyb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7" // Clobber List - ); -} - -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile( - "ld1 {v2.16b}, [%3] \n" // shuffler - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "subs %w2, %w2, #4 \n" // 4 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels - "st1 {v1.16b}, [%1], #16 \n" // store 4. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); -} - -void I422ToYUY2Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile( - "1: \n" - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "subs %w4, %w4, #16 \n" // 16 pixels - "orr v2.8b, v1.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void I422ToUYVYRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile( - "1: \n" - "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "orr v3.8b, v2.8b, v2.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void ARGBToRGB565Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb565, - int width) { - asm volatile( - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 - // pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565 - "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v16", "v17", "v18", "v19"); -} - -void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - asm volatile( - "dup v1.4s, %w2 \n" // dither4 - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // load 8 - // pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v16.8b, v16.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "uqadd v17.8b, v17.8b, v1.8b \n" - "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565 - "st1 {v18.16b}, [%0], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); -} - -void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb1555, - int width) { - asm volatile( - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 - // pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v16", "v17", "v18", "v19"); -} - -void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb4444, - int width) { - asm volatile( - "movi v23.16b, #0x0f \n" // bits to clear with - // vbic. - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 - // pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23"); -} - -static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, - 10, 9, 8, 11, 14, 13, 12, 15}; - -void ARGBToAR64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile( - "1: \n" - "ldp q0, q2, [%0], #32 \n" // load 8 pixels - "mov v1.16b, v0.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "mov v3.16b, v2.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels - "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void ARGBToAB64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile( - "ld1 {v4.16b}, %3 \n" // shuffler - "1: \n" - "ldp q0, q2, [%0], #32 \n" // load 8 pixels - "tbl v0.16b, {v0.16b}, v4.16b \n" - "tbl v2.16b, {v2.16b}, v4.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "mov v1.16b, v0.16b \n" - "mov v3.16b, v2.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels - "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, - 17, 19, 21, 23, 25, 27, 29, 31}; - -void AR64ToARGBRow_NEON(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile( - "ld1 {v4.16b}, %3 \n" // shuffler - "1: \n" - "ldp q0, q1, [%0], #32 \n" // load 4 pixels - "ldp q2, q3, [%0], #32 \n" // load 4 pixels - "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "stp q0, q2, [%1], #32 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAR64ToARGB) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, - 21, 19, 17, 23, 29, 27, 25, 31}; - -void AB64ToARGBRow_NEON(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile( - "ld1 {v4.16b}, %3 \n" // shuffler - "1: \n" - "ldp q0, q1, [%0], #32 \n" // load 4 pixels - "ldp q2, q3, [%0], #32 \n" // load 4 pixels - "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "stp q0, q2, [%1], #32 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAB64ToARGB) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "umull v3.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v1.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v2.8b, v5.8b \n" // G - "umlal v0.8h, v3.8b, v6.8b \n" // R - "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -// 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movi v24.8b, #112 \n" // UB / VR 0.875 - // coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlsl v4.8h, v1.8b, v25.8b \n" // G - "umlsl v4.8h, v2.8b, v26.8b \n" // R - "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned - - "umull v3.8h, v2.8b, v24.8b \n" // R - "umlsl v3.8h, v1.8b, v28.8b \n" // G - "umlsl v3.8h, v0.8b, v27.8b \n" // B - "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned - - "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", - "v27", "v28", "v29"); -} - -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
-// clang-format off -#define RGBTOUV(QB, QG, QR) \ - "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ - "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ - "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ - "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ - "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ - "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ - "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ - "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ - "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ - "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ -// clang-format on - -// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. -// TODO(fbarchard): consider ptrdiff_t for all strides. - -void ARGBToUVRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb_1 = src_argb + src_stride_argb; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void ARGBToUVJRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb_1 = src_argb + src_stride_argb; - asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void BGRAToUVRow_NEON(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more - "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v3.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(src_bgra_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void ABGRToUVRow_NEON(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v3.8h, #1 \n" // 2x average - "urshr v2.8h, v2.8h, #1 \n" - "urshr v1.8h, v1.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v0.8h, v2.8h, v1.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(src_abgr_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void RGBAToUVRow_NEON(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. 
- "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(src_rgba_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(src_rgb24_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void RAWToUVRow_NEON(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_raw_1 = src_raw + src_stride_raw; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. - "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels - "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v2.8h, v2.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v0.8h, v0.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v2.8h, v1.8h, v0.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(src_raw_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16. 
-void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; - asm volatile( - RGBTOUV_SETUP_REG - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_rgb565_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", - "v28"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; - asm volatile( - RGBTOUV_SETUP_REG - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. 
- RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_argb1555_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", - "v28"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; - asm volatile( - RGBTOUV_SETUP_REG // sets v20-v25 - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_argb4444_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", - "v28" - - ); -} - -void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile( - "movi v24.8b, #25 \n" // B * 0.1016 coefficient - "movi v25.8b, #129 \n" // G * 0.5078 coefficient - "movi v26.8b, #66 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
- "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", - "v27"); -} - -void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "umull v3.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - asm volatile( - "movi v24.8b, #25 \n" // B * 0.1016 coefficient - "movi v25.8b, #129 \n" // G * 0.5078 coefficient - "movi v26.8b, #66 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); -} - -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // R - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // B - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // R - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // B - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
- "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { - asm volatile( - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v1.8b, v5.8b \n" // G - "umlal v0.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
- "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - asm volatile( - "movi v6.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v4.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v1.8b, v5.8b \n" // G - "umlal v0.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - - "dup v5.16b, %w4 \n" - "dup v4.16b, %w5 \n" - // General purpose row blend. - "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v2.8h, v0.8b, v4.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "umull2 v3.8h, v0.16b, v4.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "umlal v2.8h, v1.8b, v5.8b \n" - "umlal2 v3.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v2.8h, #8 \n" - "rshrn2 v0.16b, v3.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction), // %4 - "+r"(y0_fraction) // %5 - : - : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); -} - -// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - "subs %w3, %w3, #8 \n" - "b.lt 89f \n" - // Blend 8 pixels. - "8: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "umull v16.8h, v4.8b, v3.8b \n" // db * a - "prfm pldl1keep, [%0, 448] \n" - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "prfm pldl1keep, [%1, 448] \n" - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - // pixels - "b.ge 8b \n" - - "89: \n" - "adds %w3, %w3, #8-1 \n" - "b.lt 99f \n" - - // Blend 1 pixels. - "1: \n" - "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel - // ARGB0. - "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel - // ARGB1. - "subs %w3, %w3, #1 \n" // 1 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "prfm pldl1keep, [%0, 448] \n" - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "prfm pldl1keep, [%1, 448] \n" - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. - "b.ge 1b \n" - - "99: \n" - - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18"); -} - -// Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile( - // Attenuate 8 pixels. - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v3.8b \n" // b * a - "prfm pldl1keep, [%0, 448] \n" - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -// Quantize 8 ARGB pixels (32 bytes). -// dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - asm volatile( - "dup v4.8h, %w2 \n" - "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 - "dup v5.8h, %w3 \n" // interval multiply. - "dup v6.8h, %w4 \n" // interval add - - // 8 pixel loop. - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) - "prfm pldl1keep, [%0, 448] \n" - "uxtl v1.8h, v1.8b \n" - "uxtl v2.8h, v2.8b \n" - "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale - "sqdmulh v1.8h, v1.8h, v4.8h \n" // g - "sqdmulh v2.8h, v2.8h, v4.8h \n" // r - "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size - "mul v1.8h, v1.8h, v5.8h \n" // g - "mul v2.8h, v2.8h, v5.8h \n" // r - "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset - "add v1.8h, v1.8h, v6.8h \n" // g - "add v2.8h, v2.8h, v6.8h \n" // r - "uqxtn v0.8b, v0.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v2.8b, v2.8h \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -// Shade 8 pixels at a time by specified value. -// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. -// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - asm volatile( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. - - // 8 pixel loop. - "1: \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" // b (0 .. 255) - "prfm pldl1keep, [%0, 448] \n" - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g - "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r - "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" - "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); -} - -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -// Similar to ARGBToYJ but stores ARGB. -// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; -void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "movi v24.8b, #29 \n" // B * 0.1140 coefficient - "movi v25.8b, #150 \n" // G * 0.5870 coefficient - "movi v26.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v4.8h, v1.8b, v25.8b \n" // G - "umlal v4.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B - "orr v1.8b, v0.8b, v0.8b \n" // G - "orr v2.8b, v0.8b, v0.8b \n" // R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); -} - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
-// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 - -void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile( - "movi v20.8b, #17 \n" // BB coefficient - "movi v21.8b, #68 \n" // BG coefficient - "movi v22.8b, #35 \n" // BR coefficient - "movi v24.8b, #22 \n" // GB coefficient - "movi v25.8b, #88 \n" // GG coefficient - "movi v26.8b, #45 \n" // GR coefficient - "movi v28.8b, #24 \n" // BB coefficient - "movi v29.8b, #98 \n" // BG coefficient - "movi v30.8b, #50 \n" // BR coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B - "prfm pldl1keep, [%0, 448] \n" - "umlal v4.8h, v1.8b, v21.8b \n" // G - "umlal v4.8h, v2.8b, v22.8b \n" // R - "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G - "umlal v5.8h, v1.8b, v25.8b \n" // G - "umlal v5.8h, v2.8b, v26.8b \n" // R - "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R - "umlal v6.8h, v1.8b, v29.8b \n" // G - "umlal v6.8h, v2.8b, v30.8b \n" // R - "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B - "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G - "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); -} - -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// TODO(fbarchard): Was same as Sepia except matrix is provided. This function -// needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - asm volatile( - "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. - "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. - "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit - "prfm pldl1keep, [%0, 448] \n" - "uxtl v17.8h, v17.8b \n" // g - "uxtl v18.8h, v18.8b \n" // r - "uxtl v19.8h, v19.8b \n" // a - "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B - "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G - "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R - "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A - "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B - "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G - "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R - "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B - "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G - "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R - "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B - "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G - "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R - "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B - "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G - "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R - "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v22", "v23", "v24", "v25"); -} - -// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // multiply B - "prfm pldl1keep, [%0, 448] \n" - "umull v1.8h, v1.8b, v5.8b \n" // multiply G - "prfm pldl1keep, [%1, 448] \n" - "umull v2.8h, v2.8b, v6.8b \n" // multiply R - "umull v3.8h, v3.8b, v7.8b \n" // multiply A - "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B - "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G - "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R - "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. 
- "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v4.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "uqadd v1.8b, v1.8b, v5.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "uqadd v2.8b, v2.8b, v6.8b \n" - "uqadd v3.8b, v3.8b, v7.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqsub v0.8b, v0.8b, v4.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "uqsub v1.8b, v1.8b, v5.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "uqsub v2.8b, v2.8b, v6.8b \n" - "uqsub v3.8b, v3.8b, v7.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -void SobelRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v1.8b \n" // add - "prfm pldl1keep, [%0, 448] \n" - "orr v1.8b, v0.8b, v0.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "orr v2.8b, v0.8b, v0.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - asm volatile( - // 16 pixel loop. - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. - "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - "uqadd v0.16b, v0.16b, v1.16b \n" // add - "prfm pldl1keep, [%1, 448] \n" - "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1"); -} - -// Mixes Sobel X, Sobel Y and Sobel into ARGB. -// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -void SobelXYRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "prfm pldl1keep, [%0, 448] \n" - "uqadd v1.8b, v0.8b, v2.8b \n" // add - "prfm pldl1keep, [%1, 448] \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -void SobelXRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - asm volatile( - "1: \n" - "ld1 {v0.8b}, [%0],%5 \n" // top - "ld1 {v1.8b}, [%0],%6 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "ld1 {v2.8b}, [%1],%5 \n" // center * 2 - "ld1 {v3.8b}, [%1],%6 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%2],%5 \n" // bottom - "ld1 {v3.8b}, [%2],%6 \n" - "subs %w4, %w4, #8 \n" // 8 pixels - "prfm pldl1keep, [%2, 448] \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2LL), // %5 - "r"(6LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -void SobelYRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - asm volatile( - "1: \n" - "ld1 {v0.8b}, [%0],%4 \n" // left - "ld1 {v1.8b}, [%1],%4 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "ld1 {v2.8b}, [%0],%4 \n" // center * 2 - "ld1 {v3.8b}, [%1],%4 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%0],%5 \n" // right - "ld1 {v3.8b}, [%1],%5 \n" - "subs %w3, %w3, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "add v0.8h, v0.8h, v1.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1LL), // %4 - "r"(6LL) // %5 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Caveat - rounds float to half float whereas scaling version truncates. 
-void HalfFloat1Row_NEON(const uint16_t* src, - uint16_t* dst, - float /*unused*/, - int width) { - asm volatile( - "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "prfm pldl1keep, [%0, 448] \n" - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fcvtn v1.4h, v2.4s \n" // 8 half floats - "fcvtn2 v1.8h, v3.4s \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3"); -} - -void HalfFloatRow_NEON(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - asm volatile( - "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "prfm pldl1keep, [%0, 448] \n" - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent - "fmul v3.4s, v3.4s, %3.s[0] \n" - "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat - "uqshrn2 v1.8h, v3.4s, #13 \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale * 1.9259299444e-34f) // %3 - : "cc", "memory", "v1", "v2", "v3"); -} - -void ByteToFloatRow_NEON(const uint8_t* src, - float* dst, - float scale, - int width) { - asm volatile( - "1: \n" - "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v1.8h, v1.8b \n" // 8 shorts - "prfm pldl1keep, [%0, 448] \n" - "uxtl v2.4s, v1.4h \n" // 8 ints - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "fmul v3.4s, v3.4s, %3.s[0] \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale) // %3 - : "cc", "memory", "v1", "v2", "v3"); -} - -float ScaleMaxSamples_NEON(const float* src, - float* dst, - float scale, - int width) { - float fmax; - asm volatile( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" - - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "prfm pldl1keep, [%0, 448] \n" - "fmul v4.4s, v2.4s, %4.s[0] \n" // scale - "fmax v5.4s, v5.4s, v1.4s \n" // max - "fmax v6.4s, v6.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "fmax v5.4s, v5.4s, v6.4s \n" // max - "fmaxv %s3, v5.4s \n" // signed max acculator - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width), // %2 - "=w"(fmax) // %3 - : "w"(scale) // %4 - : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); - return fmax; -} - -float ScaleSumSamples_NEON(const float* src, - float* dst, - float scale, - int width) { - float fsum; - asm volatile( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" // max - - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "prfm pldl1keep, [%0, 448] \n" - "fmul v4.4s, v2.4s, %4.s[0] \n" - "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares - "fmla v6.4s, v2.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "faddp v5.4s, v5.4s, v6.4s \n" - "faddp v5.4s, v5.4s, v5.4s \n" - "faddp %3.4s, v5.4s, v5.4s \n" // sum - : "+r"(src), // %0 - 
"+r"(dst), // %1 - "+r"(width), // %2 - "=w"(fsum) // %3 - : "w"(scale) // %4 - : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); - return fsum; -} - -void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { - asm volatile( - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v1.4s, v1.4s, %3.s[0] \n" // scale - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale) // %3 - : "cc", "memory", "v1", "v2"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussCol_NEON(const uint16_t* src0, - const uint16_t* src1, - const uint16_t* src2, - const uint16_t* src3, - const uint16_t* src4, - uint32_t* dst, - int width) { - asm volatile( - "movi v6.8h, #4 \n" // constant 4 - "movi v7.8h, #6 \n" // constant 6 - - "1: \n" - "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows - "ld1 {v2.8h}, [%4], #16 \n" - "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 - "prfm pldl1keep, [%0, 448] \n" - "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 - "ld1 {v2.8h}, [%1], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "prfm pldl1keep, [%1, 448] \n" - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "ld1 {v2.8h}, [%2], #16 \n" - "umlal v0.4s, v2.4h, v7.4h \n" // * 6 - "prfm pldl1keep, [%2, 448] \n" - "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 - "ld1 {v2.8h}, [%3], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "prfm pldl1keep, [%3, 448] \n" - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "subs %w6, %w6, #8 \n" // 8 processed per loop - "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples - "prfm pldl1keep, [%4, 448] \n" - "b.gt 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(src4), // %4 - "+r"(dst), // %5 - "+r"(width) // %6 - : - : "cc", "memory", "v0", "v1", "v2", "v6", "v7"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { - const uint32_t* src1 = src + 1; - const uint32_t* src2 = src + 2; - const uint32_t* src3 = src + 3; - asm volatile( - "movi v6.4s, #4 \n" // constant 4 - "movi v7.4s, #6 \n" // constant 6 - - "1: \n" - "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples - "add v0.4s, v0.4s, v1.4s \n" // * 1 - "add v1.4s, v1.4s, v2.4s \n" // * 1 - "ld1 {v2.4s,v3.4s}, [%2], #32 \n" - "mla v0.4s, v2.4s, v7.4s \n" // * 6 - "mla v1.4s, v3.4s, v7.4s \n" // * 6 - "ld1 {v2.4s,v3.4s}, [%1], #32 \n" - "ld1 {v4.4s,v5.4s}, [%3], #32 \n" - "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 - "add v3.4s, v3.4s, v5.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "mla v0.4s, v2.4s, v6.4s \n" // * 4 - "mla v1.4s, v3.4s, v6.4s \n" // * 4 - "subs %w5, %w5, #8 \n" // 8 processed per loop - "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack - "uqrshrn2 v0.8h, v1.4s, #8 \n" - "st1 {v0.8h}, [%4], #16 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(dst), // %4 - "+r"(width) // %5 - : "r"(32LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f}; - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
-void GaussCol_F32_NEON(const float* src0, - const float* src1, - const float* src2, - const float* src3, - const float* src4, - float* dst, - int width) { - asm volatile( - "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 - - "1: \n" - "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows - "ld1 {v2.4s, v3.4s}, [%1], #32 \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "ld1 {v4.4s, v5.4s}, [%2], #32 \n" - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "fmla v0.4s, v4.4s, v7.4s \n" // * 6 - "ld1 {v2.4s, v3.4s}, [%3], #32 \n" - "fmla v1.4s, v5.4s, v7.4s \n" - "prfm pldl1keep, [%1, 448] \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "ld1 {v4.4s, v5.4s}, [%4], #32 \n" - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%2, 448] \n" - "fadd v0.4s, v0.4s, v4.4s \n" // * 1 - "prfm pldl1keep, [%3, 448] \n" - "fadd v1.4s, v1.4s, v5.4s \n" - "prfm pldl1keep, [%4, 448] \n" - "subs %w6, %w6, #8 \n" // 8 processed per loop - "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(src4), // %4 - "+r"(dst), // %5 - "+r"(width) // %6 - : "r"(&kGaussCoefficients) // %7 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussRow_F32_NEON(const float* src, float* dst, int width) { - asm volatile( - "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 - - "1: \n" - "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5 - // rows - "fadd v0.4s, v0.4s, v1.4s \n" // * 1 - "ld1 {v4.4s, v5.4s}, [%0], %5 \n" - "fadd v1.4s, v1.4s, v2.4s \n" - "fmla v0.4s, v4.4s, v7.4s \n" // * 6 - "ld1 {v2.4s, v3.4s}, [%0], %4 \n" - "fmla v1.4s, v5.4s, v7.4s \n" - "ld1 {v4.4s, v5.4s}, [%0], %6 \n" - "fadd v2.4s, v2.4s, v4.4s \n" - "fadd v3.4s, v3.4s, v5.4s \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "fmul v0.4s, v0.4s, v8.4s \n" // / 256 - "fmul v1.4s, v1.4s, v8.4s \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(&kGaussCoefficients), // %3 - "r"(8LL), // %4 - "r"(-4LL), // %5 - "r"(20LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); -} - -// Convert biplanar NV21 to packed YUV24 -void NV21ToYUV24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile( - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values - "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values - "prfm pldl1keep, [%0, 448] \n" - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #16 \n" // 16 pixels per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2"); -} - -// AYUV is YVUA in memory. UV for NV12 is UV order in memory. -void AYUVToUVRow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width) { - const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile( - - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. 
- "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v2.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_ayuv_1), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -void AYUVToVURow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width) { - const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile( - - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_ayuv_1), // %1 - "+r"(dst_vu), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Copy row of AYUV Y's into Y -void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// Shuffle table for swapping UV bytes. -static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, - 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; - -// Convert UV plane of NV12 to VU of NV21. 
-void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( - "ld1 {v2.16b}, [%3] \n" // shuffler - "1: \n" - "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values - "ld1 {v1.16b}, [%0], 16 \n" - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "tbl v0.16b, {v0.16b}, v2.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "tbl v1.16b, {v1.16b}, v2.16b \n" - "stp q0, q1, [%1], 32 \n" // store 16 VU pixels - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "r"(&kShuffleSwapUV) // %3 - : "cc", "memory", "v0", "v1", "v2"); -} - -void HalfMergeUVRow_NEON(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - const uint8_t* src_u_1 = src_u + src_stride_u; - const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values - "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values - "ld1 {v2.16b}, [%1], #16 \n" - "ld1 {v3.16b}, [%3], #16 \n" - "uaddlp v0.8h, v0.16b \n" // half size - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "uadalp v0.8h, v2.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v3.16b \n" - "prfm pldl1keep, [%3, 448] \n" - "uqrshrn v0.8b, v0.8h, #2 \n" - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w5, %w5, #16 \n" // 16 src pixels per loop - "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels - "b.gt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_u_1), // %1 - "+r"(src_v), // %2 - "+r"(src_v_1), // %3 - "+r"(dst_uv), // %4 - "+r"(width) // %5 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void SplitUVRow_16_NEON(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width) { - int shift = depth - 16; // Negative for right shift. 
- asm volatile( - "dup v2.8h, %w4 \n" - "1: \n" - "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV - "subs %w3, %w3, #8 \n" // 8 src pixels per loop - "ushl v0.8h, v0.8h, v2.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v2.8h \n" - "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels - "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "v0", "v1", "v2"); -} - -void MergeUVRow_16_NEON(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width) { - int shift = 16 - depth; - asm volatile( - "dup v2.8h, %w4 \n" - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // load 8 U - "subs %w3, %w3, #8 \n" // 8 src pixels per loop - "ld1 {v1.8h}, [%1], #16 \n" // load 8 V - "ushl v0.8h, v0.8h, v2.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v2.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels - "b.gt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "v0", "v1", "v2"); -} - -void MultiplyRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile( - "dup v2.8h, %w2 \n" - "1: \n" - "ldp q0, q1, [%0], #32 \n" - "mul v0.8h, v0.8h, v2.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "mul v1.8h, v1.8h, v2.8h \n" - "stp q0, q1, [%1] \n" // store 16 pixels - "add %1, %1, #32 \n" - "subs %w3, %w3, #16 \n" // 16 src pixels per loop - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(scale), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2"); -} - -void DivideRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile( - "dup v0.8h, %w2 \n" - "1: \n" - "ldp q1, q2, [%0], #32 \n" - "ushll v3.4s, v1.4h, #0 \n" - "ushll v4.4s, v2.4h, #0 \n" - "prfm pldl1keep, [%0, 448] \n" - "ushll2 v1.4s, v1.8h, #0 \n" - "ushll2 v2.4s, v2.8h, #0 \n" - "mul v3.4s, v0.4s, v3.4s \n" - "mul v4.4s, v0.4s, v4.4s \n" - "mul v1.4s, v0.4s, v1.4s \n" - "mul v2.4s, v0.4s, v2.4s \n" - "shrn v3.4h, v3.4s, #16 \n" - "shrn v4.4h, v4.4s, #16 \n" - "shrn2 v3.8h, v1.4s, #16 \n" - "shrn2 v4.8h, v2.4s, #16 \n" - "stp q3, q3, [%1] \n" // store 16 pixels - "add %1, %1, #32 \n" - "subs %w3, %w3, #16 \n" // 16 src pixels per loop - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(scale), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_win.cc b/thirdparty/libyuv/source/row_win.cc deleted file mode 100644 index 2c3241c..0000000 --- a/thirdparty/libyuv/source/row_win.cc +++ /dev/null @@ -1,6404 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/row.h" - -// This module is for Visual C 32/64 bit -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) - -#if defined(_M_X64) -#include -#include // For _mm_maddubs_epi16 -#endif - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// 64 bit -#if defined(_M_X64) - -// Read 8 UV from 444 -#define READYUV444 \ - xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ - xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - u_buf += 8; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; - -// Read 8 UV from 444, With 8 Alpha. -#define READYUVA444 \ - xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ - xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - u_buf += 8; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; - -// Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; - -// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; - -// Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(yuvconstants) \ - xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8(0x80)); \ - xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ - xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \ - xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \ - xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \ - xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \ - xmm0 = _mm_adds_epi16(xmm4, xmm0); \ - xmm1 = _mm_subs_epi16(xmm4, xmm1); \ - xmm2 = _mm_adds_epi16(xmm4, xmm2); \ - xmm0 = _mm_srai_epi16(xmm0, 6); \ - xmm1 = _mm_srai_epi16(xmm1, 6); \ - xmm2 = _mm_srai_epi16(xmm2, 6); \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ - xmm2 = _mm_packus_epi16(xmm2, xmm2); - -// Store 8 ARGB values. 
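For orientation, YUVTORGB above is a fixed-point form of the usual YUV-to-RGB conversion, with the coefficients taken from the caller-supplied yuvconstants tables (kYToRgb, kYBiasToRgb, kUVToB/G/R); STOREARGB below then only interleaves the B, G, R and alpha bytes. This float sketch uses illustrative BT.601 limited-range constants, so treat it as a guide rather than a bit-exact restatement.

#include <stdint.h>

static uint8_t Clamp255f(float v) {
  return v < 0.f ? 0 : v > 255.f ? 255 : (uint8_t)(v + 0.5f);
}

static void YuvPixelToArgb_sketch(uint8_t y, uint8_t u, uint8_t v,
                                  uint8_t* b, uint8_t* g, uint8_t* r) {
  float yf = 1.164f * (y - 16);  // expand studio-swing luma
  *b = Clamp255f(yf + 2.018f * (u - 128));
  *g = Clamp255f(yf - 0.391f * (u - 128) - 0.813f * (v - 128));
  *r = Clamp255f(yf + 1.596f * (v - 128));
}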
-#define STOREARGB \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ - _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ - _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ - dst_argb += 32; - -#if defined(HAS_I422TOARGBROW_SSSE3) -void I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUV422 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) -void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUVA422 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I444TOARGBROW_SSSE3) -void I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUV444 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) -void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUVA444 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -// 32 bit -#else // defined(_M_X64) -#ifdef HAS_ARGBTOYROW_SSSE3 - -// Constants for ARGB. -static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, - 13, 65, 33, 0, 13, 65, 33, 0}; - -// JPeg full range. -static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, - 15, 75, 38, 0, 15, 75, 38, 0}; - -static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0}; - -static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, - 127, -84, -43, 0, 127, -84, -43, 0}; - -static const vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -}; - -static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, - -20, -107, 127, 0, -20, -107, 127, 0}; - -// vpshufb for vphaddw + vpackuswb packed to shorts. -static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; - -// Constants for BGRA. -static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, - 0, 33, 65, 13, 0, 33, 65, 13}; - -static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, - 0, -38, -74, 112, 0, -38, -74, 112}; - -static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, - 0, 112, -94, -18, 0, 112, -94, -18}; - -// Constants for ABGR. 
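The kARGBToY/U/V tables above encode the usual BT.601 studio-range weights, and the BGRA/ABGR/RGBA tables that follow are the same weights permuted for each byte order. A scalar sketch of the intended formulas (rounding details differ slightly between the C, SSE and NEON paths, so this is indicative rather than bit-exact; function names are illustrative):

#include <stdint.h>

static uint8_t RGBToY_sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);  // 16..235
}

static uint8_t RGBToU_sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);  // biased by 128
}

static uint8_t RGBToV_sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);  // biased by 128
}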
-static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, - 33, 65, 13, 0, 33, 65, 13, 0}; - -static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, - -38, -74, 112, 0, -38, -74, 112, 0}; - -static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, - 112, -94, -18, 0, 112, -94, -18, 0}; - -// Constants for RGBA. -static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, - 0, 13, 65, 33, 0, 13, 65, 33}; - -static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, - 0, 112, -74, -38, 0, 112, -74, -38}; - -static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, - 0, -18, -94, 112, 0, -18, -94, 112}; - -static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; - -// 7 bit fixed point 0.5. -static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; - -// 8 bit fixed point 0.5, for bias of UV. -static const ulvec8 kBiasUV128 = { - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Shuffle table for converting RGB24 to ARGB. -static const uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; - -// Shuffle table for converting RAW to ARGB. -static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, - 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; - -// Shuffle table for converting RAW to RGB24. First 8. -static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Middle 8. -static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Last 8. -static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RGB24. -static const uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RAW. -static const uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -static const uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; - -// YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, - 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, - 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - -// YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, - 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, - 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; - -// UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, - 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, - 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; - -// UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, - 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, - 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; - -// NV21 shuf 8 VU to 16 UV. 
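The kShuffleMask* and kShuffle* tables above are byte selectors for pshufb (and its per-128-bit-lane AVX2 form); the kShuffleNV21 table below follows the same convention. A scalar emulation of one 16-byte lane shows how to read them: each output byte copies src[mask & 15], or is zeroed when the high bit of the mask byte is set (the 128u entries).

#include <stdint.h>

static void Shuffle16_sketch(const uint8_t* src, const uint8_t* mask,
                             uint8_t* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}

// For example, applying kShuffleMaskARGBToRGB24 to 4 ARGB pixels packs the
// first 12 output bytes as B,G,R triplets and zeroes the remaining 4 bytes.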
-static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, -}; - -// Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - - convertloop: - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 - punpckhwd xmm1, xmm1 - por xmm0, xmm5 - por xmm1, xmm5 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} - -#ifdef HAS_J400TOARGBROW_AVX2 -// Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 - vpslld ymm5, ymm5, 24 - - convertloop: - vmovdqu xmm0, [eax] - lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 - vpunpcklbw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - vpunpckhwd ymm1, ymm0, ymm0 - vpunpcklwd ymm0, ymm0, ymm0 - vpor ymm0, ymm0, ymm5 - vpor ymm1, ymm1, ymm5 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_J400TOARGBROW_AVX2 - -__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_rgb24 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm4 - por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm4 - movdqu [edx + 32], xmm2 - por xmm0, xmm5 - pshufb xmm1, xmm4 - movdqu [edx], xmm0 - por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm4 - movdqu [edx + 16], xmm1 - por xmm3, xmm5 - movdqu [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm4 - por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm4 - movdqu [edx + 32], xmm2 - por xmm0, xmm5 - pshufb xmm1, xmm4 - movdqu [edx], xmm0 - por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm4 - movdqu [edx + 16], xmm1 - por xmm3, xmm5 - movdqu [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, - uint8_t* dst_rgb24, - int 
width) { - __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_rgb24 - mov ecx, [esp + 12] // width - movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 - movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 - movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 4] - movdqu xmm2, [eax + 8] - lea eax, [eax + 24] - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 8 - jg convertloop - ret - } -} - -// pmul method to replicate bits. -// Math to replicate bits: -// (v << 8) | (v << 3) -// v * 256 + v * 8 -// v * (256 + 8) -// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -// 20 instructions. -__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - movd xmm5, eax - pshufd xmm5, xmm5, 0 - mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits - movd xmm6, eax - pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red - psllw xmm3, 11 - pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green - psllw xmm4, 10 - psrlw xmm4, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha - psllw xmm7, 8 - - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgr565 - movdqa xmm1, xmm0 - movdqa xmm2, xmm0 - pand xmm1, xmm3 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pmulhuw xmm1, xmm5 // * (256 + 8) - pmulhuw xmm2, xmm5 // * (256 + 8) - psllw xmm1, 8 - por xmm1, xmm2 // RB - pand xmm0, xmm4 // G in middle 6 bits - pmulhuw xmm0, xmm6 // << 5 * (256 + 4) - por xmm0, xmm7 // AG - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -#ifdef HAS_RGB565TOARGBROW_AVX2 -// pmul method to replicate bits. 
-// Math to replicate bits: -// (v << 8) | (v << 3) -// v * 256 + v * 8 -// v * (256 + 8) -// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - vmovd xmm5, eax - vbroadcastss ymm5, xmm5 - mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red - vpsllw ymm3, ymm3, 11 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green - vpsllw ymm4, ymm4, 10 - vpsrlw ymm4, ymm4, 5 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha - vpsllw ymm7, ymm7, 8 - - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 - vpand ymm1, ymm0, ymm3 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpand ymm0, ymm0, ymm4 // G in middle 6 bits - vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) - vpor ymm0, ymm0, ymm7 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm1, ymm1, 0xd8 - vpunpckhbw ymm2, ymm1, ymm0 - vpunpcklbw ymm1, ymm1, ymm0 - vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_RGB565TOARGBROW_AVX2 - -#ifdef HAS_ARGB1555TOARGBROW_AVX2 -__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - vmovd xmm5, eax - vbroadcastss ymm5, xmm5 - mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red - vpsllw ymm3, ymm3, 11 - vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha - vpsllw ymm7, ymm7, 8 - - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 - vpsllw ymm1, ymm0, 1 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpand ymm1, ymm1, ymm3 - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpsraw ymm2, ymm0, 8 // A - vpand ymm0, ymm0, ymm4 // G in middle 5 bits - vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) - vpand ymm2, ymm2, ymm7 - vpor ymm0, ymm0, ymm2 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm1, ymm1, 0xd8 - vpunpckhbw ymm2, ymm1, ymm0 - vpunpcklbw ymm1, ymm1, ymm0 - vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGB1555TOARGBROW_AVX2 - -#ifdef HAS_ARGB4444TOARGBROW_AVX2 -__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f - vmovd xmm4, 
eax - vbroadcastss ymm4, xmm4 - vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 - vpand ymm2, ymm0, ymm5 // mask high nibbles - vpand ymm0, ymm0, ymm4 // mask low nibbles - vpsrlw ymm3, ymm2, 4 - vpsllw ymm1, ymm0, 4 - vpor ymm2, ymm2, ymm3 - vpor ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm2, ymm2, 0xd8 - vpunpckhbw ymm1, ymm0, ymm2 - vpunpcklbw ymm0, ymm0, ymm2 - vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGB4444TOARGBROW_AVX2 - -// 24 instructions -__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - movd xmm5, eax - pshufd xmm5, xmm5, 0 - mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits - movd xmm6, eax - pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red - psllw xmm3, 11 - movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green - psrlw xmm4, 6 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha - psllw xmm7, 8 - - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of 1555 - movdqa xmm1, xmm0 - movdqa xmm2, xmm0 - psllw xmm1, 1 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pand xmm1, xmm3 - pmulhuw xmm2, xmm5 // * (256 + 8) - pmulhuw xmm1, xmm5 // * (256 + 8) - psllw xmm1, 8 - por xmm1, xmm2 // RB - movdqa xmm2, xmm0 - pand xmm0, xmm4 // G in middle 5 bits - psraw xmm2, 8 // A - pmulhuw xmm0, xmm6 // << 6 * (256 + 8) - pand xmm2, xmm7 - por xmm0, xmm2 // AG - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -// 18 instructions. 
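For reference, the per-pixel arithmetic the ARGB1555 unpackers above implement, written out in scalar C: the single alpha bit becomes 0x00 or 0xff (the kernels obtain this with an arithmetic right shift that smears the sign bit), and each 5-bit channel is widened by bit replication. The helper name is hypothetical and only mirrors the kernels' math:

#include <stdint.h>

/* Hypothetical scalar mirror of ARGB1555ToARGBRow; not a libyuv entry point. */
static void Argb1555ToArgbRow_sketch(const uint8_t* src_argb1555,
                                     uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t p = (uint16_t)(src_argb1555[0] | (src_argb1555[1] << 8));
    uint8_t b = (uint8_t)(p & 0x1f);
    uint8_t g = (uint8_t)((p >> 5) & 0x1f);
    uint8_t r = (uint8_t)((p >> 10) & 0x1f);
    dst_argb[0] = (uint8_t)((b << 3) | (b >> 2));  /* replicate 5 -> 8 bits */
    dst_argb[1] = (uint8_t)((g << 3) | (g >> 2));
    dst_argb[2] = (uint8_t)((r << 3) | (r >> 2));
    dst_argb[3] = (p & 0x8000) ? 0xff : 0x00;      /* 1-bit alpha */
    src_argb1555 += 2;
    dst_argb += 4;
  }
}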
-__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f - movd xmm4, eax - pshufd xmm4, xmm4, 0 - movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles - pslld xmm5, 4 - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 - movdqa xmm2, xmm0 - pand xmm0, xmm4 // mask low nibbles - pand xmm2, xmm5 // mask high nibbles - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - psllw xmm1, 4 - psrlw xmm3, 4 - por xmm0, xmm1 - por xmm2, xmm3 - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 - - convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB - pshufb xmm1, xmm6 - pshufb xmm2, xmm6 - pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 - lea edx, [edx + 48] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW - - convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB - pshufb xmm1, xmm6 - pshufb xmm2, xmm6 - pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 - lea edx, [edx + 48] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f - psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 - psrld xmm4, 26 - pslld xmm4, 
5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 - pslld xmm5, 11 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - __asm { - - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - movd xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - punpcklbw xmm6, xmm6 // make dither 16 bytes - movdqa xmm7, xmm6 - punpcklwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f - psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 - psrld xmm4, 26 - pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 - pslld xmm5, 11 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - paddusb xmm0, xmm6 // add dither - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - vbroadcastss xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes - vpermq ymm6, ymm6, 0xd8 - vpunpcklwd ymm6, ymm6, ymm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f - vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 - vpsrld ymm4, ymm4, 26 - vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpaddusb ymm0, ymm0, ymm6 // add dither - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR - vpackusdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTORGB565DITHERROW_AVX2 - -// TODO(fbarchard): Improve sign extension/packing. 
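To make the dithered store concrete: ARGBToRGB565DitherRow broadcasts the four dither4 bytes so that pixel x within each group of four gets dither byte (x & 3) added, with unsigned saturation, to its B, G and R before the usual 5-6-5 truncation. A scalar sketch of that behavior, assuming the same repeat-every-4-pixels pattern (the helper names are made up for illustration):

#include <stdint.h>

static uint8_t AddSat8(uint8_t a, uint8_t b) {
  unsigned s = (unsigned)a + b;
  return (uint8_t)(s > 255 ? 255 : s);  /* mirrors paddusb saturation */
}

/* Hypothetical scalar mirror of ARGBToRGB565DitherRow. */
static void ArgbToRgb565DitherRow_sketch(const uint8_t* src_argb,
                                         uint8_t* dst_rgb565,
                                         uint32_t dither4, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t d = (uint8_t)(dither4 >> ((x & 3) * 8));  /* one dither byte per pixel */
    uint8_t b = AddSat8(src_argb[0], d);
    uint8_t g = AddSat8(src_argb[1], d);
    uint8_t r = AddSat8(src_argb[2], d);
    uint16_t pix = (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    dst_rgb565[0] = (uint8_t)pix;
    dst_rgb565[1] = (uint8_t)(pix >> 8);
    src_argb += 4;
    dst_rgb565 += 2;
  }
}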
-__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0x0000001f - psrld xmm4, 27 - movdqa xmm5, xmm4 // generate mask 0x000003e0 - pslld xmm5, 5 - movdqa xmm6, xmm4 // generate mask 0x00007c00 - pslld xmm6, 10 - pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 - pslld xmm7, 15 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - movdqa xmm3, xmm0 // R - psrad xmm0, 16 // A - psrld xmm1, 3 // B - psrld xmm2, 6 // G - psrld xmm3, 9 // R - pand xmm0, xmm7 // A - pand xmm1, xmm4 // B - pand xmm2, xmm5 // G - pand xmm3, xmm6 // R - por xmm0, xmm1 // BA - por xmm2, xmm3 // GR - por xmm0, xmm2 // BGRA - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 - psllw xmm4, 12 - movdqa xmm3, xmm4 // generate mask 0x00f000f0 - psrlw xmm3, 8 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 - pand xmm0, xmm3 // low nibble - pand xmm1, xmm4 // high nibble - psrld xmm0, 4 - psrld xmm1, 8 - por xmm0, xmm1 - packuswb xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTORGB565ROW_AVX2 -__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f - vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 - vpsrld ymm4, ymm4, 26 - vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR - vpackusdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTORGB565ROW_AVX2 - -#ifdef HAS_ARGBTOARGB1555ROW_AVX2 -__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 - vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f - vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 - vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 - vpslld ymm7, ymm7, 15 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm3, ymm0, 9 // R - vpsrld ymm2, ymm0, 6 // G - vpsrld ymm1, ymm0, 3 // B - vpsrad ymm0, ymm0, 16 // A - vpand ymm3, ymm3, ymm6 // R - vpand ymm2, ymm2, ymm5 // G - vpand ymm1, ymm1, ymm4 // B - vpand ymm0, 
ymm0, ymm7 // A - vpor ymm0, ymm0, ymm1 // BA - vpor ymm2, ymm2, ymm3 // GR - vpor ymm0, ymm0, ymm2 // BGRA - vpackssdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOARGB1555ROW_AVX2 - -#ifdef HAS_ARGBTOARGB4444ROW_AVX2 -__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 - vpsllw ymm4, ymm4, 12 - vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpand ymm1, ymm0, ymm4 // high nibble - vpand ymm0, ymm0, ymm3 // low nibble - vpsrld ymm1, ymm1, 8 - vpsrld ymm0, ymm0, 4 - vpor ymm0, ymm0, ymm1 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOARGB4444ROW_AVX2 - -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. -__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. -// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToYJ - movdqa xmm5, xmmword ptr kAddYJ64 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - paddw xmm0, xmm5 // Add .5 for rounding. - paddw xmm2, xmm5 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTOYROW_AVX2 -// vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; - -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
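The ARGBToY kernels above compute a studio-swing luma as a 7-bit fixed-point dot product over B, G, R followed by a +16 bias (the YJ variants use different coefficients, drop the bias, and round instead). A scalar sketch of the same structure; the coefficient values shown are the usual BT.601 ones and are an assumption here, since kARGBToY itself is defined elsewhere in the file:

#include <stdint.h>

/* Hypothetical scalar mirror of ARGBToYRow; coefficients are assumed. */
static void ArgbToYRow_sketch(const uint8_t* src_argb, uint8_t* dst_y,
                              int width) {
  const int cb = 13, cg = 65, cr = 33;  /* ~0.098 B, 0.504 G, 0.257 R in Q7 */
  for (int x = 0; x < width; ++x) {
    int y = (cb * src_argb[0] + cg * src_argb[1] + cr * src_argb[2]) >> 7;
    dst_y[x] = (uint8_t)(y + 16);  /* +16 mirrors the kAddY16 bias */
    src_argb += 4;
  }
}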
-__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - vbroadcastf128 ymm4, xmmword ptr kARGBToY - vbroadcastf128 ymm5, xmmword ptr kAddY16 - vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpmaddubsw ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - lea eax, [eax + 128] - vphaddw ymm0, ymm0, ymm1 // mutates. - vphaddw ymm2, ymm2, ymm3 - vpsrlw ymm0, ymm0, 7 - vpsrlw ymm2, ymm2, 7 - vpackuswb ymm0, ymm0, ymm2 // mutates. - vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. - vpaddb ymm0, ymm0, ymm5 // add 16 for Y - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOYROW_AVX2 - -#ifdef HAS_ARGBTOYJROW_AVX2 -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. -__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - vbroadcastf128 ymm4, xmmword ptr kARGBToYJ - vbroadcastf128 ymm5, xmmword ptr kAddYJ64 - vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpmaddubsw ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - lea eax, [eax + 128] - vphaddw ymm0, ymm0, ymm1 // mutates. - vphaddw ymm2, ymm2, ymm3 - vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. - vpaddw ymm2, ymm2, ymm5 - vpsrlw ymm0, ymm0, 7 - vpsrlw ymm2, ymm2, 7 - vpackuswb ymm0, ymm0, ymm2 // mutates. - vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 
- vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBTOYJROW_AVX2 - -__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kBGRAToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kABGRToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kRGBAToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToV - movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 
- pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToVJ - movdqa xmm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - paddw xmm0, xmm5 // +.5 rounding -> unsigned - paddw xmm1, xmm5 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -#ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kBiasUV128 - vbroadcastf128 ymm6, xmmword ptr kARGBToV - vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - vpavgb ymm2, ymm2, [eax + esi + 64] - vpavgb ymm3, ymm3, [eax + esi + 96] - lea eax, [eax + 128] - vshufps ymm4, ymm0, ymm1, 0x88 - vshufps ymm0, ymm0, ymm1, 0xdd - vpavgb ymm0, ymm0, ymm4 // mutated by vshufps - vshufps ymm4, ymm2, ymm3, 0x88 - vshufps ymm2, ymm2, ymm3, 0xdd - vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V - vpmaddubsw ymm1, ymm0, ymm7 // U - vpmaddubsw ymm3, ymm2, ymm7 - vpmaddubsw ymm0, 
ymm0, ymm6 // V - vpmaddubsw ymm2, ymm2, ymm6 - vphaddw ymm1, ymm1, ymm3 // mutates - vphaddw ymm0, ymm0, ymm2 - vpsraw ymm1, ymm1, 8 - vpsraw ymm0, ymm0, 8 - vpacksswb ymm0, ymm1, ymm0 // mutates - vpermq ymm0, ymm0, 0xd8 // For vpacksswb - vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - vpaddb ymm0, ymm0, ymm5 // -> unsigned - - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBTOUVROW_AVX2 - -#ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kBiasUV128 - vbroadcastf128 ymm6, xmmword ptr kARGBToVJ - vbroadcastf128 ymm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - vpavgb ymm2, ymm2, [eax + esi + 64] - vpavgb ymm3, ymm3, [eax + esi + 96] - lea eax, [eax + 128] - vshufps ymm4, ymm0, ymm1, 0x88 - vshufps ymm0, ymm0, ymm1, 0xdd - vpavgb ymm0, ymm0, ymm4 // mutated by vshufps - vshufps ymm4, ymm2, ymm3, 0x88 - vshufps ymm2, ymm2, ymm3, 0xdd - vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V - vpmaddubsw ymm1, ymm0, ymm7 // U - vpmaddubsw ymm3, ymm2, ymm7 - vpmaddubsw ymm0, ymm0, ymm6 // V - vpmaddubsw ymm2, ymm2, ymm6 - vphaddw ymm1, ymm1, ymm3 // mutates - vphaddw ymm0, ymm0, ymm2 - vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned - vpaddw ymm0, ymm0, ymm5 - vpsraw ymm1, ymm1, 8 - vpsraw ymm0, ymm0, 8 - vpacksswb ymm0, ymm1, ymm0 // mutates - vpermq ymm0, ymm0, 0xd8 // For vpacksswb - vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBTOUVJROW_AVX2 - -__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToV - movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* convert to U and V */ - movdqu xmm0, [eax] // U - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - - movdqu xmm0, [eax] // V - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw 
xmm0, xmm6 - pmaddubsw xmm1, xmm6 - pmaddubsw xmm2, xmm6 - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - lea eax, [eax + 64] - movdqu [edx + edi], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kBGRAToV - movdqa xmm7, xmmword ptr kBGRAToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kABGRToV - movdqa xmm7, xmmword ptr kABGRToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - 
store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kRGBAToV - movdqa xmm7, xmmword ptr kRGBAToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBTOYROW_SSSE3 - -// Read 16 UV from 444 -#define READYUV444_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 16 UV from 444. With 16 Alpha. -#define READYUVA444_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ - __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16]} - -// Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 \ - __asm { \ - __asm vmovq xmm3, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 
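Step 1 of the ARGBToUV kernels above reduces a 16x2 block of ARGB pixels to 8x1 by averaging each 2x2 quad (one pavgb vertically, then shufps plus pavgb horizontally) before the U/V dot products run. A scalar sketch of one such 2x2 average; rounding differs slightly from the two-stage pavgb form, and the helper is illustrative only:

#include <stdint.h>

/* Average a 2x2 block of BGRA pixels (two rows, two columns) per channel. */
static void AverageArgb2x2_sketch(const uint8_t* row0, const uint8_t* row1,
                                  uint8_t avg_bgra[4]) {
  for (int c = 0; c < 4; ++c) {
    avg_bgra[c] =
        (uint8_t)((row0[c] + row0[4 + c] + row1[c] + row1[4 + c] + 2) >> 2);
  }
}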
-#define READYUVA422_AVX2 \ - __asm { \ - __asm vmovq xmm3, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ - __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16]} - -// Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* UV */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 UV from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* UV */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* YUY2 */ \ - __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ - __asm vmovdqu ymm3, [eax] /* UV */ \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 32]} - -// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* UYVY */ \ - __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ - __asm vmovdqu ymm3, [eax] /* UV */ \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 32]} - -// Convert 16 pixels: 16 UV and 16 Y. -#define YUVTORGB_AVX2(YuvConstants) \ - __asm { \ - __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \ - __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ - __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \ - __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \ - __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \ - __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \ - __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \ - __asm vpmaddubsw ymm2, ymm2, ymm3 /* B UV */ \ - __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \ - __asm vpaddw ymm4, ymm3, ymm4 \ - __asm vpaddsw ymm0, ymm0, ymm4 \ - __asm vpsubsw ymm1, ymm4, ymm1 \ - __asm vpaddsw ymm2, ymm2, ymm4 \ - __asm vpsraw ymm0, ymm0, 6 \ - __asm vpsraw ymm1, ymm1, 6 \ - __asm vpsraw ymm2, ymm2, 6 \ - __asm vpackuswb ymm0, ymm0, ymm0 \ - __asm vpackuswb ymm1, ymm1, ymm1 \ - __asm vpackuswb ymm2, ymm2, ymm2 \ - } - -// Store 16 ARGB values. -#define STOREARGB_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ - __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ - __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ - __asm vmovdqu 0[edx], ymm1 \ - __asm vmovdqu 32[edx], ymm0 \ - __asm lea edx, [edx + 64]} - -// Store 16 RGBA values. 
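The YUVTORGB macros above are a 6-bit fixed-point version of the familiar YUV-to-RGB transform: UV is re-centered around zero (the kBiasUV128 subtraction), Y is scaled through pmulhuw, and three signed dot products produce B, G and R before the shift by 6 and unsigned saturation. A floating-point scalar reference of what one pixel works out to, assuming BT.601 studio-swing constants (the exact contents of the YuvConstants tables are not shown here, so the numbers below are an approximation):

#include <stdint.h>

static uint8_t Clamp255(float x) {
  return (uint8_t)(x < 0.0f ? 0.0f : (x > 255.0f ? 255.0f : x));
}

/* Approximate scalar model of one YUVTORGB + STOREARGB pixel. */
static void YuvPixelToArgb_sketch(uint8_t y, uint8_t u, uint8_t v,
                                  uint8_t bgra[4]) {
  float yf = 1.164f * (float)(y - 16);  /* Y expanded from studio swing */
  float uf = (float)u - 128.0f;         /* UV re-centered, as kBiasUV128 does */
  float vf = (float)v - 128.0f;
  bgra[0] = Clamp255(yf + 2.018f * uf);                 /* B */
  bgra[1] = Clamp255(yf - 0.391f * uf - 0.813f * vf);   /* G */
  bgra[2] = Clamp255(yf + 1.596f * vf);                 /* R */
  bgra[3] = 0xff;                                       /* A */
}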
-#define STORERGBA_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ - __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ - __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ - __asm vmovdqu [edx], ymm0 \ - __asm vmovdqu [edx + 32], ymm1 \ - __asm lea edx, [edx + 64]} - -#ifdef HAS_I422TOARGBROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void I422ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV422_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422TOARGBROW_AVX2 - -#ifdef HAS_I422ALPHATOARGBROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -__declspec(naked) void I422AlphaToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA422_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422ALPHATOARGBROW_AVX2 - -#ifdef HAS_I444TOARGBROW_AVX2 -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void I444ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - convertloop: - READYUV444_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I444TOARGBROW_AVX2 - -#ifdef HAS_I444ALPHATOARGBROW_AVX2 -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
-__declspec(naked) void I444AlphaToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - convertloop: - READYUVA444_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I444AlphaTOARGBROW_AVX2 - -#ifdef HAS_NV12TOARGBROW_AVX2 -// 16 pixels. -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void NV12ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READNV12_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop esi - vzeroupper - ret - } -} -#endif // HAS_NV12TOARGBROW_AVX2 - -#ifdef HAS_NV21TOARGBROW_AVX2 -// 16 pixels. -// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void NV21ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READNV21_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop esi - vzeroupper - ret - } -} -#endif // HAS_NV21TOARGBROW_AVX2 - -#ifdef HAS_YUY2TOARGBROW_AVX2 -// 16 pixels. -// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) void YUY2ToARGBRow_AVX2( - const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUY2_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - vzeroupper - ret - } -} -#endif // HAS_YUY2TOARGBROW_AVX2 - -#ifdef HAS_UYVYTOARGBROW_AVX2 -// 16 pixels. -// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 
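The READYUY2/READUYVY shuffles used by the packed-format converters above are de-interleaving fixed byte layouts: for a pair of pixels, YUY2 stores Y0 U0 Y1 V0 and UYVY stores U0 Y0 V0 Y1. A trivial scalar extraction for the YUY2 case (hypothetical helper, shown only to make the layout explicit):

#include <stdint.h>

/* Split one YUY2 macropixel (2 pixels, 4 bytes) into its Y and shared UV. */
static void Yuy2PairToYuv_sketch(const uint8_t* yuy2, uint8_t y[2],
                                 uint8_t* u, uint8_t* v) {
  y[0] = yuy2[0];  /* Y0 */
  *u = yuy2[1];    /* U shared by both pixels */
  y[1] = yuy2[2];  /* Y1 */
  *v = yuy2[3];    /* V shared by both pixels */
}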
-__declspec(naked) void UYVYToARGBRow_AVX2( - const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READUYVY_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - vzeroupper - ret - } -} -#endif // HAS_UYVYTOARGBROW_AVX2 - -#ifdef HAS_I422TORGBAROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). -__declspec(naked) void I422ToRGBARow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // abgr - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV422_AVX2 - YUVTORGB_AVX2(ebx) - STORERGBA_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422TORGBAROW_AVX2 - -#if defined(HAS_I422TOARGBROW_SSSE3) -// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. -// Allows a conversion with half size scaling. - -// Read 8 UV from 444. -#define READYUV444 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* U */ \ - __asm movq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 UV from 444. With 8 Alpha. -#define READYUVA444 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* U */ \ - __asm movq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8]} - -// Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - __asm { \ - __asm movd xmm3, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - __asm { \ - __asm movd xmm3, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] /* Y */ \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8]} - -// Read 4 UV from NV12, upsample to 8 UV. -#define READNV12 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* UV */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 VU from NV21, upsample to 8 UV. 
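The kShuffleNV21 constant defined at the top of this section feeds the READNV21 paths: NV21 interleaves chroma as V,U rather than U,V, so the shuffle swaps each pair and duplicates it for the 2x horizontal upsample. In scalar terms (illustrative helper only, not libyuv code):

#include <stdint.h>

/* One NV21 chroma pair (V,U) expanded to two U,V pairs for two pixels. */
static void Nv21PairToUv_sketch(const uint8_t* vu, uint8_t uv_out[4]) {
  uv_out[0] = vu[1];  /* U */
  uv_out[1] = vu[0];  /* V */
  uv_out[2] = vu[1];  /* U duplicated: one chroma sample covers 2 pixels */
  uv_out[3] = vu[0];  /* V duplicated */
}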
-#define READNV21 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* UV */ \ - __asm lea esi, [esi + 8] \ - __asm pshufb xmm3, xmmword ptr kShuffleNV21 \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. -#define READYUY2 \ - __asm { \ - __asm movdqu xmm4, [eax] /* YUY2 */ \ - __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ - __asm movdqu xmm3, [eax] /* UV */ \ - __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 16]} - -// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. -#define READUYVY \ - __asm { \ - __asm movdqu xmm4, [eax] /* UYVY */ \ - __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ - __asm movdqu xmm3, [eax] /* UV */ \ - __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 16]} - -// Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(YuvConstants) \ - __asm { \ - __asm psubb xmm3, xmmword ptr kBiasUV128 \ - __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ - __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \ - __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \ - __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \ - __asm pmaddubsw xmm0, xmm3 \ - __asm pmaddubsw xmm1, xmm3 \ - __asm pmaddubsw xmm2, xmm3 \ - __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \ - __asm paddw xmm4, xmm3 \ - __asm paddsw xmm0, xmm4 \ - __asm paddsw xmm2, xmm4 \ - __asm psubsw xmm4, xmm1 \ - __asm movdqa xmm1, xmm4 \ - __asm psraw xmm0, 6 \ - __asm psraw xmm1, 6 \ - __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ - } - -// Store 8 ARGB values. -#define STOREARGB \ - __asm { \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm5 /* RA */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ - __asm movdqu 0[edx], xmm0 \ - __asm movdqu 16[edx], xmm1 \ - __asm lea edx, [edx + 32]} - -// Store 8 BGRA values. -#define STOREBGRA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm0 /* GB */ \ - __asm punpcklbw xmm5, xmm2 /* AR */ \ - __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ - __asm movdqu 0[edx], xmm5 \ - __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32]} - -// Store 8 RGBA values. -#define STORERGBA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm2 /* GR */ \ - __asm punpcklbw xmm5, xmm0 /* AB */ \ - __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ - __asm movdqu 0[edx], xmm5 \ - __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32]} - -// Store 8 RGB24 values. -#define STORERGB24 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. 
*/ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ - __asm lea edx, [edx + 24]} - -// Store 8 RGB565 values. -#define STORERGB565 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ - __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ - __asm movdqa xmm2, xmm0 /* G */ \ - __asm pslld xmm0, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm0, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm0, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm0, xmm3 /* BGR */ \ - __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ - __asm movdqa xmm2, xmm1 /* G */ \ - __asm pslld xmm1, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm1, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm1, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm1, xmm3 /* BGR */ \ - __asm packssdw xmm0, xmm1 \ - __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ - __asm lea edx, [edx + 16]} - -// 8 pixels. -// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void I444ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV444 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes). -__declspec(naked) void I444AlphaToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA444 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 
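STORERGB24 above is ultimately just dropping the alpha byte and tightly packing B,G,R triples; the shuffles and palignr exist only to emit 8 pixels per store efficiently. The scalar equivalent, with a hypothetical helper name:

#include <stdint.h>

/* Drop alpha from each BGRA pixel, producing packed 3-byte RGB24 pixels. */
static void ArgbToRgb24Row_sketch(const uint8_t* src_argb, uint8_t* dst_rgb24,
                                  int width) {
  for (int x = 0; x < width; ++x) {
    dst_rgb24[0] = src_argb[0];  /* B */
    dst_rgb24[1] = src_argb[1];  /* G */
    dst_rgb24[2] = src_argb[2];  /* R */
    src_argb += 4;
    dst_rgb24 += 3;
  }
}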
-__declspec(naked) void I422ToRGB24Row_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGB24 - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). -__declspec(naked) void I422ToRGB565Row_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* rgb565_buf, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate mask 0x0000001f - psrld xmm5, 27 - pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 - psrld xmm6, 26 - pslld xmm6, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 - pslld xmm7, 11 - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGB565 - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void I422ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV422 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. -__declspec(naked) void I422AlphaToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA422 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
-__declspec(naked) void NV12ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READNV12 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void NV21ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READNV21 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop esi - ret - } -} - -// 8 pixels. -// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). -__declspec(naked) void YUY2ToARGBRow_SSSE3( - const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUY2 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - ret - } -} - -// 8 pixels. -// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). -__declspec(naked) void UYVYToARGBRow_SSSE3( - const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READUYVY - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - ret - } -} - -__declspec(naked) void I422ToRGBARow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGBA - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} -#endif // HAS_I422TOARGBROW_SSSE3 - -// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter -#ifdef HAS_I400TOARGBROW_SSE2 -// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). 
-__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* rgb_buf, - const struct YuvConstants*, - int width) { - __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) - movd xmm2, eax - pshufd xmm2, xmm2,0 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) - movd xmm3, eax - pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width - - convertloop: - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 // Y.Y - pmulhuw xmm0, xmm2 - psubusw xmm0, xmm3 - psrlw xmm0, 6 - packuswb xmm0, xmm0 // G - - // Step 2: Weave into ARGB - punpcklbw xmm0, xmm0 // GG - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 // BGRA first 4 pixels - punpckhwd xmm1, xmm1 // BGRA next 4 pixels - por xmm0, xmm4 - por xmm1, xmm4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_I400TOARGBROW_SSE2 - -#ifdef HAS_I400TOARGBROW_AVX2 -// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). -// note: vpunpcklbw mutates and vpackuswb unmutates. -__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, - uint8_t* rgb_buf, - const struct YuvConstants*, - int width) { - __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) - vmovd xmm2, eax - vbroadcastss ymm2, xmm2 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) - vmovd xmm3, eax - vbroadcastss ymm3, xmm3 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 - vpslld ymm4, ymm4, 24 - - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width - - convertloop: - // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 - vmovdqu xmm0, [eax] - lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates - vpunpcklbw ymm0, ymm0, ymm0 // Y.Y - vpmulhuw ymm0, ymm0, ymm2 - vpsubusw ymm0, ymm0, ymm3 - vpsrlw ymm0, ymm0, 6 - vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 - - // TODO(fbarchard): Weave alpha with unpack. - // Step 2: Weave into ARGB - vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates - vpermq ymm1, ymm1, 0xd8 - vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels - vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels - vpor ymm0, ymm0, ymm4 - vpor ymm1, ymm1, ymm4 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_I400TOARGBROW_AVX2 - -#ifdef HAS_MIRRORROW_SSSE3 -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -// TODO(fbarchard): Replace lea with -16 offset. 
-__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - movdqa xmm5, xmmword ptr kShuffleMirror - - convertloop: - movdqu xmm0, [eax - 16 + ecx] - pshufb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} -#endif // HAS_MIRRORROW_SSSE3 - -#ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vbroadcastf128 ymm5, xmmword ptr kShuffleMirror - - convertloop: - vmovdqu ymm0, [eax - 32 + ecx] - vpshufb ymm0, ymm0, ymm5 - vpermq ymm0, ymm0, 0x4e // swap high and low halfs - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_MIRRORROW_AVX2 - -#ifdef HAS_MIRRORSPLITUVROW_SSSE3 -// Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; - -__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - movdqa xmm1, xmmword ptr kShuffleMirrorUV - lea eax, [eax + ecx * 2 - 16] - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - lea eax, [eax - 16] - pshufb xmm0, xmm1 - movlpd qword ptr [edx], xmm0 - movhpd qword ptr [edx + edi], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_MIRRORSPLITUVROW_SSSE3 - -#ifdef HAS_ARGBMIRRORROW_SSE2 -__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - lea eax, [eax - 16 + ecx * 4] // last 4 pixels. - - convertloop: - movdqu xmm0, [eax] - lea eax, [eax - 16] - pshufd xmm0, xmm0, 0x1b - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - ret - } -} -#endif // HAS_ARGBMIRRORROW_SSE2 - -#ifdef HAS_ARGBMIRRORROW_AVX2 -// Shuffle table for reversing the bytes. 
-static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 - - convertloop: - vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBMIRRORROW_AVX2 - -#ifdef HAS_SPLITUVROW_SSE2 -__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - pand xmm0, xmm5 // even bytes - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm2, 8 // odd bytes - psrlw xmm3, 8 - packuswb xmm2, xmm3 - movdqu [edx], xmm0 - movdqu [edx + edi], xmm2 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -#endif // HAS_SPLITUVROW_SSE2 - -#ifdef HAS_SPLITUVROW_AVX2 -__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm2, ymm0, 8 // odd bytes - vpsrlw ymm3, ymm1, 8 - vpand ymm0, ymm0, ymm5 // even bytes - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpackuswb ymm2, ymm2, ymm3 - vpermq ymm0, ymm0, 0xd8 - vpermq ymm2, ymm2, 0xd8 - vmovdqu [edx], ymm0 - vmovdqu [edx + edi], ymm2 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_SPLITUVROW_AVX2 - -#ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // read 16 U's - movdqu xmm1, [eax + edx] // and 16 V's - lea eax, [eax + 16] - movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 // first 8 UV pairs - punpckhbw xmm2, xmm1 // next 8 UV pairs - movdqu [edi], xmm0 - movdqu [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_MERGEUVROW_SSE2 - -#ifdef HAS_MERGEUVROW_AVX2 -__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // read 32 U's - vmovdqu ymm1, [eax + edx] // and 32 V's - lea eax, [eax + 32] - vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 - vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. 
mutated qqword 1,3 - vextractf128 [edi], ymm2, 0 // bytes 0..15 - vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 - vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 - vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 - lea edi, [edi + 64] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_MERGEUVROW_AVX2 - -#ifdef HAS_COPYROW_SSE2 -// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time. -__declspec(naked) void CopyRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - test eax, 15 - jne convertloopu - test edx, 15 - jne convertloopu - - convertloopa: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloopa - ret - - convertloopu: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloopu - ret - } -} -#endif // HAS_COPYROW_SSE2 - -#ifdef HAS_COPYROW_AVX -// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time. -__declspec(naked) void CopyRow_AVX(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 64 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_COPYROW_AVX - -// Multiple of 1. -__declspec(naked) void CopyRow_ERMS(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, esi - mov edx, edi - mov esi, [esp + 4] // src - mov edi, [esp + 8] // dst - mov ecx, [esp + 12] // width - rep movsb - mov edi, edx - mov esi, eax - ret - } -} - -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 - pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff - psrld xmm1, 8 - - convertloop: - movdqu xmm2, [eax] - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - movdqu xmm4, [edx] - movdqu xmm5, [edx + 16] - pand xmm2, xmm0 - pand xmm3, xmm0 - pand xmm4, xmm1 - pand xmm5, xmm1 - por xmm2, xmm4 - por xmm3, xmm5 - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - ret - } -} -#endif // HAS_ARGBCOPYALPHAROW_SSE2 - -#ifdef HAS_ARGBCOPYALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff - - convertloop: - vmovdqu ymm1, [eax] - vmovdqu ymm2, [eax + 32] - lea eax, [eax + 64] - vpblendvb ymm1, ymm1, [edx], ymm0 - vpblendvb ymm2, ymm2, [edx + 32], ymm0 - vmovdqu [edx], ymm1 - vmovdqu [edx + 32], ymm2 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBCOPYALPHAROW_AVX2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - __asm { - 
mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a - mov ecx, [esp + 12] // width - - extractloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrld xmm0, 24 - psrld xmm1, 24 - packssdw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg extractloop - - ret - } -} -#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a - mov ecx, [esp + 12] // width - vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX - - extractloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpsrld ymm0, ymm0, 24 - vpsrld ymm1, ymm1, 24 - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - lea eax, [eax + 128] - vpackssdw ymm0, ymm0, ymm1 // mutates - vpsrld ymm2, ymm2, 24 - vpsrld ymm3, ymm3, 24 - vpackssdw ymm2, ymm2, ymm3 // mutates - vpackuswb ymm0, ymm0, ymm2 // mutates - vpermd ymm0, ymm4, ymm0 // unmutate - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg extractloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 - -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 - pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff - psrld xmm1, 8 - - convertloop: - movq xmm2, qword ptr [eax] // 8 Y's - lea eax, [eax + 8] - punpcklbw xmm2, xmm2 - punpckhwd xmm3, xmm2 - punpcklwd xmm2, xmm2 - movdqu xmm4, [edx] - movdqu xmm5, [edx + 16] - pand xmm2, xmm0 - pand xmm3, xmm0 - pand xmm4, xmm1 - pand xmm5, xmm1 - por xmm2, xmm4 - por xmm3, xmm5 - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - ret - } -} -#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 - -#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff - - convertloop: - vpmovzxbd ymm1, qword ptr [eax] - vpmovzxbd ymm2, qword ptr [eax + 8] - lea eax, [eax + 16] - vpslld ymm1, ymm1, 24 - vpslld ymm2, ymm2, 24 - vpblendvb ymm1, ymm1, [edx], ymm0 - vpblendvb ymm2, ymm2, [edx + 32], ymm0 - vmovdqu [edx], ymm1 - vmovdqu [edx + 32], ymm2 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 - -#ifdef HAS_SETROW_X86 -// Write 'width' bytes using an 8 bit value repeated. -// width should be multiple of 4. -__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { - __asm { - movzx eax, byte ptr [esp + 8] // v8 - mov edx, 0x01010101 // Duplicate byte to all bytes. - mul edx // overwrites edx with upper part of result. - mov edx, edi - mov edi, [esp + 4] // dst - mov ecx, [esp + 12] // width - shr ecx, 2 - rep stosd - mov edi, edx - ret - } -} - -// Write 'width' bytes using an 8 bit value repeated. 
-__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { - __asm { - mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v8 - mov ecx, [esp + 12] // width - rep stosb - mov edi, edx - ret - } -} - -// Write 'width' 32 bit values. -__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, - uint32_t v32, - int width) { - __asm { - mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v32 - mov ecx, [esp + 12] // width - rep stosd - mov edi, edx - ret - } -} -#endif // HAS_SETROW_X86 - -#ifdef HAS_YUY2TOYROW_AVX2 -__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // even bytes are Y - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} - -__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. - vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} - -__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. 
- vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // odd bytes are Y - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. - vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. 
- vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_YUY2TOYROW_AVX2 - -#ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, 
xmm2 - pavgb xmm1, xmm3 - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_YUY2TOYROW_SSE2 - -#ifdef HAS_BLENDPLANEROW_SSSE3 -// Blend 8 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - __asm { - push esi - push edi - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - mov eax, 0x80808080 // 128 for biasing image to signed. - movd xmm6, eax - pshufd xmm6, xmm6, 0x00 - - mov eax, 0x807f807f // 32768 + 127 for unbias and round. - movd xmm7, eax - pshufd xmm7, xmm7, 0x00 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 - mov esi, [esp + 8 + 12] // alpha - mov edi, [esp + 8 + 16] // dst - mov ecx, [esp + 8 + 20] // width - sub eax, esi - sub edx, esi - sub edi, esi - - // 8 pixel loop. - convertloop8: - movq xmm0, qword ptr [esi] // alpha - punpcklbw xmm0, xmm0 - pxor xmm0, xmm5 // a, 255-a - movq xmm1, qword ptr [eax + esi] // src0 - movq xmm2, qword ptr [edx + esi] // src1 - punpcklbw xmm1, xmm2 - psubb xmm1, xmm6 // bias src0/1 - 128 - pmaddubsw xmm0, xmm1 - paddw xmm0, xmm7 // unbias result - 32768 and round. - psrlw xmm0, 8 - packuswb xmm0, xmm0 - movq qword ptr [edi + esi], xmm0 - lea esi, [esi + 8] - sub ecx, 8 - jg convertloop8 - - pop edi - pop esi - ret - } -} -#endif // HAS_BLENDPLANEROW_SSSE3 - -#ifdef HAS_BLENDPLANEROW_AVX2 -// Blend 32 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - __asm { - push esi - push edi - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 - vpsllw ymm5, ymm5, 8 - mov eax, 0x80808080 // 128 for biasing image to signed. - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - mov eax, 0x807f807f // 32768 + 127 for unbias and round. - vmovd xmm7, eax - vbroadcastss ymm7, xmm7 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 - mov esi, [esp + 8 + 12] // alpha - mov edi, [esp + 8 + 16] // dst - mov ecx, [esp + 8 + 20] // width - sub eax, esi - sub edx, esi - sub edi, esi - - // 32 pixel loop. 
- convertloop32: - vmovdqu ymm0, [esi] // alpha - vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 - vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 - vpxor ymm3, ymm3, ymm5 // a, 255-a - vpxor ymm0, ymm0, ymm5 // a, 255-a - vmovdqu ymm1, [eax + esi] // src0 - vmovdqu ymm2, [edx + esi] // src1 - vpunpckhbw ymm4, ymm1, ymm2 - vpunpcklbw ymm1, ymm1, ymm2 - vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 - vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 - vpmaddubsw ymm3, ymm3, ymm4 - vpmaddubsw ymm0, ymm0, ymm1 - vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. - vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. - vpsrlw ymm3, ymm3, 8 - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm3 - vmovdqu [edi + esi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg convertloop32 - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_BLENDPLANEROW_AVX2 - -#ifdef HAS_ARGBBLENDROW_SSSE3 -// Shuffle table for isolating alpha. -static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; - -// Blend 8 pixels at a time. -__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 0x0001 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - sub ecx, 4 - jl convertloop4b // less than 4 pixels? - - // 4 pixel loop. - convertloop4: - movdqu xmm3, [eax] // src argb - lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqu xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqu xmm1, [esi] // _a_g - lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jge convertloop4 - - convertloop4b: - add ecx, 4 - 1 - jl convertloop1b - - // 1 pixel loop. - convertloop1: - movd xmm3, [eax] // src argb - lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g - lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge convertloop1 - - convertloop1b: - pop esi - ret - } -} -#endif // HAS_ARGBBLENDROW_SSSE3 - -#ifdef HAS_ARGBATTENUATEROW_SSSE3 -// Shuffle table duplicating alpha. 
-static const uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, -}; -static const uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, -}; -__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0xff000000 - pslld xmm3, 24 - movdqa xmm4, xmmword ptr kShuffleAlpha0 - movdqa xmm5, xmmword ptr kShuffleAlpha1 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - pshufb xmm0, xmm4 // isolate first 2 alphas - movdqu xmm1, [eax] // read 4 pixels - punpcklbw xmm1, xmm1 // first 2 pixel rgbs - pmulhuw xmm0, xmm1 // rgb * a - movdqu xmm1, [eax] // read 4 pixels - pshufb xmm1, xmm5 // isolate next 2 alphas - movdqu xmm2, [eax] // read 4 pixels - punpckhbw xmm2, xmm2 // next 2 pixel rgbs - pmulhuw xmm1, xmm2 // rgb * a - movdqu xmm2, [eax] // mask original alpha - lea eax, [eax + 16] - pand xmm2, xmm3 - psrlw xmm0, 8 - psrlw xmm1, 8 - packuswb xmm0, xmm1 - por xmm0, xmm2 // copy original alpha - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - ret - } -} -#endif // HAS_ARGBATTENUATEROW_SSSE3 - -#ifdef HAS_ARGBATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, - 128u, 128u, 14u, 15u, 14u, 15u, - 14u, 15u, 128u, 128u}; -__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 - vpslld ymm5, ymm5, 24 - - convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpshufb ymm2, ymm0, ymm4 // low 4 alphas - vpshufb ymm3, ymm1, ymm4 // high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * a - vpmulhuw ymm1, ymm1, ymm3 // rgb * a - vpand ymm6, ymm6, ymm5 // isolate alpha - vpsrlw ymm0, ymm0, 8 - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutated. - vpor ymm0, ymm0, ymm6 // copy original alpha - vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBATTENUATEROW_AVX2 - -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -// Unattenuate 4 pixels at a time. -__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb - mov ecx, [esp + 12 + 12] // width - lea ebx, fixed_invtbl8 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - movzx esi, byte ptr [eax + 3] // first alpha - movzx edi, byte ptr [eax + 7] // second alpha - punpcklbw xmm0, xmm0 // first 2 - movd xmm2, dword ptr [ebx + esi * 4] - movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 
1, a, a, a - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words - movlhps xmm2, xmm3 - pmulhuw xmm0, xmm2 // rgb * a - - movdqu xmm1, [eax] // read 4 pixels - movzx esi, byte ptr [eax + 11] // third alpha - movzx edi, byte ptr [eax + 15] // forth alpha - punpckhbw xmm1, xmm1 // next 2 - movd xmm2, dword ptr [ebx + esi * 4] - movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words - movlhps xmm2, xmm3 - pmulhuw xmm1, xmm2 // rgb * a - lea eax, [eax + 16] - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop edi - pop esi - pop ebx - ret - } -} -#endif // HAS_ARGBUNATTENUATEROW_SSE2 - -#ifdef HAS_ARGBUNATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; -// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. -// USE_GATHER is not on by default, due to being a slow instruction. -#ifdef USE_GATHER -__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 - - convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. - vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a - vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a - vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. - vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a - vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * ia - vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. 
- vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - vzeroupper - ret - } -} -#else // USE_GATHER -__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb - mov ecx, [esp + 12 + 12] // width - sub edx, eax - lea ebx, fixed_invtbl8 - vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 - - convertloop: - // replace VPGATHER - movzx esi, byte ptr [eax + 3] // alpha0 - movzx edi, byte ptr [eax + 7] // alpha1 - vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] - vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] - movzx esi, byte ptr [eax + 11] // alpha2 - movzx edi, byte ptr [eax + 15] // alpha3 - vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] - vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] - vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] - movzx esi, byte ptr [eax + 19] // alpha4 - movzx edi, byte ptr [eax + 23] // alpha5 - vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] - vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] - vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] - movzx esi, byte ptr [eax + 27] // alpha6 - movzx edi, byte ptr [eax + 31] // alpha7 - vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] - vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] - vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] - vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] - vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] - vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] - vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] - // end of VPGATHER - - vmovdqu ymm6, [eax] // read 8 pixels. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a - vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. - vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a - vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * ia - vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. - vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - pop ebx - vzeroupper - ret - } -} -#endif // USE_GATHER -#endif // HAS_ARGBATTENUATEROW_AVX2 - -#ifdef HAS_ARGBGRAYROW_SSSE3 -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. -__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToYJ - movdqa xmm5, xmmword ptr kAddYJ64 - - convertloop: - movdqu xmm0, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - phaddw xmm0, xmm1 - paddw xmm0, xmm5 // Add .5 for rounding. 
- psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 G bytes - movdqu xmm2, [eax] // A - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - psrld xmm2, 24 - psrld xmm3, 24 - packuswb xmm2, xmm3 - packuswb xmm2, xmm2 // 8 A bytes - movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA - punpcklbw xmm0, xmm0 // 8 GG words - punpcklbw xmm3, xmm2 // 8 GA words - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm3 // GGGA first 4 - punpckhwd xmm1, xmm3 // GGGA next 4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBGRAYROW_SSSE3 - -#ifdef HAS_ARGBSEPIAROW_SSSE3 -// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 -// Constant for ARGB color to sepia tone. -static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, - 17, 68, 35, 0, 17, 68, 35, 0}; - -static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, - 22, 88, 45, 0, 22, 88, 45, 0}; - -static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, - 24, 98, 50, 0, 24, 98, 50, 0}; - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - __asm { - mov eax, [esp + 4] /* dst_argb */ - mov ecx, [esp + 8] /* width */ - movdqa xmm2, xmmword ptr kARGBToSepiaB - movdqa xmm3, xmmword ptr kARGBToSepiaG - movdqa xmm4, xmmword ptr kARGBToSepiaR - - convertloop: - movdqu xmm0, [eax] // B - movdqu xmm6, [eax + 16] - pmaddubsw xmm0, xmm2 - pmaddubsw xmm6, xmm2 - phaddw xmm0, xmm6 - psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 B values - movdqu xmm5, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm5, xmm3 - pmaddubsw xmm1, xmm3 - phaddw xmm5, xmm1 - psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values - movdqu xmm5, [eax] // R - movdqu xmm1, [eax + 16] - pmaddubsw xmm5, xmm4 - pmaddubsw xmm1, xmm4 - phaddw xmm5, xmm1 - psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values - movdqu xmm6, [eax] // A - movdqu xmm1, [eax + 16] - psrld xmm6, 24 - psrld xmm1, 24 - packuswb xmm6, xmm1 - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm5, xmm6 // 8 RA values - movdqa xmm1, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm5 // BGRA first 4 - punpckhwd xmm1, xmm5 // BGRA next 4 - movdqu [eax], xmm0 - movdqu [eax + 16], xmm1 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBSEPIAROW_SSSE3 - -#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// Same as Sepia except matrix is provided. -// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R -// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
-__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* matrix_argb */ - movdqu xmm5, [ecx] - pshufd xmm2, xmm5, 0x00 - pshufd xmm3, xmm5, 0x55 - pshufd xmm4, xmm5, 0xaa - pshufd xmm5, xmm5, 0xff - mov ecx, [esp + 16] /* width */ - - convertloop: - movdqu xmm0, [eax] // B - movdqu xmm7, [eax + 16] - pmaddubsw xmm0, xmm2 - pmaddubsw xmm7, xmm2 - movdqu xmm6, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm6, xmm3 - pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm7 // B - phaddsw xmm6, xmm1 // G - psraw xmm0, 6 // B - psraw xmm6, 6 // G - packuswb xmm0, xmm0 // 8 B values - packuswb xmm6, xmm6 // 8 G values - punpcklbw xmm0, xmm6 // 8 BG values - movdqu xmm1, [eax] // R - movdqu xmm7, [eax + 16] - pmaddubsw xmm1, xmm4 - pmaddubsw xmm7, xmm4 - phaddsw xmm1, xmm7 // R - movdqu xmm6, [eax] // A - movdqu xmm7, [eax + 16] - pmaddubsw xmm6, xmm5 - pmaddubsw xmm7, xmm5 - phaddsw xmm6, xmm7 // A - psraw xmm1, 6 // R - psraw xmm6, 6 // A - packuswb xmm1, xmm1 // 8 R values - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm1, xmm6 // 8 RA values - movdqa xmm6, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm1 // BGRA first 4 - punpckhwd xmm6, xmm1 // BGRA next 4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm6 - lea eax, [eax + 32] - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 - -#ifdef HAS_ARGBQUANTIZEROW_SSE2 -// Quantize 4 ARGB pixels (16 bytes). -__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - __asm { - mov eax, [esp + 4] /* dst_argb */ - movd xmm2, [esp + 8] /* scale */ - movd xmm3, [esp + 12] /* interval_size */ - movd xmm4, [esp + 16] /* interval_offset */ - mov ecx, [esp + 20] /* width */ - pshuflw xmm2, xmm2, 040h - pshufd xmm2, xmm2, 044h - pshuflw xmm3, xmm3, 040h - pshufd xmm3, xmm3, 044h - pshuflw xmm4, xmm4, 040h - pshufd xmm4, xmm4, 044h - pxor xmm5, xmm5 // constant 0 - pcmpeqb xmm6, xmm6 // generate mask 0xff000000 - pslld xmm6, 24 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm5 // first 2 pixels - pmulhuw xmm0, xmm2 // pixel * scale >> 16 - movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm5 // next 2 pixels - pmulhuw xmm1, xmm2 - pmullw xmm0, xmm3 // * interval_size - movdqu xmm7, [eax] // read 4 pixels - pmullw xmm1, xmm3 - pand xmm7, xmm6 // mask alpha - paddw xmm0, xmm4 // + interval_size / 2 - paddw xmm1, xmm4 - packuswb xmm0, xmm1 - por xmm0, xmm7 - movdqu [eax], xmm0 - lea eax, [eax + 16] - sub ecx, 4 - jg convertloop - ret - } -} -#endif // HAS_ARGBQUANTIZEROW_SSE2 - -#ifdef HAS_ARGBSHADEROW_SSE2 -// Shade 4 pixels at a time by specified value. 
-__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - movd xmm2, [esp + 16] // value - punpcklbw xmm2, xmm2 - punpcklqdq xmm2, xmm2 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm2 // argb * value - psrlw xmm0, 8 - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - ret - } -} -#endif // HAS_ARGBSHADEROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_SSE2 -// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - pxor xmm5, xmm5 // constant 0 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb - movdqu xmm2, [esi] // read 4 pixels from src_argb1 - movdqu xmm1, xmm0 - movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 - lea eax, [eax + 16] - lea esi, [esi + 16] - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_ARGBMULTIPLYROW_SSE2 - -#ifdef HAS_ARGBADDROW_SSE2 -// Add 2 rows of ARGB pixels together, 4 pixels at a time. -// TODO(fbarchard): Port this to posix, neon and other math functions. -__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - sub ecx, 4 - jl convertloop49 - - convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb - lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 - lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb + src_argb1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jge convertloop4 - - convertloop49: - add ecx, 4 - 1 - jl convertloop19 - - convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb - lea eax, [eax + 4] - movd xmm1, [esi] // read 1 pixels from src_argb1 - lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb + src_argb1 - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge convertloop1 - - convertloop19: - pop esi - ret - } -} -#endif // HAS_ARGBADDROW_SSE2 - -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb - lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 - lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb - src_argb1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_ARGBSUBTRACTROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_AVX2 -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 - - convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 - lea esi, [esi + 32] - vpunpcklbw ymm0, ymm1, ymm1 // low 4 - vpunpckhbw ymm1, ymm1, ymm1 // high 4 - vpunpcklbw ymm2, ymm3, ymm5 // low 4 - vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4 - vpackuswb ymm0, ymm0, ymm1 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBMULTIPLYROW_AVX2 - -#ifdef HAS_ARGBADDROW_AVX2 -// Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 - lea esi, [esi + 32] - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBADDROW_AVX2 - -#ifdef HAS_ARGBSUBTRACTROW_AVX2 -// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1 - lea esi, [esi + 32] - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBSUBTRACTROW_AVX2 - -#ifdef HAS_SOBELXROW_SSE2 -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y0 - mov esi, [esp + 8 + 8] // src_y1 - mov edi, [esp + 8 + 12] // src_y2 - mov edx, [esp + 8 + 16] // dst_sobelx - mov ecx, [esp + 8 + 20] // width - sub esi, eax - sub edi, eax - sub edx, eax - pxor xmm5, xmm5 // constant 0 - - convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] - movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] - movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - psubw xmm2, xmm3 - paddw xmm0, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw - psubw xmm1, xmm0 - pmaxsw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [eax + edx], xmm0 - lea eax, [eax + 8] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_SOBELXROW_SSE2 - -#ifdef HAS_SOBELYROW_SSE2 -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_y0 - mov esi, [esp + 4 + 8] // src_y1 - mov edx, [esp + 4 + 12] // dst_sobely - mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax - pxor xmm5, xmm5 // constant 0 - - convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] - movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] - movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - psubw xmm2, xmm3 - paddw xmm0, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw - psubw xmm1, xmm0 - pmaxsw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [eax + edx], xmm0 - lea eax, [eax + 8] - sub ecx, 8 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELYROW_SSE2 - -#ifdef HAS_SOBELROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - pslld xmm5, 24 // 0xff000000 - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqa xmm2, xmm0 // GG - punpcklbw xmm2, xmm0 // First 8 - punpckhbw xmm0, xmm0 // Next 8 - movdqa xmm1, xmm2 // GGGG - punpcklwd xmm1, xmm2 // First 4 - punpckhwd xmm2, xmm2 // Next 4 - por xmm1, xmm5 // GGGA - por xmm2, xmm5 - movdqa xmm3, xmm0 // GGGG - punpcklwd xmm3, xmm0 // Next 4 - punpckhwd xmm0, xmm0 // Last 4 - por xmm3, xmm5 // GGGA - por xmm0, xmm5 - movdqu [edx], xmm1 - movdqu [edx + 16], xmm2 - movdqu [edx + 32], xmm3 - movdqu [edx + 48], xmm0 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELROW_SSE2 - -#ifdef HAS_SOBELTOPLANEROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into a plane. -__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELTOPLANEROW_SSE2 - -#ifdef HAS_SOBELXYROW_SSE2 -// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
-// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - movdqa xmm2, xmm0 - paddusb xmm2, xmm1 // sobel = sobelx + sobely - movdqa xmm3, xmm0 // XA - punpcklbw xmm3, xmm5 - punpckhbw xmm0, xmm5 - movdqa xmm4, xmm1 // YS - punpcklbw xmm4, xmm2 - punpckhbw xmm1, xmm2 - movdqa xmm6, xmm4 // YSXA - punpcklwd xmm6, xmm3 // First 4 - punpckhwd xmm4, xmm3 // Next 4 - movdqa xmm7, xmm1 // YSXA - punpcklwd xmm7, xmm0 // Next 4 - punpckhwd xmm1, xmm0 // Last 4 - movdqu [edx], xmm6 - movdqu [edx + 16], xmm4 - movdqu [edx + 32], xmm7 - movdqu [edx + 48], xmm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELXYROW_SSE2 - -#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -// Consider float CumulativeSum. -// Consider calling CumulativeSum one row at time as needed. -// Consider circular CumulativeSum buffer of radius * 2 + 1 height. -// Convert cumulative sum for an area to an average for 1 pixel. -// topleft is pointer to top left of CumulativeSum buffer for area. -// botleft is pointer to bottom left of CumulativeSum buffer. -// width is offset from left to right of area in CumulativeSum buffer measured -// in number of ints. -// area is the number of pixels in the area being averaged. -// dst points to pixel to store result to. -// count is number of averaged pixels to produce. -// Does 4 pixels at a time. -// This function requires alignment on accumulation buffer pointers. -void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count) { - __asm { - mov eax, topleft // eax topleft - mov esi, botleft // esi botleft - mov edx, width - movd xmm5, area - mov edi, dst - mov ecx, count - cvtdq2ps xmm5, xmm5 - rcpss xmm4, xmm5 // 1.0f / area - pshufd xmm4, xmm4, 0 - sub ecx, 4 - jl l4b - - cmp area, 128 // 128 pixels will not overflow 15 bits. - ja l4 - - pshufd xmm5, xmm5, 0 // area - pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 - psrld xmm6, 16 - cvtdq2ps xmm6, xmm6 - addps xmm5, xmm6 // (65536.0 + area - 1) - mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area - cvtps2dq xmm5, xmm5 // 0.16 fixed point - packssdw xmm5, xmm5 // 16 bit shorts - - // 4 pixel loop small blocks. 
- s4: - // top left - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - - // - top right - psubd xmm0, [eax + edx * 4] - psubd xmm1, [eax + edx * 4 + 16] - psubd xmm2, [eax + edx * 4 + 32] - psubd xmm3, [eax + edx * 4 + 48] - lea eax, [eax + 64] - - // - bottom left - psubd xmm0, [esi] - psubd xmm1, [esi + 16] - psubd xmm2, [esi + 32] - psubd xmm3, [esi + 48] - - // + bottom right - paddd xmm0, [esi + edx * 4] - paddd xmm1, [esi + edx * 4 + 16] - paddd xmm2, [esi + edx * 4 + 32] - paddd xmm3, [esi + edx * 4 + 48] - lea esi, [esi + 64] - - packssdw xmm0, xmm1 // pack 4 pixels into 2 registers - packssdw xmm2, xmm3 - - pmulhuw xmm0, xmm5 - pmulhuw xmm2, xmm5 - - packuswb xmm0, xmm2 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 - jge s4 - - jmp l4b - - // 4 pixel loop - l4: - // top left - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - - // - top right - psubd xmm0, [eax + edx * 4] - psubd xmm1, [eax + edx * 4 + 16] - psubd xmm2, [eax + edx * 4 + 32] - psubd xmm3, [eax + edx * 4 + 48] - lea eax, [eax + 64] - - // - bottom left - psubd xmm0, [esi] - psubd xmm1, [esi + 16] - psubd xmm2, [esi + 32] - psubd xmm3, [esi + 48] - - // + bottom right - paddd xmm0, [esi + edx * 4] - paddd xmm1, [esi + edx * 4 + 16] - paddd xmm2, [esi + edx * 4 + 32] - paddd xmm3, [esi + edx * 4 + 48] - lea esi, [esi + 64] - - cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area - cvtdq2ps xmm1, xmm1 - mulps xmm0, xmm4 - mulps xmm1, xmm4 - cvtdq2ps xmm2, xmm2 - cvtdq2ps xmm3, xmm3 - mulps xmm2, xmm4 - mulps xmm3, xmm4 - cvtps2dq xmm0, xmm0 - cvtps2dq xmm1, xmm1 - cvtps2dq xmm2, xmm2 - cvtps2dq xmm3, xmm3 - packssdw xmm0, xmm1 - packssdw xmm2, xmm3 - packuswb xmm0, xmm2 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - movdqu xmm0, [eax] - psubd xmm0, [eax + edx * 4] - lea eax, [eax + 16] - psubd xmm0, [esi] - paddd xmm0, [esi + edx * 4] - lea esi, [esi + 16] - cvtdq2ps xmm0, xmm0 - mulps xmm0, xmm4 - cvtps2dq xmm0, xmm0 - packssdw xmm0, xmm0 - packuswb xmm0, xmm0 - movd dword ptr [edi], xmm0 - lea edi, [edi + 4] - sub ecx, 1 - jge l1 - l1b: - } -} -#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 - -#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 -// Creates a table of cumulative sums where each value is a sum of all values -// above and to the left of the value. -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - __asm { - mov eax, row - mov edx, cumsum - mov esi, previous_cumsum - mov ecx, width - pxor xmm0, xmm0 - pxor xmm1, xmm1 - - sub ecx, 4 - jl l4b - test edx, 15 - jne l4b - - // 4 pixel loop - l4: - movdqu xmm2, [eax] // 4 argb pixels 16 bytes. - lea eax, [eax + 16] - movdqa xmm4, xmm2 - - punpcklbw xmm2, xmm1 - movdqa xmm3, xmm2 - punpcklwd xmm2, xmm1 - punpckhwd xmm3, xmm1 - - punpckhbw xmm4, xmm1 - movdqa xmm5, xmm4 - punpcklwd xmm4, xmm1 - punpckhwd xmm5, xmm1 - - paddd xmm0, xmm2 - movdqu xmm2, [esi] // previous row above. 
- paddd xmm2, xmm0 - - paddd xmm0, xmm3 - movdqu xmm3, [esi + 16] - paddd xmm3, xmm0 - - paddd xmm0, xmm4 - movdqu xmm4, [esi + 32] - paddd xmm4, xmm0 - - paddd xmm0, xmm5 - movdqu xmm5, [esi + 48] - lea esi, [esi + 64] - paddd xmm5, xmm0 - - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - movdqu [edx + 32], xmm4 - movdqu [edx + 48], xmm5 - - lea edx, [edx + 64] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - movd xmm2, dword ptr [eax] // 1 argb pixel - lea eax, [eax + 4] - punpcklbw xmm2, xmm1 - punpcklwd xmm2, xmm1 - paddd xmm0, xmm2 - movdqu xmm2, [esi] - lea esi, [esi + 16] - paddd xmm2, xmm0 - movdqu [edx], xmm2 - lea edx, [edx + 16] - sub ecx, 1 - jge l1 - - l1b: - } -} -#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 - -#ifdef HAS_ARGBAFFINEROW_SSE2 -// Copy ARGB pixels from source image with slope to a row of destination. -__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 12] // src_argb - mov esi, [esp + 16] // stride - mov edx, [esp + 20] // dst_argb - mov ecx, [esp + 24] // pointer to uv_dudv - movq xmm2, qword ptr [ecx] // uv - movq xmm7, qword ptr [ecx + 8] // dudv - mov ecx, [esp + 28] // width - shl esi, 16 // 4, stride - add esi, 4 - movd xmm5, esi - sub ecx, 4 - jl l4b - - // setup for 4 pixel loop - pshufd xmm7, xmm7, 0x44 // dup dudv - pshufd xmm5, xmm5, 0 // dup 4, stride - movdqa xmm0, xmm2 // x0, y0, x1, y1 - addps xmm0, xmm7 - movlhps xmm2, xmm0 - movdqa xmm4, xmm7 - addps xmm4, xmm4 // dudv *= 2 - movdqa xmm3, xmm2 // x2, y2, x3, y3 - addps xmm3, xmm4 - addps xmm4, xmm4 // dudv *= 4 - - // 4 pixel loop - l4: - cvttps2dq xmm0, xmm2 // x, y float to int first 2 - cvttps2dq xmm1, xmm3 // x, y float to int next 2 - packssdw xmm0, xmm1 // x, y as 8 shorts - pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd edi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd xmm1, [eax + esi] // read pixel 0 - movd xmm6, [eax + edi] // read pixel 1 - punpckldq xmm1, xmm6 // combine pixel 0 and 1 - addps xmm2, xmm4 // x, y += dx, dy first 2 - movq qword ptr [edx], xmm1 - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd edi, xmm0 - movd xmm6, [eax + esi] // read pixel 2 - movd xmm0, [eax + edi] // read pixel 3 - punpckldq xmm6, xmm0 // combine pixel 2 and 3 - addps xmm3, xmm4 // x, y += dx, dy next 2 - movq qword ptr 8[edx], xmm6 - lea edx, [edx + 16] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - cvttps2dq xmm0, xmm2 // x, y float to int - packssdw xmm0, xmm0 // x, y as shorts - pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride - addps xmm2, xmm7 // x, y += dx, dy - movd esi, xmm0 - movd xmm0, [eax + esi] // copy a pixel - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge l1 - l1b: - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBAFFINEROW_SSE2 - -#ifdef HAS_INTERPOLATEROW_AVX2 -// Bilinear filter 32x2 -> 32x1 -__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - // Dispatch to specialized filters if applicable. 
- cmp eax, 0 - je xloop100 // 0 / 256. Blend 100 / 0. - sub edi, esi - cmp eax, 128 - je xloop50 // 128 /256 is 0.50. Blend 50 / 50. - - vmovd xmm0, eax // high fraction 0..255 - neg eax - add eax, 256 - vmovd xmm5, eax // low fraction 256..1 - vpunpcklbw xmm5, xmm5, xmm0 - vpunpcklwd xmm5, xmm5, xmm5 - vbroadcastss ymm5, xmm5 - - mov eax, 0x80808080 // 128b for bias and rounding. - vmovd xmm4, eax - vbroadcastss ymm4, xmm4 - - xloop: - vmovdqu ymm0, [esi] - vmovdqu ymm2, [esi + edx] - vpunpckhbw ymm1, ymm0, ymm2 // mutates - vpunpcklbw ymm0, ymm0, ymm2 - vpsubb ymm1, ymm1, ymm4 // bias to signed image - vpsubb ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm5, ymm1 - vpmaddubsw ymm0, ymm5, ymm0 - vpaddw ymm1, ymm1, ymm4 // unbias and round - vpaddw ymm0, ymm0, ymm4 - vpsrlw ymm1, ymm1, 8 - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutates - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop - jmp xloop99 - - // Blend 50 / 50. - xloop50: - vmovdqu ymm0, [esi] - vpavgb ymm0, ymm0, [esi + edx] - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop50 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - xloop100: - rep movsb - - xloop99: - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_INTERPOLATEROW_AVX2 - -// Bilinear filter 16x2 -> 16x1 -// TODO(fbarchard): Consider allowing 256 using memcpy. -__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 /256. Blend 100 / 0. - cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. - - movd xmm0, eax // high fraction 0..255 - neg eax - add eax, 256 - movd xmm5, eax // low fraction 255..1 - punpcklbw xmm5, xmm0 - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - mov eax, 0x80808080 // 128 for biasing image to signed. - movd xmm4, eax - pshufd xmm4, xmm4, 0x00 - - xloop: - movdqu xmm0, [esi] - movdqu xmm2, [esi + edx] - movdqu xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - psubb xmm0, xmm4 // bias image by -128 - psubb xmm1, xmm4 - movdqa xmm2, xmm5 - movdqa xmm3, xmm5 - pmaddubsw xmm2, xmm0 - pmaddubsw xmm3, xmm1 - paddw xmm2, xmm4 - paddw xmm3, xmm4 - psrlw xmm2, 8 - psrlw xmm3, 8 - packuswb xmm2, xmm3 - movdqu [esi + edi], xmm2 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop - jmp xloop99 - - // Blend 50 / 50. - xloop50: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop50 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - xloop100: - movdqu xmm0, [esi] - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop100 - - xloop99: - pop edi - pop esi - ret - } -} - -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
-__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - movdqu xmm5, [ecx] - mov ecx, [esp + 16] // width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pshufb xmm0, xmm5 - pshufb xmm1, xmm5 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg wloop - ret - } -} - -#ifdef HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. - mov ecx, [esp + 16] // width - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpshufb ymm0, ymm0, ymm5 - vpshufb ymm1, ymm1, ymm5 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg wloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBSHUFFLEROW_AVX2 - -// YUY2 - Macro-pixel = 2 image pixels -// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... - -// UYVY - Macro-pixel = 2 image pixels -// U0Y0V0Y1 - -__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width - sub edx, esi - - convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V - lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 // YUYV - punpckhbw xmm1, xmm2 - movdqu [edi], xmm0 - movdqu [edi + 16], xmm1 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width - sub edx, esi - - convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V - lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y - movdqa xmm1, xmm2 - lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY - punpckhbw xmm2, xmm0 - movdqu [edi], xmm1 - movdqu [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* src_argb */ - mov edx, [esp + 4 + 8] /* dst_argb */ - mov esi, [esp + 4 + 12] /* poly */ - mov ecx, [esp + 4 + 16] /* width */ - pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - - // 2 pixel loop. 
- convertloop: - // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel - // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel - movq xmm0, qword ptr [eax] // BGRABGRA - lea eax, [eax + 8] - punpcklbw xmm0, xmm3 - movdqa xmm4, xmm0 - punpcklwd xmm0, xmm3 // pixel 0 - punpckhwd xmm4, xmm3 // pixel 1 - cvtdq2ps xmm0, xmm0 // 4 floats - cvtdq2ps xmm4, xmm4 - movdqa xmm1, xmm0 // X - movdqa xmm5, xmm4 - mulps xmm0, [esi + 16] // C1 * X - mulps xmm4, [esi + 16] - addps xmm0, [esi] // result = C0 + C1 * X - addps xmm4, [esi] - movdqa xmm2, xmm1 - movdqa xmm6, xmm5 - mulps xmm2, xmm1 // X * X - mulps xmm6, xmm5 - mulps xmm1, xmm2 // X * X * X - mulps xmm5, xmm6 - mulps xmm2, [esi + 32] // C2 * X * X - mulps xmm6, [esi + 32] - mulps xmm1, [esi + 48] // C3 * X * X * X - mulps xmm5, [esi + 48] - addps xmm0, xmm2 // result += C2 * X * X - addps xmm4, xmm6 - addps xmm0, xmm1 // result += C3 * X * X * X - addps xmm4, xmm5 - cvttps2dq xmm0, xmm0 - cvttps2dq xmm4, xmm4 - packuswb xmm0, xmm4 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 2 - jg convertloop - pop esi - ret - } -} -#endif // HAS_ARGBPOLYNOMIALROW_SSE2 - -#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* poly */ - vbroadcastf128 ymm4, [ecx] // C0 - vbroadcastf128 ymm5, [ecx + 16] // C1 - vbroadcastf128 ymm6, [ecx + 32] // C2 - vbroadcastf128 ymm7, [ecx + 48] // C3 - mov ecx, [esp + 16] /* width */ - - // 2 pixel loop. - convertloop: - vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels - lea eax, [eax + 8] - vcvtdq2ps ymm0, ymm0 // X 8 floats - vmulps ymm2, ymm0, ymm0 // X * X - vmulps ymm3, ymm0, ymm7 // C3 * X - vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X - vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X - vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X - vcvttps2dq ymm0, ymm0 - vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 - vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 - vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 - vmovq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 2 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBPOLYNOMIALROW_AVX2 - -#ifdef HAS_HALFFLOATROW_SSE2 -static float kExpBias = 1.9259299444e-34f; -__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - movd xmm4, dword ptr [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - mulss xmm4, kExpBias - pshufd xmm4, xmm4, 0 - pxor xmm5, xmm5 - sub edx, eax - - // 8 pixel loop. 
- convertloop: - movdqu xmm2, xmmword ptr [eax] // 8 shorts - add eax, 16 - movdqa xmm3, xmm2 - punpcklwd xmm2, xmm5 - cvtdq2ps xmm2, xmm2 // convert 8 ints to floats - punpckhwd xmm3, xmm5 - cvtdq2ps xmm3, xmm3 - mulps xmm2, xmm4 - mulps xmm3, xmm4 - psrld xmm2, 13 - psrld xmm3, 13 - packssdw xmm2, xmm3 - movdqu [eax + edx - 16], xmm2 - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_HALFFLOATROW_SSE2 - -#ifdef HAS_HALFFLOATROW_AVX2 -__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - movd xmm4, dword ptr [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - - vmulss xmm4, xmm4, kExpBias - vbroadcastss ymm4, xmm4 - vpxor ymm5, ymm5, ymm5 - sub edx, eax - - // 16 pixel loop. - convertloop: - vmovdqu ymm2, [eax] // 16 shorts - add eax, 32 - vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints - vpunpcklwd ymm2, ymm2, ymm5 - vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats - vcvtdq2ps ymm2, ymm2 - vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. - vmulps ymm2, ymm2, ymm4 - vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate - vpsrld ymm2, ymm2, 13 - vpackssdw ymm2, ymm2, ymm3 - vmovdqu [eax + edx - 32], ymm2 - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_HALFFLOATROW_AVX2 - -#ifdef HAS_HALFFLOATROW_F16C -__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - vbroadcastss ymm4, [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - sub edx, eax - - // 16 pixel loop. - convertloop: - vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints - vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts - add eax, 32 - vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats - vcvtdq2ps ymm3, ymm3 - vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 - vmulps ymm3, ymm3, ymm4 - vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate - vcvtps2ph xmm3, ymm3, 3 - vmovdqu [eax + edx + 32], xmm2 - vmovdqu [eax + edx + 32 + 16], xmm3 - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_HALFFLOATROW_F16C - -#ifdef HAS_ARGBCOLORTABLEROW_X86 -// Tranform ARGB pixels with color table. -__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ - - // 1 pixel loop. - convertloop: - movzx edx, byte ptr [eax] - lea eax, [eax + 4] - movzx edx, byte ptr [esi + edx * 4] - mov byte ptr [eax - 4], dl - movzx edx, byte ptr [eax - 4 + 1] - movzx edx, byte ptr [esi + edx * 4 + 1] - mov byte ptr [eax - 4 + 1], dl - movzx edx, byte ptr [eax - 4 + 2] - movzx edx, byte ptr [esi + edx * 4 + 2] - mov byte ptr [eax - 4 + 2], dl - movzx edx, byte ptr [eax - 4 + 3] - movzx edx, byte ptr [esi + edx * 4 + 3] - mov byte ptr [eax - 4 + 3], dl - dec ecx - jg convertloop - pop esi - ret - } -} -#endif // HAS_ARGBCOLORTABLEROW_X86 - -#ifdef HAS_RGBCOLORTABLEROW_X86 -// Tranform RGB pixels with color table. -__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ - - // 1 pixel loop. 
- convertloop: - movzx edx, byte ptr [eax] - lea eax, [eax + 4] - movzx edx, byte ptr [esi + edx * 4] - mov byte ptr [eax - 4], dl - movzx edx, byte ptr [eax - 4 + 1] - movzx edx, byte ptr [esi + edx * 4 + 1] - mov byte ptr [eax - 4 + 1], dl - movzx edx, byte ptr [eax - 4 + 2] - movzx edx, byte ptr [esi + edx * 4 + 2] - mov byte ptr [eax - 4 + 2], dl - dec ecx - jg convertloop - - pop esi - ret - } -} -#endif // HAS_RGBCOLORTABLEROW_X86 - -#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 -// Tranform RGB pixels with luma table. -__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] /* src_argb */ - mov edi, [esp + 8 + 8] /* dst_argb */ - mov ecx, [esp + 8 + 12] /* width */ - movd xmm2, dword ptr [esp + 8 + 16] // luma table - movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff - pshufd xmm2, xmm2, 0 - pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 - psllw xmm4, 8 - pxor xmm5, xmm5 - - // 4 pixel loop. - convertloop: - movdqu xmm0, xmmword ptr [eax] // generate luma ptr - pmaddubsw xmm0, xmm3 - phaddw xmm0, xmm0 - pand xmm0, xmm4 // mask out low bits - punpcklwd xmm0, xmm5 - paddd xmm0, xmm2 // add table base - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi], dl - movzx edx, byte ptr [eax + 1] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 1], dl - movzx edx, byte ptr [eax + 2] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 2], dl - movzx edx, byte ptr [eax + 3] // copy alpha. - mov byte ptr [edi + 3], dl - - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax + 4] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 4], dl - movzx edx, byte ptr [eax + 5] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 5], dl - movzx edx, byte ptr [eax + 6] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 6], dl - movzx edx, byte ptr [eax + 7] // copy alpha. - mov byte ptr [edi + 7], dl - - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax + 8] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 8], dl - movzx edx, byte ptr [eax + 9] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 9], dl - movzx edx, byte ptr [eax + 10] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 10], dl - movzx edx, byte ptr [eax + 11] // copy alpha. - mov byte ptr [edi + 11], dl - - movd esi, xmm0 - - movzx edx, byte ptr [eax + 12] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 12], dl - movzx edx, byte ptr [eax + 13] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 13], dl - movzx edx, byte ptr [eax + 14] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 14], dl - movzx edx, byte ptr [eax + 15] // copy alpha. 
- mov byte ptr [edi + 15], dl - - lea eax, [eax + 16] - lea edi, [edi + 16] - sub ecx, 4 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 - -#endif // defined(_M_X64) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) diff --git a/thirdparty/libyuv/source/scale.cc b/thirdparty/libyuv/source/scale.cc deleted file mode 100644 index 03b0486..0000000 --- a/thirdparty/libyuv/source/scale.cc +++ /dev/null @@ -1,2385 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyPlane -#include "libyuv/row.h" -#include "libyuv/scale_row.h" -#include "libyuv/scale_uv.h" // For UVScale - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) - -// Scale plane, 1/2 -// This is an optimized version for scaling down a plane to 1/2 of -// its original size. - -static void ScalePlaneDown2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width) = - filtering == kFilterNone - ? ScaleRowDown2_C - : (filtering == kFilterLinear ? ScaleRowDown2Linear_C - : ScaleRowDown2Box_C); - int row_stride = src_stride << 1; - (void)src_width; - (void)src_height; - if (!filtering) { - src_ptr += src_stride; // Point to odd rows. - src_stride = 0; - } - -#if defined(HAS_SCALEROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_NEON - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON - : ScaleRowDown2Box_Any_NEON); - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_NEON - : ScaleRowDown2Box_NEON); - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_SSSE3 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 - : ScaleRowDown2Box_Any_SSSE3); - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_SSSE3 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 - : ScaleRowDown2Box_SSSE3); - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_AVX2 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 - : ScaleRowDown2Box_Any_AVX2); - if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 - : (filtering == kFilterLinear - ? 
ScaleRowDown2Linear_AVX2 - : ScaleRowDown2Box_AVX2); - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_MMI - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI - : ScaleRowDown2Box_Any_MMI); - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_MMI - : ScaleRowDown2Box_MMI); - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_MSA - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA - : ScaleRowDown2Box_Any_MSA); - if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_MSA - : ScaleRowDown2Box_MSA); - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - // TODO(fbarchard): Loop through source height to allow odd height. - for (y = 0; y < dst_height; ++y) { - ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -static void ScalePlaneDown2_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width) = - filtering == kFilterNone - ? ScaleRowDown2_16_C - : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C - : ScaleRowDown2Box_16_C); - int row_stride = src_stride << 1; - (void)src_width; - (void)src_height; - if (!filtering) { - src_ptr += src_stride; // Point to odd rows. - src_stride = 0; - } - -#if defined(HAS_SCALEROWDOWN2_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = - filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; - } -#endif -#if defined(HAS_SCALEROWDOWN2_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_16_SSE2 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 - : ScaleRowDown2Box_16_SSE2); - } -#endif -#if defined(HAS_SCALEROWDOWN2_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_16_MMI - : ScaleRowDown2Box_16_MMI); - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - // TODO(fbarchard): Loop through source height to allow odd height. - for (y = 0; y < dst_height; ++y) { - ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -// Scale plane, 1/4 -// This is an optimized version for scaling down a plane to 1/4 of -// its original size. - -static void ScalePlaneDown4(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width) = - filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; - int row_stride = src_stride << 2; - (void)src_width; - (void)src_height; - if (!filtering) { - src_ptr += src_stride * 2; // Point to row 2. 
- src_stride = 0; - } -#if defined(HAS_SCALEROWDOWN4_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; - } - } -#endif -#if defined(HAS_SCALEROWDOWN4_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; - } - } -#endif -#if defined(HAS_SCALEROWDOWN4_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; - } - } -#endif -#if defined(HAS_SCALEROWDOWN4_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI; - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI; - } - } -#endif -#if defined(HAS_SCALEROWDOWN4_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (y = 0; y < dst_height; ++y) { - ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -static void ScalePlaneDown4_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width) = - filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; - int row_stride = src_stride << 2; - (void)src_width; - (void)src_height; - if (!filtering) { - src_ptr += src_stride * 2; // Point to row 2. - src_stride = 0; - } -#if defined(HAS_SCALEROWDOWN4_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; - } -#endif -#if defined(HAS_SCALEROWDOWN4_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; - } -#endif -#if defined(HAS_SCALEROWDOWN4_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI; - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (y = 0; y < dst_height; ++y) { - ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -// Scale plane down, 3/4 -static void ScalePlaneDown34(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width); - const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; - (void)src_width; - (void)src_height; - assert(dst_width % 3 == 0); - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_C; - ScaleRowDown34_1 = ScaleRowDown34_C; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; - } -#if defined(HAS_SCALEROWDOWN34_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_NEON; - ScaleRowDown34_1 = ScaleRowDown34_Any_NEON; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON; - } - if (dst_width % 24 == 0) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_NEON; - ScaleRowDown34_1 = ScaleRowDown34_NEON; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN34_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_MMI; - ScaleRowDown34_1 = ScaleRowDown34_Any_MMI; - if (dst_width % 24 == 0) { - ScaleRowDown34_0 = ScaleRowDown34_MMI; - ScaleRowDown34_1 = ScaleRowDown34_MMI; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN34_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_MSA; - ScaleRowDown34_1 = ScaleRowDown34_Any_MSA; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA; - } - if (dst_width % 48 == 0) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_MSA; - ScaleRowDown34_1 = ScaleRowDown34_MSA; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN34_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; - } - if (dst_width % 24 == 0) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_SSSE3; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; - } - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); - } -} - -static void ScalePlaneDown34_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width); - const int filter_stride = (filtering == 
kFilterLinear) ? 0 : src_stride; - (void)src_width; - (void)src_height; - assert(dst_width % 3 == 0); - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_C; - ScaleRowDown34_1 = ScaleRowDown34_16_C; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C; - } -#if defined(HAS_SCALEROWDOWN34_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_NEON; - ScaleRowDown34_1 = ScaleRowDown34_16_NEON; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON; - } - } -#endif -#if defined(HAS_SCALEROWDOWN34_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); - } -} - -// Scale plane, 3/8 -// This is an optimized version for scaling down a plane to 3/8 -// of its original size. -// -// Uses box filter arranges like this -// aaabbbcc -> abc -// aaabbbcc def -// aaabbbcc ghi -// dddeeeff -// dddeeeff -// dddeeeff -// ggghhhii -// ggghhhii -// Boxes are 3x3, 2x3, 3x2 and 2x2 - -static void ScalePlaneDown38(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width); - const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; - assert(dst_width % 3 == 0); - (void)src_width; - (void)src_height; - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_C; - ScaleRowDown38_2 = ScaleRowDown38_C; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; - } - -#if defined(HAS_SCALEROWDOWN38_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_Any_NEON; - ScaleRowDown38_2 = ScaleRowDown38_Any_NEON; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON; - } - if (dst_width % 12 == 0) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_NEON; - ScaleRowDown38_2 = ScaleRowDown38_NEON; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN38_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; - } - if (dst_width % 12 == 0 && !filtering) { - ScaleRowDown38_3 = ScaleRowDown38_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_SSSE3; - } - if (dst_width % 6 == 0 && filtering) { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; - } - } -#endif -#if defined(HAS_SCALEROWDOWN38_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_Any_MSA; - ScaleRowDown38_2 = ScaleRowDown38_Any_MSA; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA; - } - if (dst_width % 12 == 0) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_MSA; - ScaleRowDown38_2 = ScaleRowDown38_MSA; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA; - } - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } -} - -static void ScalePlaneDown38_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width); - const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; - (void)src_width; - (void)src_height; - assert(dst_width % 3 == 0); - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_C; - ScaleRowDown38_2 = ScaleRowDown38_16_C; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C; - } -#if defined(HAS_SCALEROWDOWN38_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_NEON; - ScaleRowDown38_2 = ScaleRowDown38_16_NEON; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; - } - } -#endif -#if defined(HAS_SCALEROWDOWN38_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } -} - -#define MIN1(x) ((x) < 1 ? 
1 : (x)) - -static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) { - uint32_t sum = 0u; - int x; - assert(iboxwidth > 0); - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - return sum; -} - -static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) { - uint32_t sum = 0u; - int x; - assert(iboxwidth > 0); - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - return sum; -} - -static void ScaleAddCols2_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t* src_ptr, - uint8_t* dst_ptr) { - int i; - int scaletbl[2]; - int minboxwidth = dx >> 16; - int boxwidth; - scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); - scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); - for (i = 0; i < dst_width; ++i) { - int ix = x >> 16; - x += dx; - boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = - SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >> - 16; - } -} - -static void ScaleAddCols2_16_C(int dst_width, - int boxheight, - int x, - int dx, - const uint32_t* src_ptr, - uint16_t* dst_ptr) { - int i; - int scaletbl[2]; - int minboxwidth = dx >> 16; - int boxwidth; - scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); - scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); - for (i = 0; i < dst_width; ++i) { - int ix = x >> 16; - x += dx; - boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> - 16; - } -} - -static void ScaleAddCols0_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t* src_ptr, - uint8_t* dst_ptr) { - int scaleval = 65536 / boxheight; - int i; - (void)dx; - src_ptr += (x >> 16); - for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = src_ptr[i] * scaleval >> 16; - } -} - -static void ScaleAddCols1_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t* src_ptr, - uint8_t* dst_ptr) { - int boxwidth = MIN1(dx >> 16); - int scaleval = 65536 / (boxwidth * boxheight); - int i; - x >>= 16; - for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; - x += boxwidth; - } -} - -static void ScaleAddCols1_16_C(int dst_width, - int boxheight, - int x, - int dx, - const uint32_t* src_ptr, - uint16_t* dst_ptr) { - int boxwidth = MIN1(dx >> 16); - int scaleval = 65536 / (boxwidth * boxheight); - int i; - for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16; - x += boxwidth; - } -} - -// Scale plane down to any dimensions, with interpolation. -// (boxfilter). -// -// Same method as SimpleScale, which is fixed point, outputting -// one pixel of destination using fixed point (16.16) to step -// through source, sampling a box of pixel with simple -// averaging. -static void ScalePlaneBox(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { - int j, k; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - { - // Allocate a row buffer of uint16_t. - align_buffer_64(row16, src_width * 2); - void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint16_t* src_ptr, uint8_t* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_C - : ((dx != 0x10000) ? 
ScaleAddCols1_C : ScaleAddCols0_C); - void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, - int src_width) = ScaleAddRow_C; -#if defined(HAS_SCALEADDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleAddRow = ScaleAddRow_Any_SSE2; - if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_SSE2; - } - } -#endif -#if defined(HAS_SCALEADDROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleAddRow = ScaleAddRow_Any_AVX2; - if (IS_ALIGNED(src_width, 32)) { - ScaleAddRow = ScaleAddRow_AVX2; - } - } -#endif -#if defined(HAS_SCALEADDROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleAddRow = ScaleAddRow_Any_NEON; - if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_NEON; - } - } -#endif -#if defined(HAS_SCALEADDROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleAddRow = ScaleAddRow_Any_MMI; - if (IS_ALIGNED(src_width, 8)) { - ScaleAddRow = ScaleAddRow_MMI; - } - } -#endif -#if defined(HAS_SCALEADDROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleAddRow = ScaleAddRow_Any_MSA; - if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_MSA; - } - } -#endif - - for (j = 0; j < dst_height; ++j) { - int boxheight; - int iy = y >> 16; - const uint8_t* src = src_ptr + iy * src_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - boxheight = MIN1((y >> 16) - iy); - memset(row16, 0, src_width * 2); - for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint16_t*)(row16), src_width); - src += src_stride; - } - ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr); - dst_ptr += dst_stride; - } - free_aligned_buffer_64(row16); - } -} - -static void ScalePlaneBox_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - int j, k; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - { - // Allocate a row buffer of uint32_t. - align_buffer_64(row32, src_width * 4); - void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint32_t* src_ptr, uint16_t* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C; - void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, - int src_width) = ScaleAddRow_16_C; - -#if defined(HAS_SCALEADDROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_16_SSE2; - } -#endif - -#if defined(HAS_SCALEADDROW_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) { - ScaleAddRow = ScaleAddRow_16_MMI; - } -#endif - for (j = 0; j < dst_height; ++j) { - int boxheight; - int iy = y >> 16; - const uint16_t* src = src_ptr + iy * src_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - boxheight = MIN1((y >> 16) - iy); - memset(row32, 0, src_width * 4); - for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint32_t*)(row32), src_width); - src += src_stride; - } - ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr); - dst_ptr += dst_stride; - } - free_aligned_buffer_64(row32); - } -} - -// Scale plane down with bilinear interpolation. 
-void ScalePlaneBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. - // Allocate a row buffer. - align_buffer_64(row, src_width); - - const int max_y = (src_height - 1) << 16; - int j; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, - int dst_width, int x, int dx) = - (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(src_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(src_width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - -#if defined(HAS_SCALEFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEFILTERCOLS_NEON) - if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEFILTERCOLS_MSA) - if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 16)) { - ScaleFilterCols = ScaleFilterCols_MSA; - } - } -#endif - if (y > max_y) { - y = max_y; - } - - for (j = 0; j < dst_height; ++j) { - int yi = y >> 16; - const uint8_t* src = src_ptr + yi * src_stride; - if (filtering == kFilterLinear) { - ScaleFilterCols(dst_ptr, src, dst_width, x, dx); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(row, src, src_stride, src_width, yf); - ScaleFilterCols(dst_ptr, row, dst_width, x, dx); - } - dst_ptr += dst_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - } - free_aligned_buffer_64(row); -} - -void ScalePlaneBilinearDown_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. 
- // Allocate a row buffer. - align_buffer_64(row, src_width * 2); - - const int max_y = (src_height - 1) << 16; - int j; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, - int dst_width, int x, int dx) = - (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_16_SSE2; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_16_SSSE3; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_16_AVX2; - if (IS_ALIGNED(src_width, 32)) { - InterpolateRow = InterpolateRow_16_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_16_NEON; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_16_NEON; - } - } -#endif - -#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_16_SSSE3; - } -#endif - if (y > max_y) { - y = max_y; - } - - for (j = 0; j < dst_height; ++j) { - int yi = y >> 16; - const uint16_t* src = src_ptr + yi * src_stride; - if (filtering == kFilterLinear) { - ScaleFilterCols(dst_ptr, src, dst_width, x, dx); - } else { - int yf = (y >> 8) & 255; - InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf); - ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx); - } - dst_ptr += dst_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - } - free_aligned_buffer_64(row); -} - -// Scale up down with bilinear interpolation. -void ScalePlaneBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - int j; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, - int dst_width, int x, int dx) = - filtering ? 
ScaleFilterCols_C : ScaleCols_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif - - if (filtering && src_width >= 32768) { - ScaleFilterCols = ScaleFilterCols64_C; - } -#if defined(HAS_SCALEFILTERCOLS_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEFILTERCOLS_NEON) - if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEFILTERCOLS_MSA) - if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 16)) { - ScaleFilterCols = ScaleFilterCols_MSA; - } - } -#endif - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleFilterCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_SSE2; - } -#endif -#if defined(HAS_SCALECOLS_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_MMI; - } -#endif - } - - if (y > max_y) { - y = max_y; - } - { - int yi = y >> 16; - const uint8_t* src = src_ptr + yi * src_stride; - - // Allocate 2 row buffers. - const int kRowSize = (dst_width + 31) & ~31; - align_buffer_64(row, kRowSize * 2); - - uint8_t* rowptr = row; - int rowstride = kRowSize; - int lasty = yi; - - ScaleFilterCols(rowptr, src, dst_width, x, dx); - if (src_height > 1) { - src += src_stride; - } - ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - src = src_ptr + yi * src_stride; - } - if (yi != lasty) { - ScaleFilterCols(rowptr, src, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - src += src_stride; - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); - } - dst_ptr += dst_stride; - y += dy; - } - free_aligned_buffer_64(row); - } -} - -// Scale plane, horizontally up by 2 times. -// Uses linear filter horizontally, nearest vertically. -// This is an optimized version for scaling up a plane to 2 times of -// its original width, using linear interpolation. -// This is used to scale U and V planes of I422 to I444. 
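The per-row kernels (ScaleRowUp2_Linear_* and their SIMD variants) live in the row files, not in this file, but conceptually each interior destination pixel is a 3:1 blend of its two nearest source pixels, which is what a 2x linear upsample at quarter-pixel phase works out to. The sketch below is hedged: LinearUp2RowSketch is an invented name, and it assumes the same 3:1-with-rounding convention that the SU2BLANY wrapper later in this diff uses for its edge pixels; the exact upstream kernels may differ at the edges.

    #include <stdint.h>

    /* Sketch of a 2x horizontal linear upsample for one row; assumes
     * dst_width >= 2.  Edge pixels are copied, interior pixels are 3:1
     * blends of the two nearest source pixels with round-to-nearest. */
    static void LinearUp2RowSketch(const uint8_t* src, uint8_t* dst, int dst_width) {
      int src_width = (dst_width + 1) / 2;
      int x;
      dst[0] = src[0];                                /* left edge */
      for (x = 0; x + 1 < src_width; ++x) {
        dst[2 * x + 1] = (uint8_t)((3 * src[x] + src[x + 1] + 2) >> 2);
        dst[2 * x + 2] = (uint8_t)((src[x] + 3 * src[x + 1] + 2) >> 2);
      }
      dst[dst_width - 1] = src[(dst_width / 2) - 1];  /* right edge */
    }

ScalePlaneUp2_Linear below then simply picks one source row per destination row (nearest vertically) and runs such a kernel across it.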
-void ScalePlaneUp2_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { - void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = - ScaleRowUp2_Linear_Any_C; - int i; - int y; - int dy; - - // This function can only scale up by 2 times horizontally. - assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEROWUP2LINEAR_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_NEON - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; - } -#endif - - if (dst_height == 1) { - ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr, - dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width); - dst_ptr += dst_stride; - y += dy; - } - } -} - -// Scale plane, up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original size, using bilinear interpolation. -// This is used to scale U and V planes of I420 to I444. -void ScalePlaneUp2_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { - void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_Any_C; - int x; - - // This function can only scale up by 2 times. - assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEROWUP2BILINEAR_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_NEON - if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; - } -#endif - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - // TODO(fbarchard): Test performance of writing one row of destination at a - // time. - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -// Scale at most 14 bit plane, horizontally up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original width, using linear interpolation. -// stride is in count of uint16_t. -// This is used to scale U and V planes of I210 to I410 and I212 to I412. 
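Reading the loop in ScalePlaneUp2_Bilinear above: the first and last destination rows are produced by calling Scale2RowUp with a source stride of 0, so the vertical blend collapses to the edge row itself, while each interior iteration consumes one pair of source rows and emits two destination rows at 3:1 and 1:3 vertical weights. For src_height = 3 and dst_height = 6 the schedule works out to:

    dst row 0      <- src row 0         (stride 0: edge row replicated)
    dst rows 1, 2  <- src rows 0, 1     (3:1 and 1:3 vertical blends)
    dst rows 3, 4  <- src rows 1, 2     (3:1 and 1:3 vertical blends)
    dst row 5      <- src row 2         (stride 0: edge row replicated)

The 12-bit and 16-bit variants that follow reuse this same schedule; only the row kernels change.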
-void ScalePlaneUp2_12_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; - int i; - int y; - int dy; - - // This function can only scale up by 2 times horizontally. - assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_NEON - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; - } -#endif - - if (dst_height == 1) { - ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr, - dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width); - dst_ptr += dst_stride; - y += dy; - } - } -} - -// Scale at most 12 bit plane, up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original size, using bilinear interpolation. -// stride is in count of uint16_t. -// This is used to scale U and V planes of I010 to I410 and I012 to I412. -void ScalePlaneUp2_12_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; - int x; - - // This function can only scale up by 2 times. - assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_NEON - if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; - } -#endif - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -void ScalePlaneUp2_16_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; - int i; - int y; - int dy; - - // This function can only scale up by 2 times horizontally. 
- assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_NEON - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON; - } -#endif - - if (dst_height == 1) { - ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr, - dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width); - dst_ptr += dst_stride; - y += dy; - } - } -} - -void ScalePlaneUp2_16_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; - int x; - - // This function can only scale up by 2 times. - assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON - if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON; - } -#endif - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -void ScalePlaneBilinearUp_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - int j; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_16_C; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, - int dst_width, int x, int dx) = - filtering ? 
ScaleFilterCols_16_C : ScaleCols_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_16_SSE2; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_16_SSSE3; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_16_AVX2; - if (IS_ALIGNED(dst_width, 32)) { - InterpolateRow = InterpolateRow_16_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_16_NEON; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_16_NEON; - } - } -#endif - - if (filtering && src_width >= 32768) { - ScaleFilterCols = ScaleFilterCols64_16_C; - } -#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_16_SSSE3; - } -#endif - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleFilterCols = ScaleColsUp2_16_C; -#if defined(HAS_SCALECOLS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_16_SSE2; - } -#endif -#if defined(HAS_SCALECOLS_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_16_MMI; - } -#endif - } - - if (y > max_y) { - y = max_y; - } - { - int yi = y >> 16; - const uint16_t* src = src_ptr + yi * src_stride; - - // Allocate 2 row buffers. - const int kRowSize = (dst_width + 31) & ~31; - align_buffer_64(row, kRowSize * 4); - - uint16_t* rowptr = (uint16_t*)row; - int rowstride = kRowSize; - int lasty = yi; - - ScaleFilterCols(rowptr, src, dst_width, x, dx); - if (src_height > 1) { - src += src_stride; - } - ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - src = src_ptr + yi * src_stride; - } - if (yi != lasty) { - ScaleFilterCols(rowptr, src, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - src += src_stride; - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); - } - dst_ptr += dst_stride; - y += dy; - } - free_aligned_buffer_64(row); - } -} - -// Scale Plane to/from any dimensions, without interpolation. -// Fixed point math is used for performance: The upper 16 bits -// of x and dx is the integer part of the source position and -// the lower 16 bits are the fixed decimal part. - -static void ScalePlaneSimple(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { - int i; - void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, - int x, int dx) = ScaleCols_C; - // Initial source x/y coordinate and step values as 16.16 fixed point. 
- int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - - if (src_width * 2 == dst_width && x < 0x8000) { - ScaleCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_SSE2; - } -#endif -#if defined(HAS_SCALECOLS_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_MMI; - } -#endif - } - - for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); - dst_ptr += dst_stride; - y += dy; - } -} - -static void ScalePlaneSimple_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - int i; - void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, - int x, int dx) = ScaleCols_16_C; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - - if (src_width * 2 == dst_width && x < 0x8000) { - ScaleCols = ScaleColsUp2_16_C; -#if defined(HAS_SCALECOLS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_16_SSE2; - } -#endif -#if defined(HAS_SCALECOLS_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_16_MMI; - } -#endif - } - - for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); - dst_ptr += dst_stride; - y += dy; - } -} - -// Scale a plane. -// This function dispatches to a specialized scaler based on scale factor. - -LIBYUV_API -void ScalePlane(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering) { - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * src_stride; - src_stride = -src_stride; - } - - // Use specialized scales to improve performance for common resolutions. - // For example, all the 1/2 scalings will use ScalePlaneDown2() - if (dst_width == src_width && dst_height == src_height) { - // Straight copy. - CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); - return; - } - if (dst_width == src_width && filtering != kFilterBox) { - int dy = FixedDiv(src_height, dst_height); - // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, 0, dy, 1, filtering); - return; - } - if (dst_width <= Abs(src_width) && dst_height <= src_height) { - // Scale down. 
- if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { - // optimized, 3/4 - ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - if (2 * dst_width == src_width && 2 * dst_height == src_height) { - // optimized, 1/2 - ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - // 3/8 rounded up for odd sized chroma height. - if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { - // optimized, 3/8 - ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - if (4 * dst_width == src_width && 4 * dst_height == src_height && - (filtering == kFilterBox || filtering == kFilterNone)) { - // optimized, 1/4 - ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - } - if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); - return; - } - if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { - ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if (filtering && dst_height > src_height) { - ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - if (filtering) { - ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); -} - -LIBYUV_API -void ScalePlane_16(const uint16_t* src, - int src_stride, - int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering) { - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * src_stride; - src_stride = -src_stride; - } - - // Use specialized scales to improve performance for common resolutions. - // For example, all the 1/2 scalings will use ScalePlaneDown2() - if (dst_width == src_width && dst_height == src_height) { - // Straight copy. - CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); - return; - } - if (dst_width == src_width && filtering != kFilterBox) { - int dy = FixedDiv(src_height, dst_height); - // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, 0, dy, 1, filtering); - return; - } - if (dst_width <= Abs(src_width) && dst_height <= src_height) { - // Scale down. 
- if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { - // optimized, 3/4 - ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - if (2 * dst_width == src_width && 2 * dst_height == src_height) { - // optimized, 1/2 - ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - // 3/8 rounded up for odd sized chroma height. - if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { - // optimized, 3/8 - ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - if (4 * dst_width == src_width && 4 * dst_height == src_height && - (filtering == kFilterBox || filtering == kFilterNone)) { - // optimized, 1/4 - ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - } - if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); - return; - } - if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { - ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if (filtering && dst_height > src_height) { - ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - if (filtering) { - ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); -} - -LIBYUV_API -void ScalePlane_12(const uint16_t* src, - int src_stride, - int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering) { - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * src_stride; - src_stride = -src_stride; - } - - if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { - ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - - ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride, - dst_width, dst_height, filtering); -} - -// Scale an I420 image. -// This function in turn calls a scaling function for each plane. 
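I420Scale below splits the work per plane: the Y plane is scaled at the requested size and each chroma plane at SUBSAMPLE(dim, 1, 1) = (dim + 1) / 2 of it. A hypothetical call site makes that concrete (buffer allocation is elided and the strides assume tightly packed planes); halving 1920x1080 scales the 960x540 chroma planes down to 480x270.

    /* Hypothetical usage: shrink a tightly packed 1920x1080 I420 frame to
     * 960x540.  Returns 0 on success, -1 on a NULL plane, a non-positive
     * destination size, or a source dimension over 32768. */
    int rv = I420Scale(src_y, 1920, src_u, 960, src_v, 960,
                       1920, 1080,
                       dst_y, 960, dst_u, 480, dst_v, 480,
                       960, 540, kFilterBox);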
- -LIBYUV_API -int I420Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, - dst_stride_u, dst_halfwidth, dst_halfheight, filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, - dst_stride_v, dst_halfwidth, dst_halfheight, filtering); - return 0; -} - -LIBYUV_API -int I420Scale_16(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, - dst_stride_u, dst_halfwidth, dst_halfheight, filtering); - ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, - dst_stride_v, dst_halfwidth, dst_halfheight, filtering); - return 0; -} - -LIBYUV_API -int I420Scale_12(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, - dst_stride_u, dst_halfwidth, dst_halfheight, filtering); - ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, - dst_stride_v, dst_halfwidth, dst_halfheight, filtering); - 
return 0; -} - -// Scale an I444 image. -// This function in turn calls a scaling function for each plane. - -LIBYUV_API -int I444Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, - dst_width, dst_height, filtering); - ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, - dst_width, dst_height, filtering); - return 0; -} - -LIBYUV_API -int I444Scale_16(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, - dst_width, dst_height, filtering); - ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, - dst_width, dst_height, filtering); - return 0; -} - -LIBYUV_API -int I444Scale_12(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_12(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, - dst_width, dst_height, filtering); - ScalePlane_12(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, - dst_width, dst_height, filtering); - return 0; -} - -// Scale an NV12 image. -// This function in turn calls a scaling function for each plane. 
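NV12Scale below does the same split for the bi-planar layout: Y is scaled directly and the single interleaved UV plane goes through UVScale at half dimensions, where each U,V byte pair counts as one pixel. Because a UV pixel is two bytes, the UV stride in bytes of a tightly packed NV12 frame equals the Y stride. A hypothetical call (buffers elided, tightly packed strides assumed):

    /* Hypothetical usage: 1280x720 NV12 down to 640x360.  The UV plane is
     * 640x360 UV pixels (2 bytes each) on input and 320x180 on output. */
    int rv = NV12Scale(src_y, 1280, src_uv, 1280,
                       1280, 720,
                       dst_y, 640, dst_uv, 640,
                       640, 360, kFilterBilinear);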
- -LIBYUV_API -int NV12Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_uv || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv, - dst_stride_uv, dst_halfwidth, dst_halfheight, filtering); - return 0; -} - -// Deprecated api -LIBYUV_API -int Scale(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - int src_stride_y, - int src_stride_u, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - uint8_t* dst_u, - uint8_t* dst_v, - int dst_stride_y, - int dst_stride_u, - int dst_stride_v, - int dst_width, - int dst_height, - LIBYUV_BOOL interpolate) { - return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, src_width, src_height, dst_y, dst_stride_y, - dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width, - dst_height, interpolate ? kFilterBox : kFilterNone); -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_any.cc b/thirdparty/libyuv/source/scale_any.cc deleted file mode 100644 index 965749c..0000000 --- a/thirdparty/libyuv/source/scale_any.cc +++ /dev/null @@ -1,1026 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include // For memset/memcpy - -#include "libyuv/scale.h" -#include "libyuv/scale_row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Fixed scale down. -// Mask may be non-power of 2, so use MOD -#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } - -// Fixed scale down for odd source width. Used by I420Blend subsampling. -// Since dst_width is (width + 1) / 2, this function scales one less pixel -// and copies the last pixel. 
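The SDANY pattern is easiest to see expanded. Taking the NEON 1/2 box instance as an example, the wrapper hands the largest multiple of (MASK + 1) destination pixels to the SIMD kernel and lets the C kernel finish the remainder, offset into the source by FACTOR * BPP bytes per destination pixel already produced. Roughly (mechanical expansion of the macro above, shown here only for illustration):

    /* Rough expansion of SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
     *                          ScaleRowDown2Box_C, 2, 1, 15): */
    void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst_ptr, int dst_width) {
      int r = (int)((unsigned int)dst_width % 16);  /* remainder for the C kernel */
      int n = dst_width - r;                        /* multiple of 16 for NEON   */
      if (n > 0) {
        ScaleRowDown2Box_NEON(src_ptr, src_stride, dst_ptr, n);
      }
      /* FACTOR = 2, BPP = 1: each output pixel consumed 2 source bytes. */
      ScaleRowDown2Box_C(src_ptr + n * 2, src_stride, dst_ptr + n, r);
    }

SDODD, defined next, is the same wrapper shifted by one: it scales dst_width - 1 pixels this way and handles the final pixel separately, which is what the odd-width I420Blend subsampling path needs.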
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ - int n = (dst_width - 1) - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r + 1); \ - } - -#ifdef HAS_SCALEROWDOWN2_SSSE3 -SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_SSSE3, - ScaleRowDown2Linear_SSSE3, - ScaleRowDown2Linear_C, - 2, - 1, - 15) -SDANY(ScaleRowDown2Box_Any_SSSE3, - ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_C, - 2, - 1, - 15) -SDODD(ScaleRowDown2Box_Odd_SSSE3, - ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 15) -#endif -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 -SDANY(ScaleUVRowDown2Box_Any_SSSE3, - ScaleUVRowDown2Box_SSSE3, - ScaleUVRowDown2Box_C, - 2, - 2, - 4) -#endif -#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 -SDANY(ScaleUVRowDown2Box_Any_AVX2, - ScaleUVRowDown2Box_AVX2, - ScaleUVRowDown2Box_C, - 2, - 2, - 8) -#endif -#ifdef HAS_SCALEROWDOWN2_AVX2 -SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) -SDANY(ScaleRowDown2Linear_Any_AVX2, - ScaleRowDown2Linear_AVX2, - ScaleRowDown2Linear_C, - 2, - 1, - 31) -SDANY(ScaleRowDown2Box_Any_AVX2, - ScaleRowDown2Box_AVX2, - ScaleRowDown2Box_C, - 2, - 1, - 31) -SDODD(ScaleRowDown2Box_Odd_AVX2, - ScaleRowDown2Box_AVX2, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 31) -#endif -#ifdef HAS_SCALEROWDOWN2_NEON -SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_NEON, - ScaleRowDown2Linear_NEON, - ScaleRowDown2Linear_C, - 2, - 1, - 15) -SDANY(ScaleRowDown2Box_Any_NEON, - ScaleRowDown2Box_NEON, - ScaleRowDown2Box_C, - 2, - 1, - 15) -SDODD(ScaleRowDown2Box_Odd_NEON, - ScaleRowDown2Box_NEON, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 15) -#endif -#ifdef HAS_SCALEUVROWDOWN2BOX_NEON -SDANY(ScaleUVRowDown2Box_Any_NEON, - ScaleUVRowDown2Box_NEON, - ScaleUVRowDown2Box_C, - 2, - 2, - 8) -#endif - -#ifdef HAS_SCALEROWDOWN2_MSA -SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) -SDANY(ScaleRowDown2Linear_Any_MSA, - ScaleRowDown2Linear_MSA, - ScaleRowDown2Linear_C, - 2, - 1, - 31) -SDANY(ScaleRowDown2Box_Any_MSA, - ScaleRowDown2Box_MSA, - ScaleRowDown2Box_C, - 2, - 1, - 31) -#endif -#ifdef HAS_SCALEROWDOWN2_MMI -SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7) -SDANY(ScaleRowDown2Linear_Any_MMI, - ScaleRowDown2Linear_MMI, - ScaleRowDown2Linear_C, - 2, - 1, - 7) -SDANY(ScaleRowDown2Box_Any_MMI, - ScaleRowDown2Box_MMI, - ScaleRowDown2Box_C, - 2, - 1, - 7) -SDODD(ScaleRowDown2Box_Odd_MMI, - ScaleRowDown2Box_MMI, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 7) -#endif -#ifdef HAS_SCALEROWDOWN4_SSSE3 -SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_SSSE3, - ScaleRowDown4Box_SSSE3, - ScaleRowDown4Box_C, - 4, - 1, - 7) -#endif -#ifdef HAS_SCALEROWDOWN4_AVX2 -SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) -SDANY(ScaleRowDown4Box_Any_AVX2, - ScaleRowDown4Box_AVX2, - ScaleRowDown4Box_C, - 4, - 1, - 15) -#endif -#ifdef HAS_SCALEROWDOWN4_NEON -SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_NEON, - ScaleRowDown4Box_NEON, - ScaleRowDown4Box_C, - 4, - 1, - 7) -#endif -#ifdef 
HAS_SCALEROWDOWN4_MSA -SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15) -SDANY(ScaleRowDown4Box_Any_MSA, - ScaleRowDown4Box_MSA, - ScaleRowDown4Box_C, - 4, - 1, - 15) -#endif -#ifdef HAS_SCALEROWDOWN4_MMI -SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_MMI, - ScaleRowDown4Box_MMI, - ScaleRowDown4Box_C, - 4, - 1, - 7) -#endif -#ifdef HAS_SCALEROWDOWN34_SSSE3 -SDANY(ScaleRowDown34_Any_SSSE3, - ScaleRowDown34_SSSE3, - ScaleRowDown34_C, - 4 / 3, - 1, - 23) -SDANY(ScaleRowDown34_0_Box_Any_SSSE3, - ScaleRowDown34_0_Box_SSSE3, - ScaleRowDown34_0_Box_C, - 4 / 3, - 1, - 23) -SDANY(ScaleRowDown34_1_Box_Any_SSSE3, - ScaleRowDown34_1_Box_SSSE3, - ScaleRowDown34_1_Box_C, - 4 / 3, - 1, - 23) -#endif -#ifdef HAS_SCALEROWDOWN34_NEON -SDANY(ScaleRowDown34_Any_NEON, - ScaleRowDown34_NEON, - ScaleRowDown34_C, - 4 / 3, - 1, - 23) -SDANY(ScaleRowDown34_0_Box_Any_NEON, - ScaleRowDown34_0_Box_NEON, - ScaleRowDown34_0_Box_C, - 4 / 3, - 1, - 23) -SDANY(ScaleRowDown34_1_Box_Any_NEON, - ScaleRowDown34_1_Box_NEON, - ScaleRowDown34_1_Box_C, - 4 / 3, - 1, - 23) -#endif -#ifdef HAS_SCALEROWDOWN34_MSA -SDANY(ScaleRowDown34_Any_MSA, - ScaleRowDown34_MSA, - ScaleRowDown34_C, - 4 / 3, - 1, - 47) -SDANY(ScaleRowDown34_0_Box_Any_MSA, - ScaleRowDown34_0_Box_MSA, - ScaleRowDown34_0_Box_C, - 4 / 3, - 1, - 47) -SDANY(ScaleRowDown34_1_Box_Any_MSA, - ScaleRowDown34_1_Box_MSA, - ScaleRowDown34_1_Box_C, - 4 / 3, - 1, - 47) -#endif -#ifdef HAS_SCALEROWDOWN34_MMI -SDANY(ScaleRowDown34_Any_MMI, - ScaleRowDown34_MMI, - ScaleRowDown34_C, - 4 / 3, - 1, - 23) -#endif -#ifdef HAS_SCALEROWDOWN38_SSSE3 -SDANY(ScaleRowDown38_Any_SSSE3, - ScaleRowDown38_SSSE3, - ScaleRowDown38_C, - 8 / 3, - 1, - 11) -SDANY(ScaleRowDown38_3_Box_Any_SSSE3, - ScaleRowDown38_3_Box_SSSE3, - ScaleRowDown38_3_Box_C, - 8 / 3, - 1, - 5) -SDANY(ScaleRowDown38_2_Box_Any_SSSE3, - ScaleRowDown38_2_Box_SSSE3, - ScaleRowDown38_2_Box_C, - 8 / 3, - 1, - 5) -#endif -#ifdef HAS_SCALEROWDOWN38_NEON -SDANY(ScaleRowDown38_Any_NEON, - ScaleRowDown38_NEON, - ScaleRowDown38_C, - 8 / 3, - 1, - 11) -SDANY(ScaleRowDown38_3_Box_Any_NEON, - ScaleRowDown38_3_Box_NEON, - ScaleRowDown38_3_Box_C, - 8 / 3, - 1, - 11) -SDANY(ScaleRowDown38_2_Box_Any_NEON, - ScaleRowDown38_2_Box_NEON, - ScaleRowDown38_2_Box_C, - 8 / 3, - 1, - 11) -#endif -#ifdef HAS_SCALEROWDOWN38_MSA -SDANY(ScaleRowDown38_Any_MSA, - ScaleRowDown38_MSA, - ScaleRowDown38_C, - 8 / 3, - 1, - 11) -SDANY(ScaleRowDown38_3_Box_Any_MSA, - ScaleRowDown38_3_Box_MSA, - ScaleRowDown38_3_Box_C, - 8 / 3, - 1, - 11) -SDANY(ScaleRowDown38_2_Box_Any_MSA, - ScaleRowDown38_2_Box_MSA, - ScaleRowDown38_2_Box_C, - 8 / 3, - 1, - 11) -#endif - -#ifdef HAS_SCALEARGBROWDOWN2_SSE2 -SDANY(ScaleARGBRowDown2_Any_SSE2, - ScaleARGBRowDown2_SSE2, - ScaleARGBRowDown2_C, - 2, - 4, - 3) -SDANY(ScaleARGBRowDown2Linear_Any_SSE2, - ScaleARGBRowDown2Linear_SSE2, - ScaleARGBRowDown2Linear_C, - 2, - 4, - 3) -SDANY(ScaleARGBRowDown2Box_Any_SSE2, - ScaleARGBRowDown2Box_SSE2, - ScaleARGBRowDown2Box_C, - 2, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBROWDOWN2_NEON -SDANY(ScaleARGBRowDown2_Any_NEON, - ScaleARGBRowDown2_NEON, - ScaleARGBRowDown2_C, - 2, - 4, - 7) -SDANY(ScaleARGBRowDown2Linear_Any_NEON, - ScaleARGBRowDown2Linear_NEON, - ScaleARGBRowDown2Linear_C, - 2, - 4, - 7) -SDANY(ScaleARGBRowDown2Box_Any_NEON, - ScaleARGBRowDown2Box_NEON, - ScaleARGBRowDown2Box_C, - 2, - 4, - 7) -#endif -#ifdef HAS_SCALEARGBROWDOWN2_MSA -SDANY(ScaleARGBRowDown2_Any_MSA, - ScaleARGBRowDown2_MSA, - 
ScaleARGBRowDown2_C, - 2, - 4, - 3) -SDANY(ScaleARGBRowDown2Linear_Any_MSA, - ScaleARGBRowDown2Linear_MSA, - ScaleARGBRowDown2Linear_C, - 2, - 4, - 3) -SDANY(ScaleARGBRowDown2Box_Any_MSA, - ScaleARGBRowDown2Box_MSA, - ScaleARGBRowDown2Box_C, - 2, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBROWDOWN2_MMI -SDANY(ScaleARGBRowDown2_Any_MMI, - ScaleARGBRowDown2_MMI, - ScaleARGBRowDown2_C, - 2, - 4, - 1) -SDANY(ScaleARGBRowDown2Linear_Any_MMI, - ScaleARGBRowDown2Linear_MMI, - ScaleARGBRowDown2Linear_C, - 2, - 4, - 1) -SDANY(ScaleARGBRowDown2Box_Any_MMI, - ScaleARGBRowDown2Box_MMI, - ScaleARGBRowDown2Box_C, - 2, - 4, - 1) -#endif -#undef SDANY - -// Scale down by even scale factor. -#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ - uint8_t* dst_ptr, int dst_width) { \ - int r = dst_width & MASK; \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ - dst_ptr + n * BPP, r); \ - } - -#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 -SDAANY(ScaleARGBRowDownEven_Any_SSE2, - ScaleARGBRowDownEven_SSE2, - ScaleARGBRowDownEven_C, - 4, - 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, - ScaleARGBRowDownEvenBox_SSE2, - ScaleARGBRowDownEvenBox_C, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON -SDAANY(ScaleARGBRowDownEven_Any_NEON, - ScaleARGBRowDownEven_NEON, - ScaleARGBRowDownEven_C, - 4, - 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, - ScaleARGBRowDownEvenBox_NEON, - ScaleARGBRowDownEvenBox_C, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA -SDAANY(ScaleARGBRowDownEven_Any_MSA, - ScaleARGBRowDownEven_MSA, - ScaleARGBRowDownEven_C, - 4, - 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, - ScaleARGBRowDownEvenBox_MSA, - ScaleARGBRowDownEvenBox_C, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI -SDAANY(ScaleARGBRowDownEven_Any_MMI, - ScaleARGBRowDownEven_MMI, - ScaleARGBRowDownEven_C, - 4, - 1) -SDAANY(ScaleARGBRowDownEvenBox_Any_MMI, - ScaleARGBRowDownEvenBox_MMI, - ScaleARGBRowDownEvenBox_C, - 4, - 1) -#endif -#ifdef HAS_SCALEUVROWDOWNEVEN_NEON -SDAANY(ScaleUVRowDownEven_Any_NEON, - ScaleUVRowDownEven_NEON, - ScaleUVRowDownEven_C, - 2, - 3) -#endif - -#ifdef SASIMDONLY -// This also works and uses memcpy and SIMD instead of C, but is slower on ARM - -// Add rows box filter scale down. 
Using macro from row_any -#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint16_t dst_temp[32]); \ - SIMD_ALIGNED(uint8_t src_temp[32]); \ - memset(dst_temp, 0, 32 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(src_temp, dst_temp, MASK + 1); \ - memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \ - } - -#ifdef HAS_SCALEADDROW_SSE2 -SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15) -#endif -#ifdef HAS_SCALEADDROW_AVX2 -SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31) -#endif -#ifdef HAS_SCALEADDROW_NEON -SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15) -#endif -#ifdef HAS_SCALEADDROW_MSA -SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15) -#endif -#ifdef HAS_SCALEADDROW_MMI -SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7) -#endif -#undef SAANY - -#else - -// Add rows box filter scale down. -#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ - int n = src_width & ~MASK; \ - if (n > 0) { \ - SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ - } \ - SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ - } - -#ifdef HAS_SCALEADDROW_SSE2 -SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) -#endif -#ifdef HAS_SCALEADDROW_AVX2 -SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) -#endif -#ifdef HAS_SCALEADDROW_NEON -SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) -#endif -#ifdef HAS_SCALEADDROW_MSA -SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) -#endif -#ifdef HAS_SCALEADDROW_MMI -SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7) -#endif -#undef SAANY - -#endif // SASIMDONLY - -// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols -#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ - int dx) { \ - int r = dst_width & MASK; \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ - } \ - TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ - } - -#ifdef HAS_SCALEFILTERCOLS_NEON -CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) -#endif -#ifdef HAS_SCALEFILTERCOLS_MSA -CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) -#endif -#ifdef HAS_SCALEARGBCOLS_NEON -CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) -#endif -#ifdef HAS_SCALEARGBCOLS_MSA -CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) -#endif -#ifdef HAS_SCALEARGBCOLS_MMI -CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0) -#endif -#ifdef HAS_SCALEARGBFILTERCOLS_NEON -CANY(ScaleARGBFilterCols_Any_NEON, - ScaleARGBFilterCols_NEON, - ScaleARGBFilterCols_C, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBFILTERCOLS_MSA -CANY(ScaleARGBFilterCols_Any_MSA, - ScaleARGBFilterCols_MSA, - ScaleARGBFilterCols_C, - 4, - 7) -#endif -#undef CANY - -// Scale up horizontally 2 times using linear filter. 
-#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - dst_ptr[0] = src_ptr[0]; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(src_ptr, dst_ptr + 1, n); \ - } \ - C(src_ptr + (n / 2), dst_ptr + n + 1, r); \ - } \ - dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \ - } - -// Even the C versions need to be wrapped, because boundary pixels have to -// be handled differently - -SUH2LANY(ScaleRowUp2_Linear_Any_C, - ScaleRowUp2_Linear_C, - ScaleRowUp2_Linear_C, - 0, - uint8_t) - -SUH2LANY(ScaleRowUp2_Linear_16_Any_C, - ScaleRowUp2_Linear_16_C, - ScaleRowUp2_Linear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEROWUP2LINEAR_SSE2 -SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, - ScaleRowUp2_Linear_SSE2, - ScaleRowUp2_Linear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 -SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, - ScaleRowUp2_Linear_SSSE3, - ScaleRowUp2_Linear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 -SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, - ScaleRowUp2_Linear_12_SSSE3, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 -SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, - ScaleRowUp2_Linear_16_SSE2, - ScaleRowUp2_Linear_16_C, - 7, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_AVX2 -SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, - ScaleRowUp2_Linear_AVX2, - ScaleRowUp2_Linear_C, - 31, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 -SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, - ScaleRowUp2_Linear_12_AVX2, - ScaleRowUp2_Linear_16_C, - 31, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 -SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, - ScaleRowUp2_Linear_16_AVX2, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_NEON -SUH2LANY(ScaleRowUp2_Linear_Any_NEON, - ScaleRowUp2_Linear_NEON, - ScaleRowUp2_Linear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_NEON -SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON, - ScaleRowUp2_Linear_12_NEON, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_NEON -SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON, - ScaleRowUp2_Linear_16_NEON, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) -#endif - -#undef SUH2LANY - -// Scale up 2 times using bilinear filter. -// This function produces 2 rows at a time. 
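A quick worked row makes the SUH2LANY edge handling concrete. Assuming the interior kernel blends neighbouring source pixels 3:1 with round-to-nearest (the same convention SU2BLANY below uses for its edge pixels), upscaling the 4-pixel row {10, 20, 30, 40} to width 8 gives:

    dst[0] = 10                          (copied: src[0])
    dst[1] = (3*10 + 20 + 2) >> 2 = 13
    dst[2] = (10 + 3*20 + 2) >> 2 = 18
    dst[3] = (3*20 + 30 + 2) >> 2 = 23
    dst[4] = (20 + 3*30 + 2) >> 2 = 28
    dst[5] = (3*30 + 40 + 2) >> 2 = 33
    dst[6] = (30 + 3*40 + 2) >> 2 = 38
    dst[7] = 40                          (copied: src[dst_width/2 - 1])

Only the six interior pixels go through the SIMD or C kernel; the two edge pixels are written by the wrapper itself, which is why even the plain C kernels need wrapping.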
-#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ - ptrdiff_t dst_stride, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - const PTYPE* sa = src_ptr; \ - const PTYPE* sb = src_ptr + src_stride; \ - PTYPE* da = dst_ptr; \ - PTYPE* db = dst_ptr + dst_stride; \ - da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ - db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(sa, sb - sa, da + 1, db - da, n); \ - } \ - C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \ - } \ - da[dst_width - 1] = \ - (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \ - db[dst_width - 1] = \ - (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \ - } - -SU2BLANY(ScaleRowUp2_Bilinear_Any_C, - ScaleRowUp2_Bilinear_C, - ScaleRowUp2_Bilinear_C, - 0, - uint8_t) - -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, - ScaleRowUp2_Bilinear_16_C, - ScaleRowUp2_Bilinear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEROWUP2BILINEAR_SSE2 -SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, - ScaleRowUp2_Bilinear_SSE2, - ScaleRowUp2_Bilinear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 -SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, - ScaleRowUp2_Bilinear_12_SSSE3, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3, - ScaleRowUp2_Bilinear_16_SSE2, - ScaleRowUp2_Bilinear_16_C, - 7, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 -SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, - ScaleRowUp2_Bilinear_SSSE3, - ScaleRowUp2_Bilinear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_AVX2 -SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, - ScaleRowUp2_Bilinear_AVX2, - ScaleRowUp2_Bilinear_C, - 31, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 -SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, - ScaleRowUp2_Bilinear_12_AVX2, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, - ScaleRowUp2_Bilinear_16_AVX2, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_NEON -SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON, - ScaleRowUp2_Bilinear_NEON, - ScaleRowUp2_Bilinear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_NEON -SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON, - ScaleRowUp2_Bilinear_12_NEON, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON, - ScaleRowUp2_Bilinear_16_NEON, - ScaleRowUp2_Bilinear_16_C, - 7, - uint16_t) -#endif - -#undef SU2BLANY - -// Scale bi-planar plane up horizontally 2 times using linear filter. 
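Because the 2x bilinear upsample is separable, the interior output of the SU2BLANY path is the horizontal 3:1 blend applied on top of the vertical 3:1 blend, so each destination pixel is approximately (9*A + 3*B + 3*C + D) / 16, where A is the nearest source sample, B and C are its horizontal and vertical neighbours, and D is the diagonal neighbour. The exact rounding of the fused kernels is an implementation detail of the row files not shown in this hunk; the edge rows and columns, visible in the wrapper above, fall back to the plain one-dimensional 3:1 blends. The bi-planar wrappers that follow apply the same arithmetic per channel.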
-#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - dst_ptr[0] = src_ptr[0]; \ - dst_ptr[1] = src_ptr[1]; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(src_ptr, dst_ptr + 2, n); \ - } \ - C(src_ptr + n, dst_ptr + 2 * n + 2, r); \ - } \ - dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \ - dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \ - } - -SBUH2LANY(ScaleUVRowUp2_Linear_Any_C, - ScaleUVRowUp2_Linear_C, - ScaleUVRowUp2_Linear_C, - 0, - uint8_t) - -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, - ScaleUVRowUp2_Linear_16_C, - ScaleUVRowUp2_Linear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 -SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, - ScaleUVRowUp2_Linear_SSSE3, - ScaleUVRowUp2_Linear_C, - 7, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 -SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, - ScaleUVRowUp2_Linear_AVX2, - ScaleUVRowUp2_Linear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE2, - ScaleUVRowUp2_Linear_16_SSE2, - ScaleUVRowUp2_Linear_16_C, - 3, - uint16_t) -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, - ScaleUVRowUp2_Linear_16_AVX2, - ScaleUVRowUp2_Linear_16_C, - 7, - uint16_t) -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_NEON -SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, - ScaleUVRowUp2_Linear_NEON, - ScaleUVRowUp2_Linear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON, - ScaleUVRowUp2_Linear_16_NEON, - ScaleUVRowUp2_Linear_16_C, - 15, - uint16_t) -#endif - -#undef SBUH2LANY - -// Scale bi-planar plane up 2 times using bilinear filter. -// This function produces 2 rows at a time. 
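The ((dst_width + 1) & ~1) - 2 indexing above is the one non-obvious part of the UV wrapper: dst_width counts UV pixels, each pixel occupies two interleaved elements, and an odd width is rounded up to even first so the last source U/V pair is addressed the same way in both cases. Widths below are invented.

/* Stand-alone illustration (not libyuv code) of the last-pair index arithmetic. */
#include <stdio.h>

int main(void) {
  int widths[] = {9, 10};
  for (int i = 0; i < 2; ++i) {
    int dst_width = widths[i];
    int last_u = ((dst_width + 1) & ~1) - 2;   /* element index of the last source U */
    int last_v = ((dst_width + 1) & ~1) - 1;   /* element index of the last source V */
    printf("dst_width=%d -> last source pair at src[%d], src[%d]\n",
           dst_width, last_u, last_v);
  }
  return 0;
}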
-#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ - ptrdiff_t dst_stride, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - const PTYPE* sa = src_ptr; \ - const PTYPE* sb = src_ptr + src_stride; \ - PTYPE* da = dst_ptr; \ - PTYPE* db = dst_ptr + dst_stride; \ - da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ - db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ - da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \ - db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(sa, sb - sa, da + 2, db - da, n); \ - } \ - C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \ - } \ - da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \ - sb[((dst_width + 1) & ~1) - 2] + 2) >> \ - 2; \ - db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \ - 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \ - 2; \ - da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \ - sb[((dst_width + 1) & ~1) - 1] + 2) >> \ - 2; \ - db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \ - 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \ - 2; \ - } - -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C, - ScaleUVRowUp2_Bilinear_C, - ScaleUVRowUp2_Bilinear_C, - 0, - uint8_t) - -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C, - ScaleUVRowUp2_Bilinear_16_C, - ScaleUVRowUp2_Bilinear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, - ScaleUVRowUp2_Bilinear_SSSE3, - ScaleUVRowUp2_Bilinear_C, - 7, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, - ScaleUVRowUp2_Bilinear_AVX2, - ScaleUVRowUp2_Bilinear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE2, - ScaleUVRowUp2_Bilinear_16_SSE2, - ScaleUVRowUp2_Bilinear_16_C, - 7, - uint16_t) -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2, - ScaleUVRowUp2_Bilinear_16_AVX2, - ScaleUVRowUp2_Bilinear_16_C, - 7, - uint16_t) -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON, - ScaleUVRowUp2_Bilinear_NEON, - ScaleUVRowUp2_Bilinear_C, - 7, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON, - ScaleUVRowUp2_Bilinear_16_NEON, - ScaleUVRowUp2_Bilinear_16_C, - 7, - uint16_t) -#endif - -#undef SBU2BLANY - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_argb.cc b/thirdparty/libyuv/source/scale_argb.cc deleted file mode 100644 index 451d4ec..0000000 --- a/thirdparty/libyuv/source/scale_argb.cc +++ /dev/null @@ -1,1091 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyARGB -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -static __inline int Abs(int v) { - return v >= 0 ? 
v : -v; -} - -// ScaleARGB ARGB, 1/2 -// This is an optimized version for scaling down a ARGB to 1/2 of -// its original size. -static void ScaleARGBDown2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8_t* dst_argb, int dst_width) = - filtering == kFilterNone - ? ScaleARGBRowDown2_C - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C - : ScaleARGBRowDown2Box_C); - (void)src_width; - (void)src_height; - (void)dx; - assert(dx == 65536 * 2); // Test scale factor of 2. - assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. - // Advance to odd row, even column. - if (filtering == kFilterBilinear) { - src_argb += (y >> 16) * src_stride + (x >> 16) * 4; - } else { - src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4; - } - -#if defined(HAS_SCALEARGBROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_Any_SSE2 - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 - : ScaleARGBRowDown2Box_Any_SSE2); - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_SSE2 - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 - : ScaleARGBRowDown2Box_SSE2); - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_Any_NEON - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON - : ScaleARGBRowDown2Box_Any_NEON); - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_NEON - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON - : ScaleARGBRowDown2Box_NEON); - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_Any_MMI - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI - : ScaleARGBRowDown2Box_Any_MMI); - if (IS_ALIGNED(dst_width, 2)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_MMI - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI - : ScaleARGBRowDown2Box_MMI); - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWN2_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_Any_MSA - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA - : ScaleARGBRowDown2Box_Any_MSA); - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_MSA - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA - : ScaleARGBRowDown2Box_MSA); - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (j = 0; j < dst_height; ++j) { - ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width); - src_argb += row_stride; - dst_argb += dst_stride; - } -} - -// ScaleARGB ARGB, 1/4 -// This is an optimized version for scaling down a ARGB to 1/4 of -// its original size. 
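The fast path above leans on the 16.16 fixed-point convention used throughout this file: a clean 1/2 downscale means dx and dy are exactly 2.0 (0x20000), which is what the asserts check, and row_stride then skips two source rows per output row. The stride below is invented.

/* Stand-alone illustration (not libyuv code) of the ScaleARGBDown2 step checks. */
#include <assert.h>
#include <stdio.h>

int main(void) {
  int dx = 65536 * 2;                     /* horizontal step: 2.0 in 16.16 fixed point */
  int dy = 65536 * 2;                     /* vertical step: 2.0 in 16.16 fixed point   */
  int src_stride = 1024;                  /* made-up ARGB stride in bytes              */
  assert(dx == 65536 * 2);                /* scale factor of exactly 2                 */
  assert((dy & 0x1ffff) == 0);            /* vertical scale is a multiple of 2         */
  printf("dx=0x%x dy=0x%x: skip %d source rows (%d bytes) per output row\n",
         (unsigned)dx, (unsigned)dy, dy >> 16, src_stride * (dy >> 16));
  return 0;
}

The 1/4 box path below builds its 4x4 average out of the same 2x2 box kernel: two pairs of source rows are boxed into a half-width row pair, which is then boxed once more into the destination row.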
-static void ScaleARGBDown4Box(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy) { - int j; - // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); - int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8_t* dst_argb, int dst_width) = - ScaleARGBRowDown2Box_C; - // Advance to odd row, even column. - src_argb += (y >> 16) * src_stride + (x >> 16) * 4; - (void)src_width; - (void)src_height; - (void)dx; - assert(dx == 65536 * 4); // Test scale factor of 4. - assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. -#if defined(HAS_SCALEARGBROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; - } - } -#endif - - for (j = 0; j < dst_height; ++j) { - ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); - ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, - dst_width * 2); - ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); - src_argb += row_stride; - dst_argb += dst_stride; - } - free_aligned_buffer_64(row); -} - -// ScaleARGB ARGB Even -// This is an optimized version for scaling down a ARGB to even -// multiple of its original size. -static void ScaleARGBDownEven(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - int col_step = dx >> 16; - int row_stride = (dy >> 16) * src_stride; - void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, - int src_step, uint8_t* dst_argb, int dst_width) = - filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; - (void)src_width; - (void)src_height; - assert(IS_ALIGNED(src_width, 2)); - assert(IS_ALIGNED(src_height, 2)); - src_argb += (y >> 16) * src_stride + (x >> 16) * 4; -#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 - : ScaleARGBRowDownEven_Any_SSE2; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = - filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON - : ScaleARGBRowDownEven_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = - filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI - : ScaleARGBRowDownEven_Any_MMI; - if (IS_ALIGNED(dst_width, 2)) { - ScaleARGBRowDownEven = - filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBRowDownEven = filtering ? 
ScaleARGBRowDownEvenBox_Any_MSA - : ScaleARGBRowDownEven_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = - filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (j = 0; j < dst_height; ++j) { - ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); - src_argb += row_stride; - dst_argb += dst_stride; - } -} - -// Scale ARGB down with bilinear interpolation. -static void ScaleARGBBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, - int dst_width, int x, int dx) = - (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; - int64_t xlast = x + (int64_t)(dst_width - 1) * dx; - int64_t xl = (dx >= 0) ? x : xlast; - int64_t xr = (dx >= 0) ? xlast : x; - int clip_src_width; - xl = (xl >> 16) & ~3; // Left edge aligned. - xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. - xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. - if (xr > src_width) { - xr = src_width; - } - clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. - src_argb += xl * 4; - x -= (int)(xl << 16); -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(clip_src_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(clip_src_width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; - } - } -#endif - // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. - // Allocate a row of ARGB. 
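Before the row buffer is allocated below, the clip arithmetic a few lines up is worth making concrete: only the slice of the source row actually touched by the destination span gets interpolated, padded out to 4-pixel boundaries. The x, dx and dst_width values here are invented (the real ones come from ScaleSlope and from any clip rectangle).

/* Stand-alone illustration (not libyuv code) of the ScaleARGBBilinearDown clip. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int src_width = 640;
  int dst_width = 50;                      /* e.g. only half of a 100-wide output    */
  int dx = (640 << 16) / 100;              /* step of the full 640 -> 100 downscale  */
  int x = 50 * dx;                         /* clip region starts at output column 50 */
  int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
  int64_t xl = (dx >= 0) ? x : xlast;      /* leftmost source position touched       */
  int64_t xr = (dx >= 0) ? xlast : x;      /* rightmost source position touched      */
  xl = (xl >> 16) & ~3;                    /* align the left edge down to 4 pixels   */
  xr = (xr >> 16) + 1;                     /* bilinear also reads the next pixel     */
  xr = (xr + 1 + 3) & ~3;                  /* round up past a 4-pixel boundary       */
  if (xr > src_width) xr = src_width;
  printf("read source pixels [%lld, %lld): %lld pixels, %lld bytes of ARGB\n",
         (long long)xl, (long long)xr, (long long)(xr - xl),
         (long long)(xr - xl) * 4);
  return 0;
}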
- { - align_buffer_64(row, clip_src_width * 4); - - const int max_y = (src_height - 1) << 16; - if (y > max_y) { - y = max_y; - } - for (j = 0; j < dst_height; ++j) { - int yi = y >> 16; - const uint8_t* src = src_argb + yi * src_stride; - if (filtering == kFilterLinear) { - ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(row, src, src_stride, clip_src_width, yf); - ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); - } - dst_argb += dst_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - } - free_aligned_buffer_64(row); - } -} - -// Scale ARGB up with bilinear interpolation. -static void ScaleARGBBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, - int dst_width, int x, int dx) = - filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; - const int max_y = (src_height - 1) << 16; -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(dst_width, 2)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - if (src_width >= 32768) { - ScaleARGBFilterCols = - filtering ? 
ScaleARGBFilterCols64_C : ScaleARGBCols64_C; - } -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_NEON) - if (filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_MSA) - if (filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_SSE2) - if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBCols_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLS_NEON) - if (!filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MMI) - if (!filtering && TestCpuFlag(kCpuHasMMI)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleARGBFilterCols = ScaleARGBCols_MMI; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MSA) - if (!filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBCols_MSA; - } - } -#endif - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleARGBFilterCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; - } -#endif - } - - if (y > max_y) { - y = max_y; - } - - { - int yi = y >> 16; - const uint8_t* src = src_argb + yi * src_stride; - - // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); - - uint8_t* rowptr = row; - int rowstride = kRowSize; - int lasty = yi; - - ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); - if (src_height > 1) { - src += src_stride; - } - ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - src = src_argb + yi * src_stride; - } - if (yi != lasty) { - ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - src += src_stride; - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); - } - dst_argb += dst_stride; - y += dy; - } - free_aligned_buffer_64(row); - } -} - -#ifdef YUVSCALEUP -// Scale YUV to ARGB up with bilinear interpolation. 
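A note on the row handling in ScaleARGBBilinearUp above (the YUV variant below reuses it): two horizontally scaled rows live in one allocation, and negating the stride after each refill makes rowptr ping-pong between the two halves without any copying. Buffer sizes here are invented.

/* Stand-alone illustration (not libyuv code) of the two-row ping-pong. */
#include <stdio.h>

int main(void) {
  char buf[2][16];                        /* stands in for the two aligned row slots */
  char* rowptr = buf[0];
  int rowstride = (int)sizeof(buf[0]);
  for (int row = 0; row < 4; ++row) {
    printf("source row %d lands in slot %d\n", row, rowptr == buf[0] ? 0 : 1);
    rowptr += rowstride;                  /* move to the other slot ...              */
    rowstride = -rowstride;               /* ... and flip direction for next time    */
  }
  return 0;
}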
-static void ScaleYUVToARGBBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride_y, - int src_stride_u, - int src_stride_v, - int dst_stride_argb, - const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, int width) = - I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(src_width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(src_width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(src_width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGBRow = I422ToARGBRow_Any_MMI; - if (IS_ALIGNED(src_width, 4)) { - I422ToARGBRow = I422ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(src_width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif - - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, - int dst_width, int x, int dx) = - filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; - if (src_width >= 32768) { - ScaleARGBFilterCols = - filtering ? 
ScaleARGBFilterCols64_C : ScaleARGBCols64_C; - } -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_NEON) - if (filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_MSA) - if (filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_SSE2) - if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBCols_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLS_NEON) - if (!filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MMI) - if (!filtering && TestCpuFlag(kCpuHasMMI)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleARGBFilterCols = ScaleARGBCols_MMI; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MSA) - if (!filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBCols_MSA; - } - } -#endif - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleARGBFilterCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; - } -#endif - } - - const int max_y = (src_height - 1) << 16; - if (y > max_y) { - y = max_y; - } - const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. - int yi = y >> 16; - int uv_yi = yi >> kYShift; - const uint8_t* src_row_y = src_y + yi * src_stride_y; - const uint8_t* src_row_u = src_u + uv_yi * src_stride_u; - const uint8_t* src_row_v = src_v + uv_yi * src_stride_v; - - // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); - - // Allocate 1 row of ARGB for source conversion. - align_buffer_64(argb_row, src_width * 4); - - uint8_t* rowptr = row; - int rowstride = kRowSize; - int lasty = yi; - - // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. - ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx); - if (src_height > 1) { - src_row_y += src_stride_y; - if (yi & 1) { - src_row_u += src_stride_u; - src_row_v += src_stride_v; - } - } - ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx); - if (src_height > 2) { - src_row_y += src_stride_y; - if (!(yi & 1)) { - src_row_u += src_stride_u; - src_row_v += src_stride_v; - } - } - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - uv_yi = yi >> kYShift; - src_row_y = src_y + yi * src_stride_y; - src_row_u = src_u + uv_yi * src_stride_u; - src_row_v = src_v + uv_yi * src_stride_v; - } - if (yi != lasty) { - // TODO(fbarchard): Convert the clipped region of row. 
- I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width); - ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - src_row_y += src_stride_y; - if (yi & 1) { - src_row_u += src_stride_u; - src_row_v += src_stride_v; - } - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); - } - dst_argb += dst_stride_argb; - y += dy; - } - free_aligned_buffer_64(row); - free_aligned_buffer_64(row_argb); -} -#endif - -// Scale ARGB to/from any dimensions, without interpolation. -// Fixed point math is used for performance: The upper 16 bits -// of x and dx is the integer part of the source position and -// the lower 16 bits are the fixed decimal part. - -static void ScaleARGBSimple(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy) { - int j; - void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, - int dst_width, int x, int dx) = - (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; - (void)src_height; -#if defined(HAS_SCALEARGBCOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBCols = ScaleARGBCols_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBCols = ScaleARGBCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBCols = ScaleARGBCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleARGBCols = ScaleARGBCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleARGBCols = ScaleARGBCols_MMI; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBCols = ScaleARGBCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBCols = ScaleARGBCols_MSA; - } - } -#endif - if (src_width * 2 == dst_width && x < 0x8000) { - ScaleARGBCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBCols = ScaleARGBColsUp2_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBCols = ScaleARGBColsUp2_MMI; - } -#endif - } - - for (j = 0; j < dst_height; ++j) { - ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x, - dx); - dst_argb += dst_stride; - y += dy; - } -} - -// ScaleARGB a ARGB. -// This function in turn calls a scaling function -// suitable for handling the desired resolutions. -static void ScaleARGB(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering) { - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - // ARGB does not support box filter yet, but allow the user to pass it. - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative src_height means invert the image. 
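The flip requested by a negative src_height, handled by the lines that follow, amounts to pointing at the last source row and walking upward with a negated stride. A stand-alone sketch with an invented row table:

/* Stand-alone illustration (not libyuv code) of inversion via a negative stride. */
#include <stdio.h>

int main(void) {
  const char* rows[3] = {"row0", "row1", "row2"};
  int src_height = -3;                    /* caller asked for a vertical flip   */
  int stride = 1;                         /* one table entry per row            */
  int first = 0;
  if (src_height < 0) {
    src_height = -src_height;
    first = (src_height - 1) * stride;    /* start at the bottom row            */
    stride = -stride;                     /* and step upward                    */
  }
  for (int j = 0; j < src_height; ++j) {
    printf("%s\n", rows[first + j * stride]);
  }
  return 0;
}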
- if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * src_stride; - src_stride = -src_stride; - } - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - if (clip_x) { - int64_t clipf = (int64_t)(clip_x)*dx; - x += (clipf & 0xffff); - src += (clipf >> 16) * 4; - dst += clip_x * 4; - } - if (clip_y) { - int64_t clipf = (int64_t)(clip_y)*dy; - y += (clipf & 0xffff); - src += (clipf >> 16) * src_stride; - dst += clip_y * dst_stride; - } - - // Special case for integer step values. - if (((dx | dy) & 0xffff) == 0) { - if (!dx || !dy) { // 1 pixel wide and/or tall. - filtering = kFilterNone; - } else { - // Optimized even scale down. ie 2, 4, 6, 8, 10x. - if (!(dx & 0x10000) && !(dy & 0x10000)) { - if (dx == 0x20000) { - // Optimized 1/2 downsample. - ScaleARGBDown2(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } - if (dx == 0x40000 && filtering == kFilterBox) { - // Optimized 1/4 box downsample. - ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy); - return; - } - ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } - // Optimized odd scale down. ie 3, 5, 7, 9x. - if ((dx & 0x10000) && (dy & 0x10000)) { - filtering = kFilterNone; - if (dx == 0x10000 && dy == 0x10000) { - // Straight copy. - ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride, - dst, dst_stride, clip_width, clip_height); - return; - } - } - } - } - if (dx == 0x10000 && (x & 0xffff) == 0) { - // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, - dst_stride, src, dst, x, y, dy, 4, filtering); - return; - } - if (filtering && dy < 65536) { - ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } - if (filtering) { - ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } - ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride, - dst_stride, src, dst, x, dx, y, dy); -} - -LIBYUV_API -int ARGBScaleClip(const uint8_t* src_argb, - int src_stride_argb, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering) { - if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || - dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || - clip_width > 32768 || clip_height > 32768 || - (clip_x + clip_width) > dst_width || - (clip_y + clip_height) > dst_height) { - return -1; - } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, - dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, - clip_height, filtering); - return 0; -} - -// Scale an ARGB image. 
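A minimal usage sketch for the entry point defined just below. It assumes the usual libyuv include layout (libyuv/scale_argb.h declaring ARGBScale and the FilterMode enum) and a linked libyuv; only the call shape and the argument conventions visible in this file are illustrated.

/* Hypothetical caller of ARGBScale; header name and build setup are assumptions. */
#include <stdint.h>
#include <stdlib.h>
#include "libyuv/scale_argb.h"

int main(void) {
  int src_w = 64, src_h = 48, dst_w = 32, dst_h = 24;
  uint8_t* src = (uint8_t*)calloc((size_t)src_w * src_h, 4);  /* ARGB: 4 bytes/pixel */
  uint8_t* dst = (uint8_t*)calloc((size_t)dst_w * dst_h, 4);
  /* Strides are in bytes; a negative src_height would request a vertical flip. */
  int r = ARGBScale(src, src_w * 4, src_w, src_h,
                    dst, dst_w * 4, dst_w, dst_h, kFilterBilinear);
  free(src);
  free(dst);
  return r;                               /* 0 on success, -1 on bad arguments */
}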
-LIBYUV_API -int ARGBScale(const uint8_t* src_argb, - int src_stride_argb, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 || - src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { - return -1; - } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, - dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height, - filtering); - return 0; -} - -// Scale with YUV conversion to ARGB and clipping. -LIBYUV_API -int YUVToARGBScaleClip(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint32_t src_fourcc, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - uint32_t dst_fourcc, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering) { - uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); - int r; - (void)src_fourcc; // TODO(fbarchard): implement and/or assert. - (void)dst_fourcc; - I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, - argb_buffer, src_width * 4, src_width, src_height); - - r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, - dst_stride_argb, dst_width, dst_height, clip_x, clip_y, - clip_width, clip_height, filtering); - free(argb_buffer); - return r; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_common.cc b/thirdparty/libyuv/source/scale_common.cc deleted file mode 100644 index da96d42..0000000 --- a/thirdparty/libyuv/source/scale_common.cc +++ /dev/null @@ -1,1769 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyARGB -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -static __inline int Abs(int v) { - return v >= 0 ? 
v : -v; -} - -// CPU agnostic row functions -void ScaleRowDown2_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[1]; - dst[1] = src_ptr[3]; - dst += 2; - src_ptr += 4; - } - if (dst_width & 1) { - dst[0] = src_ptr[1]; - } -} - -void ScaleRowDown2_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[1]; - dst[1] = src_ptr[3]; - dst += 2; - src_ptr += 4; - } - if (dst_width & 1) { - dst[0] = src_ptr[1]; - } -} - -void ScaleRowDown2Linear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + 1) >> 1; - dst[1] = (s[2] + s[3] + 1) >> 1; - dst += 2; - s += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + 1) >> 1; - } -} - -void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* s = src_ptr; - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + 1) >> 1; - dst[1] = (s[2] + s[3] + 1) >> 1; - dst += 2; - s += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + 1) >> 1; - } -} - -void ScaleRowDown2Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - } -} - -void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - dst_width -= 1; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst += 1; - s += 2; - t += 2; - } - dst[0] = (s[0] + t[0] + 1) >> 1; -} - -void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - } -} - -void ScaleRowDown4_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[2]; - dst[1] = src_ptr[6]; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = src_ptr[2]; - } -} - -void ScaleRowDown4_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[2]; - dst[1] = src_ptr[6]; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = src_ptr[2]; - } -} - -void ScaleRowDown4Box_C(const uint8_t* src_ptr, - ptrdiff_t 
src_stride, - uint8_t* dst, - int dst_width) { - intptr_t stride = src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + - src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + - src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + - src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + - src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + - src_ptr[stride * 3 + 7] + 8) >> - 4; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - } -} - -void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - intptr_t stride = src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + - src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + - src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + - src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + - src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + - src_ptr[stride * 3 + 7] + 8) >> - 4; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - } -} - -void ScaleRowDown34_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[1]; - dst[2] = src_ptr[3]; - dst += 3; - src_ptr += 4; - } -} - -void ScaleRowDown34_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - int x; - (void)src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[1]; - dst[2] = src_ptr[3]; - dst += 3; - src_ptr += 4; - } -} - -// Filter rows 0 and 1 together, 3 : 1 -void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - 
int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 * 3 + b0 + 2) >> 2; - d[1] = (a1 * 3 + b1 + 2) >> 2; - d[2] = (a2 * 3 + b2 + 2) >> 2; - d += 3; - s += 4; - t += 4; - } -} - -void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* d, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 * 3 + b0 + 2) >> 2; - d[1] = (a1 * 3 + b1 + 2) >> 2; - d[2] = (a2 * 3 + b2 + 2) >> 2; - d += 3; - s += 4; - t += 4; - } -} - -// Filter rows 1 and 2 together, 1 : 1 -void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 + b0 + 1) >> 1; - d[1] = (a1 + b1 + 1) >> 1; - d[2] = (a2 + b2 + 1) >> 1; - d += 3; - s += 4; - t += 4; - } -} - -void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* d, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 + b0 + 1) >> 1; - d[1] = (a1 + b1 + 1) >> 1; - d[2] = (a2 + b2 + 1) >> 1; - d += 3; - s += 4; - t += 4; - } -} - -// Sample position: (O is src sample position, X is dst sample position) -// -// v dst_ptr at here v stop at here -// X O X X O X X O X X O X X O X -// ^ src_ptr at here -void ScaleRowUp2_Linear_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; - dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; - } -} - -// Sample position: (O is src sample position, X is dst sample position) -// -// src_ptr at here -// X v X X X X X X X X X -// O O O O O -// X X X X X X X X X X -// ^ dst_ptr at here ^ stop at here -// X X X X X X X X X X -// O O O O O -// X X X 
X X X X X X X -void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - uint8_t* d = dst_ptr; - uint8_t* e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[2 * x + 0] = - (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; - d[2 * x + 1] = - (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 0] = - (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 1] = - (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; - } -} - -// Only suitable for at most 14 bit range. -void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; - dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; - } -} - -// Only suitable for at most 12bit range. -void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - uint16_t* d = dst_ptr; - uint16_t* e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[2 * x + 0] = - (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; - d[2 * x + 1] = - (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 0] = - (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 1] = - (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; - } -} - -// Scales a single row of pixels using point sampling. -void ScaleCols_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[0] = src_ptr[x >> 16]; - x += dx; - dst_ptr[1] = src_ptr[x >> 16]; - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[x >> 16]; - } -} - -void ScaleCols_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[0] = src_ptr[x >> 16]; - x += dx; - dst_ptr[1] = src_ptr[x >> 16]; - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[x >> 16]; - } -} - -// Scales a single row of pixels up by 2x using point sampling. 
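The range limits noted above ("at most 14 bit" for the 16-bit linear kernel, "at most 12bit" for the 16-bit bilinear kernel) follow from the filter weights: the weighted sums have to fit in 16 bits, which is presumably what the SIMD counterparts of these C routines keep their intermediates in. A quick worst-case check:

/* Stand-alone illustration (not libyuv code) of the 14-bit / 12-bit limits. */
#include <stdio.h>

int main(void) {
  unsigned max14 = (1u << 14) - 1;        /* 16383: largest 14-bit sample            */
  unsigned max12 = (1u << 12) - 1;        /* 4095: largest 12-bit sample             */
  unsigned linear_worst = (3 + 1) * max14 + 2;            /* weights 3:1, +2 rounding */
  unsigned bilinear_worst = (9 + 3 + 3 + 1) * max12 + 8;  /* weights 9:3:3:1, +8      */
  printf("linear worst case   = %u (fits in 16 bits: %s)\n", linear_worst,
         linear_worst <= 65535u ? "yes" : "no");
  printf("bilinear worst case = %u (fits in 16 bits: %s)\n", bilinear_worst,
         bilinear_worst <= 65535u ? "yes" : "no");
  return 0;
}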
-void ScaleColsUp2_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - (void)x; - (void)dx; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[1] = dst_ptr[0] = src_ptr[0]; - src_ptr += 1; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[0]; - } -} - -void ScaleColsUp2_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - (void)x; - (void)dx; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[1] = dst_ptr[0] = src_ptr[0]; - src_ptr += 1; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[0]; - } -} - -// (1-f)a + fb can be replaced with a + f(b-a) -#if defined(__arm__) || defined(__aarch64__) -#define BLENDER(a, b, f) \ - (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -#else -// Intel uses 7 bit math with rounding. -#define BLENDER(a, b, f) \ - (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) -#endif - -void ScaleFilterCols_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} - -void ScaleFilterCols64_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} -#undef BLENDER - -// Same as 8 bit arm blender but return is cast to uint16_t -#define BLENDER(a, b, f) \ - (uint16_t)( \ - (int)(a) + \ - (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16)) - -void ScaleFilterCols_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} - -void ScaleFilterCols64_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} -#undef BLENDER - -void 
ScaleRowDown38_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - assert(dst_width % 3 == 0); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[3]; - dst[2] = src_ptr[6]; - dst += 3; - src_ptr += 8; - } -} - -void ScaleRowDown38_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - int x; - (void)src_stride; - assert(dst_width % 3 == 0); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[3]; - dst[2] = src_ptr[6]; - dst += 3; - src_ptr += 8; - } -} - -// 8x3 -> 3x1 -void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> - 16; - dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> - 16; - dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -// 8x2 -> 3x1 -void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2]) * - (65536 / 6) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5]) * - (65536 / 6) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2]) * - 
(65536 / 6) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5]) * - (65536 / 6) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - int x; - assert(src_width > 0); - for (x = 0; x < src_width - 1; x += 2) { - dst_ptr[0] += src_ptr[0]; - dst_ptr[1] += src_ptr[1]; - src_ptr += 2; - dst_ptr += 2; - } - if (src_width & 1) { - dst_ptr[0] += src_ptr[0]; - } -} - -void ScaleAddRow_16_C(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width) { - int x; - assert(src_width > 0); - for (x = 0; x < src_width - 1; x += 2) { - dst_ptr[0] += src_ptr[0]; - dst_ptr[1] += src_ptr[1]; - src_ptr += 2; - dst_ptr += 2; - } - if (src_width & 1) { - dst_ptr[0] += src_ptr[0]; - } -} - -// ARGB scale row functions - -void ScaleARGBRowDown2_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[1]; - dst[1] = src[3]; - src += 4; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[1]; - } -} - -void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; - dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; - dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; - dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; - src_argb += 8; - dst_argb += 4; - } -} - -void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + - src_argb[src_stride + 4] + 2) >> - 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + - src_argb[src_stride + 5] + 2) >> - 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + - src_argb[src_stride + 6] + 2) >> - 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + - src_argb[src_stride + 7] + 2) >> - 2; - src_argb += 8; - dst_argb += 4; - } -} - -void ScaleARGBRowDownEven_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - (void)src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[0]; - dst[1] = src[src_stepx]; - src += src_stepx * 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + - src_argb[src_stride + 4] + 2) >> - 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + - src_argb[src_stride + 5] + 2) >> - 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + - src_argb[src_stride + 6] + 2) >> - 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + - src_argb[src_stride + 7] + 2) >> - 2; - src_argb += 
src_stepx * 4; - dst_argb += 4; - } -} - -// Scales a single row of pixels using point sampling. -void ScaleARGBCols_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[x >> 16]; - } -} - -void ScaleARGBCols64_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[x >> 16]; - } -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - (void)x; - (void)dx; - for (j = 0; j < dst_width - 1; j += 2) { - dst[1] = dst[0] = src[0]; - src += 1; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. -// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 -#define BLENDERC(a, b, f, s) \ - (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) \ - BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \ - BLENDERC(a, b, f, 0) - -void ScaleARGBFilterCols_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32_t a = src[xi]; - uint32_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32_t a = src[xi]; - uint32_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} - -void ScaleARGBFilterCols64_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int64_t xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32_t a = src[xi]; - uint32_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int64_t xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32_t a = src[xi]; - uint32_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} -#undef BLENDER1 -#undef BLENDERC -#undef BLENDER - -// UV scale row functions -// same as ARGB but 2 channels - -void ScaleUVRowDown2_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - 
dst[0] = src[1]; - dst[1] = src[3]; - src += 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[1]; - } -} - -void ScaleUVRowDown2Linear_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1; - dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1; - src_uv += 4; - dst_uv += 2; - } -} - -void ScaleUVRowDown2Box_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + - src_uv[src_stride + 2] + 2) >> - 2; - dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + - src_uv[src_stride + 3] + 2) >> - 2; - src_uv += 4; - dst_uv += 2; - } -} - -void ScaleUVRowDownEven_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - (void)src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[0]; - dst[1] = src[src_stepx]; - src += src_stepx * 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + - src_uv[src_stride + 2] + 2) >> - 2; - dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + - src_uv[src_stride + 3] + 2) >> - 2; - src_uv += src_stepx * 2; - dst_uv += 2; - } -} - -void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[4 * x + 0] = - (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; - dst_ptr[4 * x + 1] = - (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; - dst_ptr[4 * x + 2] = - (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; - dst_ptr[4 * x + 3] = - (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; - } -} - -void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - uint8_t* d = dst_ptr; - uint8_t* e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 1 + 8) >> - 4; - d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 1 + 8) >> - 4; - d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + - t[2 * x + 2] * 3 + 8) >> - 4; - d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + - t[2 * x + 2] * 3 + 8) >> - 4; - e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 9 + 8) >> - 4; - e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 9 + 8) >> - 4; - } -} - -void 
ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[4 * x + 0] = - (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; - dst_ptr[4 * x + 1] = - (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; - dst_ptr[4 * x + 2] = - (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; - dst_ptr[4 * x + 3] = - (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; - } -} - -void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - uint16_t* d = dst_ptr; - uint16_t* e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 1 + 8) >> - 4; - d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 1 + 8) >> - 4; - d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + - t[2 * x + 2] * 3 + 8) >> - 4; - d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + - t[2 * x + 2] * 3 + 8) >> - 4; - e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 9 + 8) >> - 4; - e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 9 + 8) >> - 4; - } -} - -// Scales a single row of pixels using point sampling. -void ScaleUVCols_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x, - int dx) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[x >> 16]; - } -} - -void ScaleUVCols64_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[x >> 16]; - } -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleUVColsUp2_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x, - int dx) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int j; - (void)x; - (void)dx; - for (j = 0; j < dst_width - 1; j += 2) { - dst[1] = dst[0] = src[0]; - src += 1; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. 
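// Illustrative sketch, not from the libyuv sources: the BLENDER macros below
// mimic the SSSE3 blend using a 7-bit fraction taken from a 16.16 fixed-point
// position. The integer part (x >> 16) picks the left sample, bits 9..15 give
// f in 0..127, and 0x7f ^ f equals 127 - f (the TODO above asks for 128 - f).
static uint8_t ScaleBlendSketch(uint8_t a, uint8_t b, int x /* 16.16 */) {
  int f = (x >> 9) & 0x7f;                         // 7-bit fraction of position
  return (uint8_t)((a * (127 - f) + b * f) >> 7);  // blend a toward b
}
// e.g. f == 64 (half way between the two samples) yields roughly (a + b) / 2.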
-// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 -#define BLENDERC(a, b, f, s) \ - (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) - -void ScaleUVFilterCols_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x, - int dx) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} - -void ScaleUVFilterCols64_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int64_t xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int64_t xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} -#undef BLENDER1 -#undef BLENDERC -#undef BLENDER - -// Scale plane vertically with bilinear interpolation. -void ScalePlaneVertical(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int y, - int dy, - int bpp, - enum FilterMode filtering) { - // TODO(fbarchard): Allow higher bpp. - int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; - int j; - assert(bpp >= 1 && bpp <= 4); - assert(src_height != 0); - assert(dst_width > 0); - assert(dst_height > 0); - src_argb += (x >> 16) * bpp; -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width_bytes, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(dst_width_bytes, 8)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(dst_width_bytes, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - for (j = 0; j < dst_height; ++j) { - int yi; - int yf; - if (y > max_y) { - y = max_y; - } - yi = y >> 16; - yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, - dst_width_bytes, yf); - dst_argb += dst_stride; - y += dy; - } -} -void ScalePlaneVertical_16(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_argb, - uint16_t* dst_argb, - int x, - int y, - int dy, - int wpp, - enum FilterMode filtering) { - // TODO(fbarchard): Allow higher wpp. - int dst_width_words = dst_width * wpp; - void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_16_C; - const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; - int j; - assert(wpp >= 1 && wpp <= 2); - assert(src_height != 0); - assert(dst_width > 0); - assert(dst_height > 0); - src_argb += (x >> 16) * wpp; -#if defined(HAS_INTERPOLATEROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_16_SSE2; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_16_SSSE3; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_16_AVX2; - if (IS_ALIGNED(dst_width_bytes, 32)) { - InterpolateRow = InterpolateRow_16_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_16_NEON; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_16_NEON; - } - } -#endif - for (j = 0; j < dst_height; ++j) { - int yi; - int yf; - if (y > max_y) { - y = max_y; - } - yi = y >> 16; - yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, - dst_width_words, yf); - dst_argb += dst_stride; - y += dy; - } -} - -// Simplify the filtering based on scale factors. 
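// Illustrative sketch, not from the libyuv sources: the ScalePlaneVertical
// loops above walk the source rows in 16.16 fixed point. The integer part
// selects the source row, the next 8 bits become the blend fraction handed to
// InterpolateRow, and y is clamped to max_y so the row after the last one is
// never addressed. A scalar equivalent for one destination row with filtering
// enabled (rounding omitted):
static void ScaleVerticalRowSketch(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int width,
                                   int y /* 16.16 */, int max_y) {
  int yi, yf, i;
  if (y > max_y) {
    y = max_y;  // clamp so yi + 1 stays inside the image
  }
  yi = y >> 16;         // source row index
  yf = (y >> 8) & 255;  // 8-bit fraction between rows yi and yi + 1
  for (i = 0; i < width; ++i) {
    int a = src[yi * src_stride + i];
    int b = yf ? src[(yi + 1) * src_stride + i] : a;  // yf == 0: plain copy
    dst[i] = (uint8_t)((a * (256 - yf) + b * yf) >> 8);
  }
}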
-enum FilterMode ScaleFilterReduce(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (src_width < 0) { - src_width = -src_width; - } - if (src_height < 0) { - src_height = -src_height; - } - if (filtering == kFilterBox) { - // If scaling either axis to 0.5 or larger, switch from Box to Bilinear. - if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) { - filtering = kFilterBilinear; - } - } - if (filtering == kFilterBilinear) { - if (src_height == 1) { - filtering = kFilterLinear; - } - // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. - if (dst_height == src_height || dst_height * 3 == src_height) { - filtering = kFilterLinear; - } - // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to - // avoid reading 2 pixels horizontally that causes memory exception. - if (src_width == 1) { - filtering = kFilterNone; - } - } - if (filtering == kFilterLinear) { - if (src_width == 1) { - filtering = kFilterNone; - } - // TODO(fbarchard): Detect any odd scale factor and reduce to None. - if (dst_width == src_width || dst_width * 3 == src_width) { - filtering = kFilterNone; - } - } - return filtering; -} - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_C(int num, int div) { - return (int)(((int64_t)(num) << 16) / div); -} - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv1_C(int num, int div) { - return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1)); -} - -#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) - -// Compute slope values for stepping. -void ScaleSlope(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering, - int* x, - int* y, - int* dx, - int* dy) { - assert(x != NULL); - assert(y != NULL); - assert(dx != NULL); - assert(dy != NULL); - assert(src_width != 0); - assert(src_height != 0); - assert(dst_width > 0); - assert(dst_height > 0); - // Check for 1 pixel and avoid FixedDiv overflow. - if (dst_width == 1 && src_width >= 32768) { - dst_width = src_width; - } - if (dst_height == 1 && src_height >= 32768) { - dst_height = src_height; - } - if (filtering == kFilterBox) { - // Scale step for point sampling duplicates all pixels equally. - *dx = FixedDiv(Abs(src_width), dst_width); - *dy = FixedDiv(src_height, dst_height); - *x = 0; - *y = 0; - } else if (filtering == kFilterBilinear) { - // Scale step for bilinear sampling renders last pixel once for upsample. - if (dst_width <= Abs(src_width)) { - *dx = FixedDiv(Abs(src_width), dst_width); - *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. - } else if (dst_width > 1) { - *dx = FixedDiv1(Abs(src_width), dst_width); - *x = 0; - } - if (dst_height <= src_height) { - *dy = FixedDiv(src_height, dst_height); - *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. - } else if (dst_height > 1) { - *dy = FixedDiv1(src_height, dst_height); - *y = 0; - } - } else if (filtering == kFilterLinear) { - // Scale step for bilinear sampling renders last pixel once for upsample. - if (dst_width <= Abs(src_width)) { - *dx = FixedDiv(Abs(src_width), dst_width); - *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. - } else if (dst_width > 1) { - *dx = FixedDiv1(Abs(src_width), dst_width); - *x = 0; - } - *dy = FixedDiv(src_height, dst_height); - *y = *dy >> 1; - } else { - // Scale step for point sampling duplicates all pixels equally. 
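// Worked example (illustrative only, not from the libyuv sources): FixedDiv
// returns a 16.16 step, e.g. FixedDiv(640, 480) == (640 << 16) / 480 ==
// 0x00015555, about 1.3333 source pixels per destination pixel. For the point
// sampling case here, CENTERSTART(dx, 0) == dx >> 1 starts half a step into
// the source, so each destination pixel samples the centre of the span of
// source pixels it represents.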
- *dx = FixedDiv(Abs(src_width), dst_width); - *dy = FixedDiv(src_height, dst_height); - *x = CENTERSTART(*dx, 0); - *y = CENTERSTART(*dy, 0); - } - // Negative src_width means horizontally mirror. - if (src_width < 0) { - *x += (dst_width - 1) * *dx; - *dx = -*dx; - // src_width = -src_width; // Caller must do this. - } -} -#undef CENTERSTART - -// Read 8x2 upsample with filtering and write 16x1. -// actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src2 = src_ptr + src_stride; - - int x; - for (x = 0; x < dst_width - 1; x += 2) { - uint16_t p0 = src_ptr[0]; - uint16_t p1 = src_ptr[1]; - uint16_t p2 = src2[0]; - uint16_t p3 = src2[1]; - dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; - dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; - ++src_ptr; - ++src2; - dst += 2; - } - if (dst_width & 1) { - uint16_t p0 = src_ptr[0]; - uint16_t p1 = src_ptr[1]; - uint16_t p2 = src2[0]; - uint16_t p3 = src2[1]; - dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_gcc.cc b/thirdparty/libyuv/source/scale_gcc.cc deleted file mode 100644 index ebc6deb..0000000 --- a/thirdparty/libyuv/source/scale_gcc.cc +++ /dev/null @@ -1,2948 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) - -// Offsets for source bytes 0 to 9 -static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 0 to 10 -static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, - 8, 9, 9, 10, 10, 11, 12, 13}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
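// Worked example (illustrative only, not from the libyuv sources): the
// kScaleAc33 and kScaleAb2 tables below, like the C box filters in
// scale_common.cc, average N samples without a divide: multiplying the sum by
// 65536 / N and shifting right by 16 is a 16.16 multiply by 1/N. For a 3x3
// block of nine pixels all equal to 200, (1800 * 7281) >> 16 == 199, within
// one unit of the exact mean.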
-static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, - 10, 11, 12, 13, 13, 14, 14, 15}; - -// Coefficients for source bytes 0 to 10 -static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; - -// Coefficients for source bytes 10 to 21 -static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; - -// Coefficients for source bytes 21 to 31 -static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; - -// Coefficients for source bytes 21 to 31 -static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; - -static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, - 6, 8, 11, 14, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 0,1,2 -static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 3,4,5 -static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, - 6, 7, 12, 13, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x3 and 2x3 -static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, - 65536 / 9, 65536 / 6, 0, 0}; - -// Arrange first value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, - 11, 128, 14, 128, 128, 128, 128, 128}; - -// Arrange second value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, - 12, 128, 15, 128, 128, 128, 128, 128}; - -// Arrange third value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, - 13, 128, 128, 128, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x2 and 2x2 -static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, - 65536 / 3, 65536 / 2, 0, 0}; - -// GCC versions of row functions are verbatim conversions from Visual C. -// Generated using gcc disassembly on Visual C object file: -// objdump -D yuvscaler.obj >yuvscaler.txt - -void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - // 16 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#ifdef HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown2_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile(LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", 
"xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SCALEROWDOWN2_AVX2 - -void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} - -void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - intptr_t stridex3; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea 0x00(%4,%4,2),%3 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%4,2),%%xmm2 \n" - "movdqu 0x10(%0,%4,2),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#ifdef HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown4_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 
- "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} - -void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(src_stride * 3)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_SCALEROWDOWN4_AVX2 - -void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa 
%1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : 
"m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); -} - -void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqu 0x00(%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, - 10, 11, 8, 9, 14, 15, 12, 13}; - -static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, - 3, 1, 1, 3, 3, 1, 1, 3}; - -#ifdef HAS_SCALEROWUP2LINEAR_SSE2 -void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" // 0 - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $1,%%xmm6 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw 
%%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm6,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" - "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) - "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) - - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm6,%%xmm1 \n" - "paddw %%xmm3,%%xmm3 \n" - "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_SSE2 -void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - LABELALIGN - "1: \n" - "pxor %%xmm0,%%xmm0 \n" // 0 - // above line - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" // near+far - "movdqa %%xmm3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" // 2*near - "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) - - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - // below line - "movq (%0,%3),%%xmm6 \n" // 01234567 - "movq 1(%0,%3),%%xmm2 \n" // 12345678 - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - - "movdqa %%xmm6,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) - "paddw %%xmm7,%%xmm5 \n" // near+far - "movdqa %%xmm3,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) - "paddw %%xmm7,%%xmm7 \n" // 2*near - "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) - - "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm6,%%xmm2 \n" // near+far - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) - - // xmm4 xmm1 - // xmm5 xmm2 - "pcmpeqw %%xmm0,%%xmm0 \n" - "psrlw $15,%%xmm0 \n" - "psllw $3,%%xmm0 \n" // all 8 - - "movdqa %%xmm4,%%xmm3 \n" - "movdqa %%xmm5,%%xmm6 \n" - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) - "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - - "movdqa %%xmm1,%%xmm7 \n" - "movdqa %%xmm2,%%xmm6 \n" - "paddw %%xmm7,%%xmm7 \n" 
// 6*near+2*far (1, hi) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) - "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm7 \n" // ^ div by 16 - - "packuswb %%xmm7,%%xmm3 \n" - "movdqu %%xmm3,(%1) \n" // save above line - - "movdqa %%xmm5,%%xmm3 \n" - "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) - "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 - - "movdqa %%xmm2,%%xmm3 \n" - "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) - "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) - "psrlw $4,%%xmm2 \n" // ^ div by 16 - - "packuswb %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // save below line - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %3,%%xmm5 \n" - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) - - "paddw %%xmm4,%%xmm1 \n" // far+2 - "paddw %%xmm4,%%xmm3 \n" // far+2 - "paddw %%xmm0,%%xmm1 \n" // near+far+2 - "paddw %%xmm2,%%xmm3 \n" // near+far+2 - "paddw %%xmm0,%%xmm0 \n" // 2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) - - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,16(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearShuffleFar) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" - "psllw $3,%%xmm7 \n" // all 8 - "movdqa %5,%%xmm6 \n" - - LABELALIGN - "1: \n" - // above line - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) - "paddw %%xmm0,%%xmm1 \n" // near+far - "paddw %%xmm2,%%xmm3 \n" // near+far - "paddw %%xmm0,%%xmm0 \n" // 2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw 
%%xmm1,%%xmm0 \n" // 3*near+far (1, lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) - - // below line - "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) - "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) - "movdqa %%xmm1,%%xmm3 \n" - "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) - "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) - "movdqa %%xmm3,%%xmm5 \n" - "movdqa %%xmm1,%%xmm4 \n" - "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) - "paddw %%xmm1,%%xmm4 \n" // near+far - "paddw %%xmm3,%%xmm5 \n" // near+far - "paddw %%xmm1,%%xmm1 \n" // 2*near - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) - "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,(%1) \n" - - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,0x10(%1) \n" - - "movdqa %%xmm1,%%xmm4 \n" - "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) - "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm1 \n" // ^ div by 16 - "movdqu %%xmm1,(%1,%4,2) \n" - - "movdqa %%xmm3,%%xmm4 \n" - "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - "movdqu %%xmm3,0x10(%1,%4,2) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 -void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packssdw %%xmm1,%%xmm0 \n" - "pshufd $0b11011000,%%xmm0,%%xmm0 \n" 
- "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - "paddd %%xmm0,%%xmm2 \n" // near+far (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 2(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) - "paddd %%xmm2,%%xmm4 \n" // near+far (lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" 
// 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packssdw %%xmm0,%%xmm4 \n" - "pshufd $0b11011000,%%xmm4,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packssdw %%xmm2,%%xmm5 \n" - "pshufd $0b11011000,%%xmm4,%%xmm4 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 -void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) - "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "vpackuswb %%xmm2,%%xmm0,%%xmm0 \n" - "vmovdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 1(%0,%3),%%xmm4 \n" - "punpcklwd %%xmm1,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm4 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm4,%%xmm3 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) 
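// Note (illustrative, not from the libyuv sources): the 9/3/3/1 weights used
// throughout these upsamplers implement 2x bilinear interpolation at
// quarter-pixel phase; each output is
// (9*near + 3*far_horizontal + 3*far_vertical + 1*far_diagonal + 8) >> 4,
// i.e. the (3/4, 1/4) linear filter applied in both directions with rounding.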
- "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_AVX2 -void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - "vbroadcastf128 %3,%%ymm3 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF - "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) - "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_AVX2 -void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrlw $15,%%ymm6,%%ymm6 \n" - "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 - "vbroadcastf128 %5,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF - "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) - "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) - - "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF - "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" - "vpermq $0b11011000,%%ymm3,%%ymm3 \n" - "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n" - 
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n" - "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) - "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) - - // ymm0 ymm1 - // ymm2 ymm3 - - "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 -void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "vbroadcastf128 %3,%%ymm5 \n" - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) - "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) - - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0 - - "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near) - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2 - "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2 - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2 - "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2 - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2 - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2 - - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far - "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm2,32(%1) \n" - - "lea 0x20(%0),%0 \n" - "lea 0x40(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : 
"m"(kLinearShuffleFar) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 -void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1) - - "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) - "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2) - - "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) - "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 - "vmovdqu %%ymm0,(%1) \n" // store above - - "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2) - "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1) - "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 - "vmovdqu %%ymm0,(%1,%4,2) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 -void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $31,%%ymm4,%%ymm4 \n" - "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) - - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - - "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) - - "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) - "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) - - "vpsrld $2,%%ymm0,%%ymm0 \n" 
// 3/4*near+1/4*far (lo) - "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" - "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 -void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrld $31,%%ymm6,%%ymm6 \n" - "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) - - "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) - "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) - "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) - "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) - "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) - "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) - "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) - - "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" - "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" - "vpshufd $0b11011000,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4,2) \n" // store below - - "lea 
0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -// Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" - - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#ifdef HAS_SCALEADDROW_AVX2 -// Reads 32 bytes and accumulates to 32 shorts at a time. -void ScaleAddRow_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm3 \n" - "lea 0x20(%0),%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw (%1),%%ymm2,%%ymm0 \n" - "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SCALEADDROW_AVX2 - -// Constant for making pixels signed to avoid pmaddubsw -// saturation. -static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Constant for making pixels unsigned and adding .5 for rounding. -static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, - 0x4040, 0x4040, 0x4040, 0x4040}; - -// Bilinear column filtering. SSSE3 version. -void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - intptr_t x0, x1, temp_pixel; - asm volatile( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 - - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movzwl 0x00(%1,%4,1),%k2 \n" - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + - // 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. 
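// Editorial note (illustration only, not part of the libyuv sources being removed):
// this loop is the bilinear column filter.  x is a 16.16 fixed-point source
// position; "psrlw $0x9" keeps a 7-bit fraction f, and kFsub80/kFadd40 exist only
// because pmaddubsw would saturate on unsigned pixels: subtracting 0x80 makes the
// pixels signed, and since the two weights sum to 128 the bias costs exactly
// 128*128 = 16384, which kFadd40 (0x4040 = 16384 + 64) restores while also adding
// the 0.5 rounding term before the >>7.  A scalar sketch of the intended result
// (hypothetical helper name):
#include <stdint.h>
static uint8_t FilterColPixel(const uint8_t* src, int x /* 16.16 */) {
  int xi = x >> 16;         // integer source column
  int f = (x >> 9) & 0x7f;  // 7-bit fraction, as produced by psrlw $0x9
  // Weights (128 - f) and f sum to 128; +64 rounds before the >>7.
  return (uint8_t)((src[xi] * (128 - f) + src[xi + 1] * f + 64) >> 7);
}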
- "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2,(%0) \n" - "lea 0x2(%0),%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2,(%0) \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "=&a"(temp_pixel), // %2 - "=&r"(x0), // %3 - "=&r"(x1), // %4 -#if defined(__x86_64__) - "+rm"(dst_width) // %5 -#else - "+m"(dst_width) // %5 -#endif - : "rm"(x), // %6 - "rm"(dx), // %7 -#if defined(__x86_64__) - "x"(kFsub80), // %8 - "x"(kFadd40) // %9 -#else - "m"(kFsub80), // %8 - "m"(kFadd40) // %9 -#endif - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - (void)x; - (void)dx; - asm volatile(LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} - -// Reads 4 
pixels at a time. -// Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - intptr_t src_stepx_x4 = (intptr_t)(src_stepx); - intptr_t src_stepx_x12; - (void)src_stride; - asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - - LABELALIGN - "1: \n" - "movd (%0),%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%0,%1,2),%%xmm2 \n" - "movd 0x00(%0,%4,1),%%xmm3 \n" - "lea 0x00(%0,%1,4),%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "=&r"(src_stepx_x12) // %4 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} - -// Blends four 2x2 to 4x1. -// Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - intptr_t src_stepx_x4 = (intptr_t)(src_stepx); - intptr_t src_stepx_x12; - intptr_t row1 = (intptr_t)(src_stride); - asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - "lea 0x00(%0,%5,1),%5 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movhps 0x00(%0,%1,1),%%xmm0 \n" - "movq 0x00(%0,%1,2),%%xmm1 \n" - "movhps 0x00(%0,%4,1),%%xmm1 \n" - "lea 0x00(%0,%1,4),%0 \n" - "movq (%5),%%xmm2 \n" - "movhps 0x00(%5,%1,1),%%xmm2 \n" - "movq 0x00(%5,%1,2),%%xmm3 \n" - "movhps 0x00(%5,%4,1),%%xmm3 \n" - "lea 0x00(%5,%1,4),%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "=&r"(src_stepx_x12), // %4 - "+r"(row1) // %5 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} - -void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - intptr_t x0, x1; - asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" - - LABELALIGN - "40: \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%3,%0,4),%%xmm1 \n" - "movd 0x00(%3,%1,4),%%xmm4 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%2) \n" - "lea 0x8(%2),%2 \n" - "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "99: \n" - : "=&a"(x0), // %0 - 
"=&d"(x1), // %1 - "+r"(dst_argb), // %2 - "+r"(src_argb), // %3 - "+r"(dst_width) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - (void)x; - (void)dx; - asm volatile(LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -// Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static const uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel -}; - -// Shuffle table for duplicating 2 fractions into 8 bytes each -static const uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, -}; - -// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version -void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - intptr_t x0, x1; - asm volatile( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" - : - : "m"(kShuffleColARGB), // %0 - "m"(kShuffleFractions) // %1 - ); - - asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movhps 0x00(%1,%4,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%0) \n" - - LABELALIGN - "99: \n" // clang-format error. - - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "=&r"(x0), // %3 - "=&r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_X86(int num, int div) { - asm volatile( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx"); - return num; -} - -// Divide num - 1 by div - 1 and return as 16.16 fixed point result. 
-int FixedDiv1_X86(int num, int div) { - asm volatile( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "sub $0x10001,%%eax \n" - "sbb $0x0,%%edx \n" - "sub $0x1,%1 \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx"); - return num; -} - -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 -// Shuffle table for splitting UV into upper and lower part of register. -static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, - 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; -static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, - 6u, 14u, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80}; - -void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5, %%xmm5 \n" // zero - "movdqa %4,%%xmm1 \n" // split shuffler - "movdqa %5,%%xmm3 \n" // merge shuffler - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 8 UV row 0 - "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 - "lea 0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv - "pshufb %%xmm1,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add - "pmaddubsw %%xmm4,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" // vertical add - "psrlw $0x1,%%xmm0 \n" // round - "pavgw %%xmm5,%%xmm0 \n" - "pshufb %%xmm3,%%xmm0 \n" // merge uv - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" // 4 UV - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 - -#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 -void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero - "vbroadcastf128 %4,%%ymm1 \n" // split shuffler - "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 - "lea 0x20(%0),%0 \n" - "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv - "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv - "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" // 8 UV - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_SCALEUVROWDOWN2BOX_AVX2 - -static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, - 3, 1, 3, 1, 1, 3, 1, 3}; - -#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - - LABELALIGN - "1: \n" 
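// Editorial note (illustration only, not part of the libyuv sources being removed):
// the ScaleUVRowUp2 routines below apply the same (3*near + far + 2) >> 2 and
// 9/3/3/1 kernels as the planar routines above, but every sample is an interleaved
// U,V byte pair, hence the loads offset by 2 bytes and the kUVLinearMadd31 layout
// that keeps each 3/1 weighted pair inside a single channel.  Per-channel scalar
// equivalent, with illustrative variable names:
//   u_out = (uint8_t)((3 * u_near + u_far + 2) >> 2);
//   v_out = (uint8_t)((3 * v_near + v_far + 2) >> 2);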
- "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 2(%0,%3),%%xmm4 \n" - "punpcklbw %%xmm4,%%xmm1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm1,%%xmm3 \n" - "punpckldq %%xmm1,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), 
// %4 - "m"(kUVLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 - -void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - "vbroadcastf128 %3,%%ymm3 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" - "vmovdqu 2(%0),%%xmm1 \n" - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) - "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 uv to 16 uv - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 -void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrlw $15,%%ymm6,%%ymm6 \n" - "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 - "vbroadcastf128 %5,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" - "vmovdqu 2(%0),%%xmm1 \n" - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) - "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) - - "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF - "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" - "vpermq $0b11011000,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" - "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n" - "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) - "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) - - // ymm0 ymm1 - // ymm2 ymm3 - - "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddw 
%%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 uv to 16 uv - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), // %4 - "m"(kUVLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 -void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) - "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packusdw %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 -void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 4(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) - "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) - "paddd %%xmm2,%%xmm4 \n" // near+far 
(2, lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packusdw %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packusdw %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 -void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $31,%%ymm4,%%ymm4 \n" - "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - - "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) - - "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) - "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) - - "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 -void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - 
ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrld $31,%%ymm6,%%ymm6 \n" - "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi) - - "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) - "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) - "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) - "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) - "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) - "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi) - - "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4,2) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#endif // defined(__x86_64__) || defined(__i386__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_mmi.cc b/thirdparty/libyuv/source/scale_mmi.cc deleted file mode 100644 index 1226ef3..0000000 --- a/thirdparty/libyuv/source/scale_mmi.cc +++ /dev/null @@ -1,1168 +0,0 @@ -/* - * Copyright 2013 The 
LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyARGB -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -// CPU agnostic row functions -void ScaleRowDown2_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlh %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest0, dest1; - - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "and %[dest0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "and %[dest1], %[src1], %[mask] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - - "psrlh %[src0], %[src0], %[shift] \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - "packushb %[dest1], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - - uint64_t s0, s1, t0, t1; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift0 = 0x2ULL; - const uint64_t shift1 = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlh %[s1], %[s0], 
%[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest0], %[s0], %[s1] \n\t" - "paddh %[dest0], %[dest0], %[t0] \n\t" - "paddh %[dest0], %[dest0], %[t1] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift0] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "psrlh %[s1], %[s0], %[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest1], %[s0], %[s1] \n\t" - "paddh %[dest1], %[dest1], %[t0] \n\t" - "paddh %[dest1], %[dest1], %[t1] \n\t" - "paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift0] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x08(%[src_ptr]) \n\t" - "punpcklwd %[dest_lo], %[src0], %[src1] \n\t" - "lwc1 %[src0], 0x04(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "punpcklwd %[dest_hi], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int 
dst_width) { - const uint8_t* s = src_argb; - const uint8_t* t = src_argb + src_stride; - - uint64_t s0, s_hi, s_lo; - uint64_t t0, t_hi, t_lo; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shfit = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t" - - "paddh %[dest_lo], %[dest_lo], %[ph] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t" - - "paddh %[dest_hi], %[dest_hi], %[ph] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), - [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit) - : "memory"); -} - -void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x10ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - - "packsswh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpcklhw %[dest_lo], %[src0], %[src1] \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - - "punpcklhw 
%[src0], %[dest_lo], %[dest_hi] \n\t" - "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t" - - "pavgh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - - uint64_t s0, s1, s_hi, s_lo; - uint64_t t0, t1, t_hi, t_lo; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0000000200000002ULL; - const uint64_t mask = 0x0000ffff0000ffffULL; - const uint64_t shift0 = 0x10ULL; - const uint64_t shift1 = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddw %[dest0], %[s0], %[s1] \n\t" - "paddw %[dest0], %[dest0], %[t0] \n\t" - "paddw %[dest0], %[dest0], %[t1] \n\t" - "paddw %[dest0], %[dest0], %[ph] \n\t" - "psrlw %[dest0], %[dest0], %[shift1] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddw %[dest1], %[s0], %[s1] \n\t" - "paddw %[dest1], %[dest1], %[t0] \n\t" - "paddw %[dest1], %[dest1], %[t1] \n\t" - "paddw %[dest1], %[dest1], %[ph] \n\t" - "psrlw %[dest1], %[dest1], %[shift1] \n\t" - - "packsswh %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi), - [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t shift = 0x10ULL; - const uint64_t mask = 0x000000ff000000ffULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_lo], %[src0], %[src1] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "psrlw 
%[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_hi], %[src0], %[src1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift), [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhhw %[dest_lo], %[src0], %[src1] \n\t" - "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [mask] "f"(mask) - : "memory"); -} - -#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \ - "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \ - "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \ - "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \ - "paddh " #reg ", " #reg ", %[ph] \n\t" \ - "psrlh " #reg ", " #reg ", %[shift] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* LibYUVScaleTest.ScaleDownBy4_Box */ -void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* src0_ptr = src_ptr; - const uint8_t* src1_ptr = src_ptr 
+ src_stride; - const uint8_t* src2_ptr = src_ptr + src_stride * 2; - const uint8_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - - const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x0001000100010001ULL; - const uint64_t ph = 0x0008000800080008ULL; - const uint64_t shift = 0x4ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest3]) - - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - "packsswh %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : "memory"); -} - -#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \ - "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \ - "paddw %[dest], %[dest_hi], %[dest] \n\t" \ - "paddw %[dest], %[dest], %[ph] \n\t" \ - "psraw %[dest], %[dest], %[shift] \n\t" \ - "and " #reg ", %[dest], %[mask1] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* LibYUVScaleTest.ScaleDownBy4_Box_16 */ -void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src0_ptr = src_ptr; - const uint16_t* src1_ptr = src_ptr + src_stride; - const uint16_t* src2_ptr = src_ptr + src_stride * 2; - const uint16_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - - const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x00000000ffffffffULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 0x04ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3]) - "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t" - "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 
0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : "memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleColsUp2_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - - "punpcklhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - uint64_t src, src_hi, src_lo, dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "paddush %[dest0], %[dest0], %[src_lo] \n\t" - "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddush %[dest1], %[dest1], %[src_hi] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - -void ScaleAddRow_16_MMI(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width) { - uint64_t src, src_hi, src_lo, 
dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklhw %[src_lo], %[src], %[mask] \n\t" - "punpckhhw %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "paddw %[dest0], %[dest0], %[src_lo] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - - "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddw %[dest1], %[dest1], %[src_hi] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - -void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "lwc1 %[src1], 0x00(%[src_ptr]) \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), - [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - const uint8_t* src0_ptr = src_argb; - const uint8_t* src1_ptr = src_argb + src_stride; - - uint64_t src0, src1, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shift = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift] \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], 
%[src_hi] \n\t" - "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask), - [ph] "f"(ph) - : "memory"); -} - -// Scales a single row of pixels using point sampling. -void ScaleARGBCols_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - const uint32_t* src_tmp; - - uint64_t dest, offset; - - const uint64_t shift0 = 16; - const uint64_t shift1 = 2; - - __asm__ volatile( - "1: \n\t" - "srav %[offset], %[x], %[shift0] \n\t" - "sllv %[offset], %[offset], %[shift1] \n\t" - "dadd %[src_tmp], %[src_ptr], %[offset] \n\t" - "lwc1 %[dest], 0x00(%[src_tmp]) \n\t" - "swc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[x], %[x], %[dx] \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1) - : "memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - uint64_t src, dest0, dest1; - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklwd %[dest0], %[src], %[src] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "punpckhwd %[dest1], %[src], %[src] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -// Divide num by div and return as 16.16 fixed point result. -/* LibYUVBaseTest.TestFixedDiv */ -int FixedDiv_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [shift] "r"(shift)); - - return quotient; -} - -// Divide num by div and return as 16.16 fixed point result. 
-/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */ -int FixedDiv1_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - const int val1 = 1; - const int64_t val11 = 0x00010001ULL; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "dsub %[num], %[num], %[val11] \n\t" - "dsub %[div], %[div], %[val1] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11), - [shift] "r"(shift)); - - return quotient; -} - -// Read 8x2 upsample with filtering and write 16x1. -// actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src2_ptr = src_ptr + src_stride; - - uint64_t src0, src1; - uint64_t dest, dest04, dest15, dest26, dest37; - uint64_t tmp0, tmp1, tmp2, tmp3; - - const uint64_t mask0 = 0x0003000900030009ULL; - const uint64_t mask1 = 0x0001000300010003ULL; - const uint64_t mask2 = 0x0009000300090003ULL; - const uint64_t mask3 = 0x0003000100030001ULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 4; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t" - "pmaddhw %[dest04], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest04], %[dest04], %[dest] \n\t" - "paddw %[dest04], %[dest04], %[ph] \n\t" - "psrlw %[dest04], %[dest04], %[shift] \n\t" - - "pmaddhw %[dest15], %[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest15], %[dest15], %[dest] \n\t" - "paddw %[dest15], %[dest15], %[ph] \n\t" - "psrlw %[dest15], %[dest15], %[shift] \n\t" - - "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t" - "pmaddhw %[dest26], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest26], %[dest26], %[dest] \n\t" - "paddw %[dest26], %[dest26], %[ph] \n\t" - "psrlw %[dest26], %[dest26], %[shift] \n\t" - - "pmaddhw %[dest37], %[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest37], %[dest37], %[dest] \n\t" - "paddw %[dest37], %[dest37], %[ph] \n\t" - "psrlw %[dest37], %[dest37], %[shift] \n\t" - - /* tmp0 = ( 00 04 02 06 ) */ - "packsswh %[tmp0], %[dest04], %[dest26] \n\t" - /* tmp1 = ( 01 05 03 07 ) */ - "packsswh %[tmp1], %[dest15], %[dest37] \n\t" - - /* tmp2 = ( 00 01 04 05 )*/ - "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t" - /* tmp3 = ( 02 03 06 07 )*/ - "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t" - - /* ( 00 01 02 03 ) */ - "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - /* ( 04 05 06 07 ) */ - "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04), - [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37), - [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [dest] "=&f"(dest) - : [src1_ptr] "r"(src_ptr), 
[src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph) - : "memory"); -} - -void ScaleRowDown34_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - uint64_t src[2]; - uint64_t tmp[2]; - __asm__ volatile ( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "and %[tmp1], %[src0], %[mask1] \n\t" - "psrlw %[tmp0], %[src0], %[rmov] \n\t" - "psllw %[tmp0], %[tmp0], %[lmov1] \n\t" - "or %[src0], %[tmp0], %[tmp1] \n\t" - "punpckhwd %[tmp0], %[src0], %[src0] \n\t" - "psllw %[tmp1], %[tmp0], %[rmov] \n\t" - "or %[src0], %[src0], %[tmp1] \n\t" - "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t" - "pextrh %[tmp0], %[tmp0], %[zero] \n\t" - "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t" - "pextrh %[tmp0], %[src1], %[zero] \n\t" - "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t" - - "punpckhwd %[tmp0], %[src1], %[src1] \n\t" - "pextrh %[tmp1], %[tmp0], %[zero] \n\t" - "psrlw %[src1], %[src1], %[rmov] \n\t" - "psllw %[tmp1], %[tmp1], %[rmov8] \n\t" - "or %[src1], %[src1], %[tmp1] \n\t" - "and %[tmp0], %[tmp0], %[mask2] \n\t" - "or %[src1], %[src1], %[tmp0] \n\t" - - "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "bnez %[width], 1b \n\t" - - : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]), - [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1]) - : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst), - [lmov]"f"(0xc), [rmov]"f"(0x18), - [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8), - [zero]"f"(0x0), [mask2]"f"(0xff000000), - [width]"r"(dst_width), [lmov1]"f"(0x10) - : "memory" - ); -} -// clang-format on - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_msa.cc b/thirdparty/libyuv/source/scale_msa.cc deleted file mode 100644 index 482a521..0000000 --- a/thirdparty/libyuv/source/scale_msa.cc +++ /dev/null @@ -1,949 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <assert.h> - -#include "libyuv/scale_row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "libyuv/macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define LOAD_INDEXED_DATA(srcp, indx0, out0) \ - { \ - out0[0] = srcp[indx0[0]]; \ - out0[1] = srcp[indx0[1]]; \ - out0[2] = srcp[indx0[2]]; \ - out0[3] = srcp[indx0[3]]; \ - } - -void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - v16u8 src0, src1, dst0; - (void)src_stride; - - for (x = 0; x < dst_width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); - dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - ST_UB(dst0, dst_argb); - src_argb += 32; - dst_argb += 16; - } -} - -void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - v16u8 src0, src1, vec0, vec1, dst0; - (void)src_stride; - - for (x = 0; x < dst_width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); - vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); - vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); - ST_UB(dst0, dst_argb); - src_argb += 32; - dst_argb += 16; - } -} - -void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - const uint8_t* s = src_argb; - const uint8_t* t = src_argb + src_stride; - v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; - v8u16 reg0, reg1, reg2, reg3; - v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; - - for (x = 0; x < dst_width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); - vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); - vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2); - vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); - reg0 = __msa_hadd_u_h(vec0, vec0); - reg1 = __msa_hadd_u_h(vec1, vec1); - reg2 = __msa_hadd_u_h(vec2, vec2); - reg3 = __msa_hadd_u_h(vec3, vec3); - reg0 += reg2; - reg1 += reg3; - reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); - reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - ST_UB(dst0, dst_argb); - s += 32; - t += 32; - dst_argb += 16; - } -} - -void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_argb, - int dst_width) { - int x; - int32_t stepx = src_stepx * 4; - int32_t data0, data1, data2, data3; - (void)src_stride; - - for (x = 0; x < dst_width; x += 4) { - data0 = LW(src_argb); - data1 = LW(src_argb + stepx); - data2 = LW(src_argb + stepx * 2); - data3 = LW(src_argb + stepx * 3); - SW(data0, dst_argb); - SW(data1, dst_argb + 4); - SW(data2, dst_argb + 8); - SW(data3, dst_argb + 12); - src_argb += stepx * 4; - dst_argb += 16; - } -} - -void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - int x; - const uint8_t* nxt_argb = src_argb + src_stride; - int32_t stepx = src_stepx * 4; - int64_t data0, data1, data2, data3; - v16u8 src0 = {0}, src1 = {0},
src2 = {0}, src3 = {0}; - v16u8 vec0, vec1, vec2, vec3; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 dst0; - - for (x = 0; x < dst_width; x += 4) { - data0 = LD(src_argb); - data1 = LD(src_argb + stepx); - data2 = LD(src_argb + stepx * 2); - data3 = LD(src_argb + stepx * 3); - src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); - src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); - src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); - src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); - data0 = LD(nxt_argb); - data1 = LD(nxt_argb + stepx); - data2 = LD(nxt_argb + stepx * 2); - data3 = LD(nxt_argb + stepx * 3); - src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); - src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); - src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); - src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); - vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - reg0 = __msa_hadd_u_h(vec0, vec0); - reg1 = __msa_hadd_u_h(vec1, vec1); - reg2 = __msa_hadd_u_h(vec2, vec2); - reg3 = __msa_hadd_u_h(vec3, vec3); - reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); - reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); - reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); - reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); - reg4 += reg6; - reg5 += reg7; - reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); - reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - ST_UB(dst0, dst_argb); - src_argb += stepx * 4; - nxt_argb += stepx * 4; - dst_argb += 16; - } -} - -void ScaleRowDown2_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - (void)src_stride; - - for (x = 0; x < dst_width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst, 16); - src_ptr += 64; - dst += 32; - } -} - -void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; - (void)src_stride; - - for (x = 0; x < dst_width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - dst0 = __msa_aver_u_b(vec1, vec0); - dst1 = __msa_aver_u_b(vec3, vec2); - ST_UB2(dst0, dst1, dst, 16); - src_ptr += 64; - dst += 32; - } -} - -void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; - v8u16 vec0, vec1, vec2, vec3; - - for (x = 0; x < dst_width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = 
(v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); - vec0 = __msa_hadd_u_h(src0, src0); - vec1 = __msa_hadd_u_h(src1, src1); - vec2 = __msa_hadd_u_h(src2, src2); - vec3 = __msa_hadd_u_h(src3, src3); - vec0 += __msa_hadd_u_h(src4, src4); - vec1 += __msa_hadd_u_h(src5, src5); - vec2 += __msa_hadd_u_h(src6, src6); - vec3 += __msa_hadd_u_h(src7, src7); - vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); - vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); - vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); - vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - ST_UB2(dst0, dst1, dst, 16); - s += 64; - t += 64; - dst += 32; - } -} - -void ScaleRowDown4_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - v16u8 src0, src1, src2, src3, vec0, vec1, dst0; - (void)src_stride; - - for (x = 0; x < dst_width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst); - src_ptr += 64; - dst += 16; - } -} - -void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - const uint8_t* s = src_ptr; - const uint8_t* t0 = s + src_stride; - const uint8_t* t1 = s + src_stride * 2; - const uint8_t* t2 = s + src_stride * 3; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; - v8u16 vec0, vec1, vec2, vec3; - v4u32 reg0, reg1, reg2, reg3; - - for (x = 0; x < dst_width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); - vec0 = __msa_hadd_u_h(src0, src0); - vec1 = __msa_hadd_u_h(src1, src1); - vec2 = __msa_hadd_u_h(src2, src2); - vec3 = __msa_hadd_u_h(src3, src3); - vec0 += __msa_hadd_u_h(src4, src4); - vec1 += __msa_hadd_u_h(src5, src5); - vec2 += __msa_hadd_u_h(src6, src6); - vec3 += __msa_hadd_u_h(src7, src7); - src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); - vec0 += __msa_hadd_u_h(src0, src0); - vec1 += __msa_hadd_u_h(src1, src1); - vec2 += __msa_hadd_u_h(src2, src2); - vec3 += __msa_hadd_u_h(src3, src3); - vec0 += __msa_hadd_u_h(src4, src4); - vec1 += __msa_hadd_u_h(src5, src5); - vec2 += __msa_hadd_u_h(src6, src6); - vec3 += __msa_hadd_u_h(src7, src7); - reg0 = __msa_hadd_u_w(vec0, vec0); - reg1 = __msa_hadd_u_w(vec1, vec1); - reg2 = __msa_hadd_u_w(vec2, vec2); - reg3 = __msa_hadd_u_w(vec3, vec3); - reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); 
- reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); - reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); - reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst); - s += 64; - t0 += 64; - t1 += 64; - t2 += 64; - dst += 16; - } -} - -void ScaleRowDown38_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x, width; - uint64_t dst0; - uint32_t dst1; - v16u8 src0, src1, vec0; - v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; - (void)src_stride; - - assert(dst_width % 3 == 0); - width = dst_width / 3; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); - dst0 = __msa_copy_u_d((v2i64)vec0, 0); - dst1 = __msa_copy_u_w((v4i32)vec0, 2); - SD(dst0, dst); - SW(dst1, dst + 8); - src_ptr += 32; - dst += 12; - } -} - -void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - int x, width; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - uint64_t dst0; - uint32_t dst1; - v16u8 src0, src1, src2, src3, out; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; - v8i16 zero = {0}; - v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; - v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; - v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); - v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); - - assert((dst_width % 3 == 0) && (dst_width > 0)); - width = dst_width / 3; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0); - vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); - vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); - vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - tmp0 = __msa_hadd_u_w(vec4, vec4); - tmp1 = __msa_hadd_u_w(vec5, vec5); - tmp2 = __msa_hadd_u_w(vec6, vec6); - tmp3 = __msa_hadd_u_w(vec7, vec7); - tmp4 = __msa_hadd_u_w(vec0, vec0); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - tmp0 = __msa_hadd_u_w(vec0, vec0); - tmp1 = __msa_hadd_u_w(vec1, vec1); - tmp0 *= const_0x2AAA; - tmp1 *= const_0x2AAA; - tmp4 *= const_0x4000; - tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); - tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); - tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); - out = (v16u8)__msa_vshf_b(dst_mask, 
(v16i8)vec1, (v16i8)vec0); - dst0 = __msa_copy_u_d((v2i64)out, 0); - dst1 = __msa_copy_u_w((v4i32)out, 2); - SD(dst0, dst_ptr); - SW(dst1, dst_ptr + 8); - s += 32; - t += 32; - dst_ptr += 12; - } -} - -void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - int x, width; - const uint8_t* s = src_ptr; - const uint8_t* t0 = s + src_stride; - const uint8_t* t1 = s + src_stride * 2; - uint64_t dst0; - uint32_t dst1; - v16u8 src0, src1, src2, src3, src4, src5, out; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; - v8u16 zero = {0}; - v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; - v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; - v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); - v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); - - assert((dst_width % 3 == 0) && (dst_width > 0)); - width = dst_width / 3; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); - src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); - vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); - vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); - vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); - vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); - vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); - vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0); - vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); - vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); - vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - tmp0 = __msa_hadd_u_w(vec4, vec4); - tmp1 = __msa_hadd_u_w(vec5, vec5); - tmp2 = __msa_hadd_u_w(vec6, vec6); - tmp3 = __msa_hadd_u_w(vec7, vec7); - tmp4 = __msa_hadd_u_w(vec0, vec0); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - tmp0 = __msa_hadd_u_w(vec0, vec0); - tmp1 = __msa_hadd_u_w(vec1, vec1); - tmp0 *= const_0x1C71; - tmp1 *= const_0x1C71; - tmp4 *= const_0x2AAA; - tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); - tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); - tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); - out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); - dst0 = __msa_copy_u_d((v2i64)out, 0); - dst1 = __msa_copy_u_w((v4i32)out, 2); - SD(dst0, dst_ptr); - SW(dst1, dst_ptr + 8); - s += 32; - t0 += 32; - t1 += 32; - dst_ptr += 12; - } -} - -void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - 
int x; - v16u8 src0; - v8u16 dst0, dst1; - v16i8 zero = {0}; - - assert(src_width > 0); - - for (x = 0; x < src_width; x += 16) { - src0 = LD_UB(src_ptr); - dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); - dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); - dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); - dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); - ST_UH2(dst0, dst1, dst_ptr, 8); - src_ptr += 16; - dst_ptr += 16; - } -} - -void ScaleFilterCols_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - v4i32 vec_x = __msa_fill_w(x); - v4i32 vec_dx = __msa_fill_w(dx); - v4i32 vec_const = {0, 1, 2, 3}; - v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8u16 reg0, reg1; - v16u8 dst0; - v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); - v4i32 const_0x40 = __msa_fill_w(0x40); - - vec0 = vec_dx * vec_const; - vec1 = vec_dx * 4; - vec_x += vec0; - - for (j = 0; j < dst_width - 1; j += 16) { - vec2 = vec_x >> 16; - vec6 = vec_x & const_0xFFFF; - vec_x += vec1; - vec3 = vec_x >> 16; - vec7 = vec_x & const_0xFFFF; - vec_x += vec1; - vec4 = vec_x >> 16; - vec8 = vec_x & const_0xFFFF; - vec_x += vec1; - vec5 = vec_x >> 16; - vec9 = vec_x & const_0xFFFF; - vec_x += vec1; - vec6 >>= 9; - vec7 >>= 9; - vec8 >>= 9; - vec9 >>= 9; - LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); - LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); - LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); - LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); - vec2 += 1; - vec3 += 1; - vec4 += 1; - vec5 += 1; - LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); - LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); - LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); - LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); - tmp4 -= tmp0; - tmp5 -= tmp1; - tmp6 -= tmp2; - tmp7 -= tmp3; - tmp4 *= vec6; - tmp5 *= vec7; - tmp6 *= vec8; - tmp7 *= vec9; - tmp4 += const_0x40; - tmp5 += const_0x40; - tmp6 += const_0x40; - tmp7 += const_0x40; - tmp4 >>= 7; - tmp5 >>= 7; - tmp6 >>= 7; - tmp7 >>= 7; - tmp0 += tmp4; - tmp1 += tmp5; - tmp2 += tmp6; - tmp3 += tmp7; - reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - __msa_st_b(dst0, dst_ptr, 0); - dst_ptr += 16; - } -} - -void ScaleARGBCols_MSA(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - v4i32 x_vec = __msa_fill_w(x); - v4i32 dx_vec = __msa_fill_w(dx); - v4i32 const_vec = {0, 1, 2, 3}; - v4i32 vec0, vec1, vec2; - v4i32 dst0; - - vec0 = dx_vec * const_vec; - vec1 = dx_vec * 4; - x_vec += vec0; - - for (j = 0; j < dst_width; j += 4) { - vec2 = x_vec >> 16; - x_vec += vec1; - LOAD_INDEXED_DATA(src, vec2, dst0); - __msa_st_w(dst0, dst, 0); - dst += 4; - } -} - -void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - int j; - v4u32 src0, src1, src2, src3; - v4u32 vec0, vec1, vec2, vec3; - v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 mult0, mult1, mult2, mult3; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 dst0, dst1; - v4u32 vec_x = (v4u32)__msa_fill_w(x); - v4u32 vec_dx = (v4u32)__msa_fill_w(dx); - v4u32 vec_const = {0, 1, 2, 3}; - v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); - - vec0 = vec_dx * vec_const; - vec1 = vec_dx * 4; - vec_x += vec0; - - for (j = 0; j < dst_width - 1; j += 8) { - vec2 
= vec_x >> 16; - reg0 = (v16u8)(vec_x >> 9); - vec_x += vec1; - vec3 = vec_x >> 16; - reg1 = (v16u8)(vec_x >> 9); - vec_x += vec1; - reg0 = reg0 & const_0x7f; - reg1 = reg1 & const_0x7f; - reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); - reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); - reg2 = reg0 ^ const_0x7f; - reg3 = reg1 ^ const_0x7f; - mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); - mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); - mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); - mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); - LOAD_INDEXED_DATA(src, vec2, src0); - LOAD_INDEXED_DATA(src, vec3, src1); - vec2 += 1; - vec3 += 1; - LOAD_INDEXED_DATA(src, vec2, src2); - LOAD_INDEXED_DATA(src, vec3, src3); - reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - tmp0 = __msa_dotp_u_h(reg4, mult0); - tmp1 = __msa_dotp_u_h(reg5, mult1); - tmp2 = __msa_dotp_u_h(reg6, mult2); - tmp3 = __msa_dotp_u_h(reg7, mult3); - tmp0 >>= 7; - tmp1 >>= 7; - tmp2 >>= 7; - tmp3 >>= 7; - dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - __msa_st_b(dst0, dst_argb, 0); - __msa_st_b(dst1, dst_argb, 16); - dst_argb += 32; - } -} - -void ScaleRowDown34_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - v16u8 src0, src1, src2, src3; - v16u8 vec0, vec1, vec2; - v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; - v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; - v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, - 21, 23, 24, 25, 27, 28, 29, 31}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < dst_width; x += 48) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); - vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); - __msa_st_b((v16i8)vec0, dst, 0); - __msa_st_b((v16i8)vec1, dst, 16); - __msa_st_b((v16i8)vec2, dst, 32); - src_ptr += 64; - dst += 48; - } -} - -void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 vec6, vec7, vec8, vec9, vec10, vec11; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5; - v8i16 reg6, reg7, reg8, reg9, reg10, reg11; - v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; - v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; - v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; - v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, - 16, 17, 17, 18, 18, 19, 20, 21}; - v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; - v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; - v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; - v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < dst_width; x += 48) { - src0 = 
(v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); - vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); - vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); - vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); - vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); - vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); - vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); - vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); - vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); - vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); - reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); - reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); - reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); - reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); - reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); - reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); - reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); - reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); - reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); - reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); - reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); - reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); - reg0 = __msa_srar_h(reg0, shft0); - reg1 = __msa_srar_h(reg1, shft1); - reg2 = __msa_srar_h(reg2, shft2); - reg3 = __msa_srar_h(reg3, shft0); - reg4 = __msa_srar_h(reg4, shft1); - reg5 = __msa_srar_h(reg5, shft2); - reg6 = __msa_srar_h(reg6, shft0); - reg7 = __msa_srar_h(reg7, shft1); - reg8 = __msa_srar_h(reg8, shft2); - reg9 = __msa_srar_h(reg9, shft0); - reg10 = __msa_srar_h(reg10, shft1); - reg11 = __msa_srar_h(reg11, shft2); - reg0 = reg0 * 3 + reg6; - reg1 = reg1 * 3 + reg7; - reg2 = reg2 * 3 + reg8; - reg3 = reg3 * 3 + reg9; - reg4 = reg4 * 3 + reg10; - reg5 = reg5 * 3 + reg11; - reg0 = __msa_srari_h(reg0, 2); - reg1 = __msa_srari_h(reg1, 2); - reg2 = __msa_srari_h(reg2, 2); - reg3 = __msa_srari_h(reg3, 2); - reg4 = __msa_srari_h(reg4, 2); - reg5 = __msa_srari_h(reg5, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); - dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - __msa_st_b((v16i8)dst0, d, 0); - __msa_st_b((v16i8)dst1, d, 16); - __msa_st_b((v16i8)dst2, d, 32); - s += 64; - t += 64; - d += 48; - } -} - -void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 vec6, vec7, vec8, vec9, vec10, vec11; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5; - v8i16 reg6, reg7, reg8, reg9, reg10, reg11; - v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; - v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; - v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; - v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, - 16, 17, 17, 18, 18, 19, 20, 21}; - 
v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; - v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; - v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; - v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < dst_width; x += 48) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); - vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); - vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); - vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); - vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); - vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); - vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); - vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); - vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); - vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); - reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); - reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); - reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); - reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); - reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); - reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); - reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); - reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); - reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); - reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); - reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); - reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); - reg0 = __msa_srar_h(reg0, shft0); - reg1 = __msa_srar_h(reg1, shft1); - reg2 = __msa_srar_h(reg2, shft2); - reg3 = __msa_srar_h(reg3, shft0); - reg4 = __msa_srar_h(reg4, shft1); - reg5 = __msa_srar_h(reg5, shft2); - reg6 = __msa_srar_h(reg6, shft0); - reg7 = __msa_srar_h(reg7, shft1); - reg8 = __msa_srar_h(reg8, shft2); - reg9 = __msa_srar_h(reg9, shft0); - reg10 = __msa_srar_h(reg10, shft1); - reg11 = __msa_srar_h(reg11, shft2); - reg0 += reg6; - reg1 += reg7; - reg2 += reg8; - reg3 += reg9; - reg4 += reg10; - reg5 += reg11; - reg0 = __msa_srari_h(reg0, 1); - reg1 = __msa_srari_h(reg1, 1); - reg2 = __msa_srari_h(reg2, 1); - reg3 = __msa_srari_h(reg3, 1); - reg4 = __msa_srari_h(reg4, 1); - reg5 = __msa_srari_h(reg5, 1); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); - dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - __msa_st_b((v16i8)dst0, d, 0); - __msa_st_b((v16i8)dst1, d, 16); - __msa_st_b((v16i8)dst2, d, 32); - s += 64; - t += 64; - d += 48; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/thirdparty/libyuv/source/scale_neon.cc b/thirdparty/libyuv/source/scale_neon.cc deleted file mode 100644 index 6a0d6e1..0000000 --- a/thirdparty/libyuv/source/scale_neon.cc +++ /dev/null @@ -1,1494 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -// NEON downscalers with interpolation. -// Provided by Fritz Koenig - -// Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - // load even pixels into q0, odd into q1 - "vld2.8 {q0, q1}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); -} - -// Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); -} - -// Read 32x2 average down and write 16x1. -void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %0 \n" - "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc - "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc - "subs %3, %3, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent + - // row1 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and - // pack - "vrshrn.u16 d1, q1, #2 \n" - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ScaleRowDown4_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1", "memory", "cc"); -} - -void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - const uint8_t* src_ptr2 = src_ptr + src_stride * 2; - const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load up 16x4 - "vld1.8 {q1}, [%3]! \n" - "vld1.8 {q2}, [%4]! \n" - "vld1.8 {q3}, [%5]! \n" - "subs %2, %2, #4 \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0 \n" - "vst1.32 {d0[0]}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_ptr1), // %3 - "+r"(src_ptr2), // %4 - "+r"(src_ptr3) // %5 - : - : "q0", "q1", "q2", "q3", "memory", "cc"); -} - -// Down scale from 4 to 3 pixels. Use the neon multilane read/write -// to load up the every 4th pixel into a 4 different registers. -// Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "d0", "d1", "d2", "d3", "memory", "cc"); -} - -void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" - - // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" - - // (3 * line_0 + line_1 + 2) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" - - "vst3.8 {d0, d1, d2}, [%1]! \n" - - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", - "cc"); -} - -void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" - // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" - - "vst3.8 {d0, d1, d2}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"); -} - -#define HAS_SCALEROWDOWN38_NEON -static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, - 22, 24, 27, 30, 0, 0, 0, 0}; -static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, - 18, 6, 14, 19, 0, 0, 0, 0}; -static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12}; -static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18}; - -// 32 -> 12 -void ScaleRowDown38_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "vld1.8 {q3}, [%3] \n" - "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - "vst1.8 {d4}, [%1]! \n" - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"); -} - -// 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - - asm volatile( - "vld1.16 {q13}, [%5] \n" - "vld1.8 {q14}, [%6] \n" - "vld1.8 {q15}, [%7] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.8 {d16, d17, d18, d19}, [%4]! \n" - "subs %2, %2, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. 
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q15 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride), // %3 - "+r"(src_ptr1) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", - "cc"); -} - -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, %2, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q13 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); -} - -void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 1; - asm volatile( - "vmov.u8 d30, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 01234567 - "vld1.8 {d5}, [%3]! \n" // 12345678 - - "vmovl.u8 q0, d4 \n" // 01234567 (16b) - "vmovl.u8 q1, d5 \n" // 12345678 (16b) - "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) - "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) - - "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.8 {d0, d1}, [%1]! \n" // store - "subs %2, %2, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 1; - const uint8_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "vmov.u16 q15, #3 \n" - "vmov.u8 d28, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 01234567 - "vld1.8 {d5}, [%5]! \n" // 12345678 - - "vmovl.u8 q0, d4 \n" // 01234567 (16b) - "vmovl.u8 q1, d5 \n" // 12345678 (16b) - "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) - "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) - - "vld1.8 {d8}, [%1]! \n" - "vld1.8 {d9}, [%6]! \n" - - "vmovl.u8 q2, d8 \n" - "vmovl.u8 q3, d9 \n" - "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) - "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) - - // e o - // q1 q0 - // q3 q2 - - "vmovq q4, q2 \n" - "vmovq q5, q3 \n" - "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) - "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) - "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) - "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) - - // e o - // q5 q4 - // q1 q0 - - "vrshrn.u16 d2, q1, #4 \n" // 2, even - "vrshrn.u16 d3, q0, #4 \n" // 2, odd - "vrshrn.u16 d0, q5, #4 \n" // 1, even - "vrshrn.u16 d1, q4, #4 \n" // 1, odd - - "vst2.8 {d0, d1}, [%2]! \n" // store - "vst2.8 {d2, d3}, [%3]! \n" // store - "subs %4, %4, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", - "q15" // Clobber List - ); -} - -void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile( - "vmov.u16 q15, #3 \n" - - "1: \n" - "vld1.16 {q1}, [%0]! \n" // 01234567 (16b) - "vld1.16 {q0}, [%3]! \n" // 12345678 (16b) - - "vmovq q2, q0 \n" - "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) - "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) - - "vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd) - "vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.16 {d0, d1, d2, d3}, [%1]! 
\n" // store - "subs %2, %2, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "vmov.u16 q15, #3 \n" - - "1: \n" - "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) - "vld1.16 {q1}, [%5]! \n" // 12345678 (16b) - - "vmovq q2, q0 \n" - "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) - "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) - - "vld1.16 {q2}, [%1]! \n" // 01234567 (16b) - "vld1.16 {q3}, [%6]! \n" // 12345678 (16b) - - "vmovq q4, q2 \n" - "vmla.u16 q2, q3, q15 \n" // 3*near+far (odd) - "vmla.u16 q3, q4, q15 \n" // 3*near+far (even) - - "vmovq q4, q2 \n" - "vmovq q5, q3 \n" - "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) - "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) - "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) - "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) - - "vrshr.u16 q2, q1, #4 \n" // 2, even - "vrshr.u16 q3, q0, #4 \n" // 2, odd - "vrshr.u16 q0, q5, #4 \n" // 1, even - "vrshr.u16 q1, q4, #4 \n" // 1, odd - - "vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store - "vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store - "subs %4, %4, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", - "q15" // Clobber List - ); -} - -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile( - "vmov.u16 d31, #3 \n" - - "1: \n" - "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) - "vld1.16 {q1}, [%3]! \n" // 12345678 (16b) - - "vmovl.u16 q2, d0 \n" // 0123 (32b) - "vmovl.u16 q3, d1 \n" // 4567 (32b) - "vmovl.u16 q4, d2 \n" // 1234 (32b) - "vmovl.u16 q5, d3 \n" // 5678 (32b) - - "vmlal.u16 q2, d2, d31 \n" - "vmlal.u16 q3, d3, d31 \n" - "vmlal.u16 q4, d0, d31 \n" - "vmlal.u16 q5, d1, d31 \n" - - "vrshrn.u32 d0, q4, #2 \n" - "vrshrn.u32 d1, q5, #2 \n" - "vrshrn.u32 d2, q2, #2 \n" - "vrshrn.u32 d3, q3, #2 \n" - - "vst2.16 {q0, q1}, [%1]! \n" // store - "subs %2, %2, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "vmov.u16 d31, #3 \n" - "vmov.u32 q14, #3 \n" - - "1: \n" - "vld1.16 {d0}, [%0]! \n" // 0123 (16b) - "vld1.16 {d1}, [%5]! \n" // 1234 (16b) - "vmovl.u16 q2, d0 \n" // 0123 (32b) - "vmovl.u16 q3, d1 \n" // 1234 (32b) - "vmlal.u16 q2, d1, d31 \n" - "vmlal.u16 q3, d0, d31 \n" - - "vld1.16 {d0}, [%1]! \n" // 0123 (16b) - "vld1.16 {d1}, [%6]! 
\n" // 1234 (16b) - "vmovl.u16 q4, d0 \n" // 0123 (32b) - "vmovl.u16 q5, d1 \n" // 1234 (32b) - "vmlal.u16 q4, d1, d31 \n" - "vmlal.u16 q5, d0, d31 \n" - - "vmovq q0, q4 \n" - "vmovq q1, q5 \n" - "vmla.u32 q4, q2, q14 \n" - "vmla.u32 q5, q3, q14 \n" - "vmla.u32 q2, q0, q14 \n" - "vmla.u32 q3, q1, q14 \n" - - "vrshrn.u32 d1, q4, #4 \n" - "vrshrn.u32 d0, q5, #4 \n" - "vrshrn.u32 d3, q2, #4 \n" - "vrshrn.u32 d2, q3, #4 \n" - - "vst2.16 {d0, d1}, [%2]! \n" // store - "vst2.16 {d2, d3}, [%3]! \n" // store - "subs %4, %4, #8 \n" // 4 sample -> 8 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", - "d31" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 2; - asm volatile( - "vmov.u8 d30, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) - "vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v) - - "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) - "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) - "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) - "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) - - "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.16 {d0, d1}, [%1]! \n" // store - "subs %2, %2, #8 \n" // 4 uv -> 8 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List - ); -} - -void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 2; - const uint8_t* src_temp1 = src_ptr1 + 2; - - asm volatile( - "vmov.u16 q15, #3 \n" - "vmov.u8 d28, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) - "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v) - - "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) - "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) - "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) - "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) - - "vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v) - "vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v) - - "vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b) - "vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b) - "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) - "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) - - // e o - // q1 q0 - // q3 q2 - - "vmovq q4, q2 \n" - "vmovq q5, q3 \n" - "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) - "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) - "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) - "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) - - // e o - // q5 q4 - // q1 q0 - - "vrshrn.u16 d2, q1, #4 \n" // 2, even - "vrshrn.u16 d3, q0, #4 \n" // 2, odd - "vrshrn.u16 d0, q5, #4 \n" // 1, even - "vrshrn.u16 d1, q4, #4 \n" // 1, odd - - "vst2.16 {d0, d1}, [%2]! \n" // store - "vst2.16 {d2, d3}, [%3]! 
\n" // store - "subs %4, %4, #8 \n" // 4 uv -> 8 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", - "q15" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 2; - asm volatile( - "vmov.u16 d30, #3 \n" - - "1: \n" - "vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16) - "vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16) - - "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) - "vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b) - "vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b) - "vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b) - "vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd) - "vmlal.u16 q3, d0, d30 \n" // 3*near+far (even) - "vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd) - "vmlal.u16 q5, d1, d30 \n" // 3*near+far (even) - - "vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even) - "vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.32 {d0, d1}, [%1]! \n" // store - "vst2.32 {d2, d3}, [%1]! \n" // store - "subs %2, %2, #8 \n" // 4 uv -> 8 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", - "d30" // Clobber List - ); -} - -void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 2; - const uint16_t* src_temp1 = src_ptr1 + 2; - - asm volatile( - "vmov.u16 d30, #3 \n" - "vmov.u32 q14, #3 \n" - - "1: \n" - "vld1.8 {d0}, [%0]! \n" // 0011 (1u1v) - "vld1.8 {d1}, [%5]! \n" // 1122 (1u1v) - "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) - "vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b) - "vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd) - "vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even) - - "vld1.8 {d0}, [%1]! \n" // 0011 (1u1v) - "vld1.8 {d1}, [%6]! \n" // 1122 (1u1v) - "vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b) - "vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b) - "vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd) - "vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even) - - "vmovq q0, q4 \n" - "vmovq q1, q5 \n" - "vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd) - "vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even) - "vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd) - "vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even) - - "vrshrn.u32 d1, q4, #4 \n" // 1, odd - "vrshrn.u32 d0, q5, #4 \n" // 1, even - "vrshrn.u32 d3, q2, #4 \n" // 2, odd - "vrshrn.u32 d2, q3, #4 \n" // 2, even - - "vst2.32 {d0, d1}, [%2]! \n" // store - "vst2.32 {d2, d3}, [%3]! \n" // store - "subs %4, %4, #4 \n" // 2 uv -> 4 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", - "d30" // Clobber List - ); -} - -// Add a row of bytes to a row of shorts. Used for box filter. -// Reads 16 bytes and accumulates to 16 shorts at a time. 
-void ScaleAddRow_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile( - "1: \n" - "vld1.16 {q1, q2}, [%1] \n" // load accumulator - "vld1.8 {q0}, [%0]! \n" // load 16 bytes - "vaddw.u8 q2, q2, d1 \n" // add - "vaddw.u8 q1, q1, d0 \n" - "vst1.16 {q1, q2}, [%1]! \n" // store accumulator - "subs %2, %2, #16 \n" // 16 processed per loop - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2" // Clobber List - ); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" - -// The NEON version mimics this formula (from row_common.cc): -// #define BLENDER(a, b, f) (uint8_t)((int)(a) + -// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) - -void ScaleFilterCols_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int* tmp = dx_offset; - const uint8_t* src_tmp = src_ptr; - asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q3, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" - // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q1, q1, q0 \n" - // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "vadd.s32 q2, q1, q3 \n" - "vshl.i32 q0, q3, #1 \n" // 8 * dx - "1: \n" - LOAD2_DATA8_LANE(0) - LOAD2_DATA8_LANE(1) - LOAD2_DATA8_LANE(2) - LOAD2_DATA8_LANE(3) - LOAD2_DATA8_LANE(4) - LOAD2_DATA8_LANE(5) - LOAD2_DATA8_LANE(6) - LOAD2_DATA8_LANE(7) - "vmov q10, q1 \n" - "vmov q11, q2 \n" - "vuzp.16 q10, q11 \n" - "vmovl.u8 q8, d6 \n" - "vmovl.u8 q9, d7 \n" - "vsubl.s16 q11, d18, d16 \n" - "vsubl.s16 q12, d19, d17 \n" - "vmovl.u16 q13, d20 \n" - "vmovl.u16 q10, d21 \n" - "vmul.s32 q11, q11, q13 \n" - "vmul.s32 q12, q12, q10 \n" - "vrshrn.s32 d18, q11, #16 \n" - "vrshrn.s32 d19, q12, #16 \n" - "vadd.s16 q8, q8, q9 \n" - "vmovn.s16 d6, q8 \n" - - "vst1.8 {d6}, [%0]! \n" // store pixels - "vadd.s32 q1, q1, q0 \n" - "vadd.s32 q2, q2, q0 \n" - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13" - ); -} - -#undef LOAD2_DATA8_LANE - -// 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - asm volatile( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #64 \n" - "beq 75f \n" - "cmp %4, #128 \n" - "beq 50f \n" - "cmp %4, #192 \n" - "beq 25f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" - - // Blend 25 / 75. - "25: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! 
\n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" - - // Blend 75 / 25. - "75: \n" - "vld1.8 {q1}, [%1]! \n" - "vld1.8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" - - "99: \n" - "vst1.8 {d1[7]}, [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 - : - : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"); -} - -void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vmov q2, q1 \n" // load next 8 ARGB - "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! -// 4a: 3e04 subs r6, #4 -// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! -// 50: ef64 21f4 vorr q9, q10, q10 -// 54: f942 038d vst2.32 {d16-d19}, [r2]! -// 58: d1f5 bne.n 46 - -void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vrhadd.u8 q1, q2, q3 \n" // rounding half add - "vst2.32 {q0, q1}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. 
-void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "mov r12, %3, lsl #2 \n" - "1: \n" - "vld1.32 {d0[0]}, [%0], r12 \n" - "vld1.32 {d0[1]}, [%0], r12 \n" - "vld1.32 {d1[0]}, [%0], r12 \n" - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0"); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - asm volatile( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" - "1: \n" - "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 - "vld1.8 {d1}, [%1], r12 \n" - "vld1.8 {d2}, [%0], r12 \n" - "vld1.8 {d3}, [%1], r12 \n" - "vld1.8 {d4}, [%0], r12 \n" - "vld1.8 {d5}, [%1], r12 \n" - "vld1.8 {d6}, [%0], r12 \n" - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3"); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD1_DATA32_LANE(dn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - "vld1.32 {" #dn "[" #n "]}, [%6] \n" - -void ScaleARGBCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - int tmp; - const uint8_t* src_tmp = src_argb; - asm volatile( - "1: \n" - // clang-format off - LOAD1_DATA32_LANE(d0, 0) - LOAD1_DATA32_LANE(d0, 1) - LOAD1_DATA32_LANE(d1, 0) - LOAD1_DATA32_LANE(d1, 1) - LOAD1_DATA32_LANE(d2, 0) - LOAD1_DATA32_LANE(d2, 1) - LOAD1_DATA32_LANE(d3, 0) - LOAD1_DATA32_LANE(d3, 1) - // clang-format on - "vst1.32 {q0, q1}, [%0]! 
\n" // store pixels - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "=&r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1"); -} - -#undef LOAD1_DATA32_LANE - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD2_DATA32_LANE(dn1, dn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" - -void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int* tmp = dx_offset; - const uint8_t* src_tmp = src_argb; - asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q9, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" - "vmov.i8 q3, #0x7f \n" // 0x7F - "vmov.i16 q15, #0x7f \n" // 0x7F - // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q8, q1, q0 \n" - "1: \n" - // d0, d1: a - // d2, d3: b - LOAD2_DATA32_LANE(d0, d2, 0) - LOAD2_DATA32_LANE(d0, d2, 1) - LOAD2_DATA32_LANE(d1, d3, 0) - LOAD2_DATA32_LANE(d1, d3, 1) - "vshrn.i32 d22, q8, #9 \n" - "vand.16 d22, d22, d30 \n" - "vdup.8 d24, d22[0] \n" - "vdup.8 d25, d22[2] \n" - "vdup.8 d26, d22[4] \n" - "vdup.8 d27, d22[6] \n" - "vext.8 d4, d24, d25, #4 \n" - "vext.8 d5, d26, d27, #4 \n" // f - "veor.8 q10, q2, q3 \n" // 0x7f ^ f - "vmull.u8 q11, d0, d20 \n" - "vmull.u8 q12, d1, d21 \n" - "vmull.u8 q13, d2, d4 \n" - "vmull.u8 q14, d3, d5 \n" - "vadd.i16 q11, q11, q13 \n" - "vadd.i16 q12, q12, q14 \n" - "vshrn.i16 d0, q11, #7 \n" - "vshrn.i16 d1, q12, #7 \n" - - "vst1.32 {d0, d1}, [%0]! \n" // store pixels - "vadd.s32 q8, q8, q9 \n" - "subs %2, %2, #4 \n" // 4 processed per loop - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -#undef LOAD2_DATA32_LANE - -void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels. - "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts. - "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV - "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV - "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes - "vrshrn.u16 d1, q1, #2 \n" - "vst2.8 {d0, d1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q8", "q9"); -} - -// Reads 4 pixels at a time. 
-void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, // pixel step - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src1_ptr = src_ptr + src_stepx * 2; - const uint8_t* src2_ptr = src_ptr + src_stepx * 4; - const uint8_t* src3_ptr = src_ptr + src_stepx * 6; - (void)src_stride; - asm volatile( - "1: \n" - "vld1.16 {d0[0]}, [%0], %6 \n" - "vld1.16 {d0[1]}, [%1], %6 \n" - "vld1.16 {d0[2]}, [%2], %6 \n" - "vld1.16 {d0[3]}, [%3], %6 \n" - "subs %5, %5, #4 \n" // 4 pixels per loop. - "vst1.8 {d0}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src1_ptr), // %1 - "+r"(src2_ptr), // %2 - "+r"(src3_ptr), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_width) // %5 - : "r"(src_stepx * 8) // %6 - : "memory", "cc", "d0"); -} - -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_neon64.cc b/thirdparty/libyuv/source/scale_neon64.cc deleted file mode 100644 index 8656fec..0000000 --- a/thirdparty/libyuv/source/scale_neon64.cc +++ /dev/null @@ -1,1634 +0,0 @@ -/* - * Copyright 2014 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" -#include "libyuv/scale.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon armv8 64 bit. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -// Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); -} - -// Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); -} - -// Read 32x2 average down and write 16x1. 
-void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uaddlp v1.8h, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent - "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "rshrn2 v0.16b, v1.8h, #2 \n" - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ScaleRowDown4_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #8 \n" // 8 processed per loop - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v2.8b}, [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc"); -} - -void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - const uint8_t* src_ptr2 = src_ptr + src_stride * 2; - const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 - "ld1 {v1.16b}, [%2], #16 \n" - "ld1 {v2.16b}, [%3], #16 \n" - "ld1 {v3.16b}, [%4], #16 \n" - "subs %w5, %w5, #4 \n" - "uaddlp v0.8h, v0.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uadalp v0.8h, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "uadalp v0.8h, v2.16b \n" - "prfm pldl1keep, [%3, 448] \n" - "uadalp v0.8h, v3.16b \n" - "prfm pldl1keep, [%4, 448] \n" - "addp v0.8h, v0.8h, v0.8h \n" - "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding - "st1 {v0.s}[0], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(src_ptr2), // %3 - "+r"(src_ptr3), // %4 - "+r"(dst_width) // %5 - : - : "v0", "v1", "v2", "v3", "memory", "cc"); -} - -// Down scale from 4 to 3 pixels. Use the neon multilane read/write -// to load up the every 4th pixel into a 4 different registers. -// Point samples 32 pixels to 24 pixels. 
-void ScaleRowDown34_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #24 \n" - "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc"); -} - -void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "ushll v16.8h, v4.8b, #0 \n" - "ushll v17.8h, v5.8b, #0 \n" - "ushll v18.8h, v6.8b, #0 \n" - "ushll v19.8h, v7.8b, #0 \n" - - // 3 * line_0 + line_1 - "umlal v16.8h, v0.8b, v20.8b \n" - "umlal v17.8h, v1.8b, v20.8b \n" - "umlal v18.8h, v2.8b, v20.8b \n" - "umlal v19.8h, v3.8b, v20.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // (3 * line_0 + line_1 + 2) >> 2 - "uqrshrn v0.8b, v16.8h, #2 \n" - "uqrshrn v1.8b, v17.8h, #2 \n" - "uqrshrn v2.8b, v18.8h, #2 \n" - "uqrshrn v3.8b, v19.8h, #2 \n" - "prfm pldl1keep, [%3, 448] \n" - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "ushll v16.8h, v1.8b, #0 \n" - "umlal v16.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v16.8h, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "ushll v16.8h, v2.8b, #0 \n" - "umlal v16.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v16.8h, #2 \n" - - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", - "v19", "v20", "memory", "cc"); -} - -void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" - // average src line 0 with src line 1 - "urhadd v0.8b, v0.8b, v4.8b \n" - "urhadd v1.8b, v1.8b, v5.8b \n" - "urhadd v2.8b, v2.8b, v6.8b \n" - "urhadd v3.8b, v3.8b, v7.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "ushll v4.8h, v1.8b, #0 \n" - "umlal v4.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v4.8h, #2 \n" - "prfm pldl1keep, [%3, 448] \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "ushll v4.8h, v2.8b, #0 \n" - "umlal v4.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v4.8h, #2 \n" - - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"); -} - -static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, - 22, 24, 27, 30, 0, 0, 0, 0}; -static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 
20, - 34, 6, 22, 35, 0, 0, 0, 0}; -static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12}; -static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18}; - -// 32 -> 12 -void ScaleRowDown38_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "ld1 {v3.16b}, [%3] \n" - "1: \n" - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v2.8b}, [%1], #8 \n" - "st1 {v2.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "v0", "v1", "v2", "v3", "memory", "cc"); -} - -// 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - ptrdiff_t tmp_src_stride = src_stride; - - asm volatile( - "ld1 {v29.8h}, [%5] \n" - "ld1 {v30.16b}, [%6] \n" - "ld1 {v31.8h}, [%7] \n" - "add %2, %2, %0 \n" - "1: \n" - - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %w4, %w4, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v20.8b, v0.8b, v1.8b \n" - "trn2 v21.8b, v0.8b, v1.8b \n" - "trn1 v22.8b, v4.8b, v5.8b \n" - "trn2 v23.8b, v4.8b, v5.8b \n" - "trn1 v24.8b, v16.8b, v17.8b \n" - "trn2 v25.8b, v16.8b, v17.8b \n" - - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - "trn1 v16.8b, v18.8b, v19.8b \n" - "trn2 v17.8b, v18.8b, v19.8b \n" - - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v20.4h, v20.8b \n" - "uaddlp v21.4h, v21.8b \n" - "uaddlp v22.4h, v22.8b \n" - "uaddlp v23.4h, v23.8b \n" - "uaddlp v24.4h, v24.8b \n" - "uaddlp v25.4h, v25.8b \n" - - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - "uaddlp v17.4h, v17.8b \n" - - // combine source lines - "add v20.4h, v20.4h, v22.4h \n" - "add v21.4h, v21.4h, v23.4h \n" - "add v20.4h, v20.4h, v24.4h \n" - "add v21.4h, v21.4h, v25.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - "add v2.4h, v2.4h, v17.4h \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "sqrdmulh v2.8h, v2.8h, v29.8h \n" - "xtn v2.8b, v2.8h \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. 
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "ushll v16.8h, v16.8b, #0 \n" - "uaddl v0.8h, v0.8b, v4.8b \n" - - // combine source lines - "add v0.8h, v0.8h, v16.8h \n" - - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // 0+1+2, 3+4+5 - "add v20.8h, v20.8h, v0.8h \n" - "add v21.8h, v21.8h, v4.8h \n" - "prfm pldl1keep, [%2, 448] \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v20.8h, v31.8h \n" - "sqrdmulh v1.8h, v21.8h, v31.8h \n" - "prfm pldl1keep, [%3, 448] \n" - - // Align for table lookup, vtbl requires registers to be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" - - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(src_ptr1), // %3 - "+r"(dst_width) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", - "memory", "cc"); -} - -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - // TODO(fbarchard): use src_stride directly for clang 3.5+. - ptrdiff_t tmp_src_stride = src_stride; - asm volatile( - "ld1 {v30.8h}, [%4] \n" - "ld1 {v31.16b}, [%5] \n" - "add %2, %2, %0 \n" - "1: \n" - - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %w3, %w3, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v16.8b, v0.8b, v1.8b \n" - "trn2 v17.8b, v0.8b, v1.8b \n" - "trn1 v18.8b, v4.8b, v5.8b \n" - "trn2 v19.8b, v4.8b, v5.8b \n" - - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v16.4h, v16.8b \n" - "uaddlp v17.4h, v17.8b \n" - "uaddlp v18.4h, v18.8b \n" - "uaddlp v19.4h, v19.8b \n" - - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - - // combine source lines - "add v16.4h, v16.4h, v18.4h \n" - "add v17.4h, v17.4h, v19.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "uqrshrn v2.8b, v2.8h, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. 
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - - // combine source lines - "uaddl v0.8h, v0.8b, v4.8b \n" - - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // 0+1+2, 3+4+5 - "add v16.8h, v16.8h, v0.8h \n" - "add v17.8h, v17.8h, v4.8h \n" - "prfm pldl1keep, [%2, 448] \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v16.8h, v30.8h \n" - "sqrdmulh v1.8h, v17.8h, v30.8h \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" - - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(dst_width) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", - "v19", "v30", "v31", "memory", "cc"); -} - -void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 1; - asm volatile( - "movi v31.8b, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" // 01234567 - "ldr d1, [%1], #8 \n" // 12345678 - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) - "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) - - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) - - "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store - "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 1; - const uint8_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "movi v31.8b, #3 \n" - "movi v30.8h, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" // 01234567 - "ldr d1, [%2], #8 \n" // 12345678 - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) - "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" - "ldr d1, [%3], #8 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - - "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b) - "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b) - "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) - "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) - - "mov v0.8h, v4.8h \n" - "mov v1.8h, v5.8h \n" - "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) - "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) - "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) - "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) - - "rshrn v2.8b, v2.8h, #4 \n" // 2, odd - "rshrn v1.8b, v3.8h, #4 \n" // 2, even - "rshrn v4.8b, v4.8h, #4 \n" // 1, odd - "rshrn 
v3.8b, v5.8h, #4 \n" // 1, even - - "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 1 - "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 2 - "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "mov v2.8h, v0.8h \n" - "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd) - "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even) - - "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd) - "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store - "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "mov v0.8h, v2.8h \n" - "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd) - "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even) - - "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b) - "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - - "mov v0.8h, v4.8h \n" - "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd) - "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even) - - "mov v0.8h, v4.8h \n" - "mov v1.8h, v5.8h \n" - "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd) - "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even) - "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd) - "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even) - - "urshr v2.8h, v2.8h, #4 \n" // 2, odd - "urshr v1.8h, v3.8h, #4 \n" // 2, even - "urshr v4.8h, v4.8h, #4 \n" // 1, odd - "urshr v3.8h, v5.8h, #4 \n" // 1, even - - "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1 - "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2 - - "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v31" // Clobber List - ); -} - -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) - "ushll2 
v3.4s, v0.8h, #0 \n" // 4567 (32b) - "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b) - "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b) - - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) - "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd) - "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) - "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even) - - "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far - "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) - "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far - "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd) - - "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store - "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "movi v31.4h, #3 \n" - "movi v30.4s, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" // 0123 (16b) - "ldr d1, [%2], #8 \n" // 1234 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) - "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b) - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) - "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" // 0123 (16b) - "ldr d1, [%3], #8 \n" // 1234 (16b) - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b) - "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b) - "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) - "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) - - "mov v0.16b, v4.16b \n" - "mov v1.16b, v5.16b \n" - "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) - "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) - "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) - "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) - - "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far - "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far - "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far - "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far - - "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1 - "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2 - - "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 2; - asm volatile( - "movi v31.8b, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" // 00112233 (1u1v) - "ldr d1, [%1], #8 \n" // 11223344 (1u1v) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b) - "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b) - - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) - - "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.4h, v2.4h}, [%2], #16 \n" // 
store - "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List - ); -} - -void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 2; - const uint8_t* src_temp1 = src_ptr1 + 2; - - asm volatile( - "movi v31.8b, #3 \n" - "movi v30.8h, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" - "ldr d1, [%2], #8 \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" - "ushll v3.8h, v1.8b, #0 \n" - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" - "ldr d1, [%3], #8 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - - "ushll v4.8h, v0.8b, #0 \n" - "ushll v5.8h, v1.8b, #0 \n" - "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) - "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) - - "mov v0.8h, v4.8h \n" - "mov v1.8h, v5.8h \n" - "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) - "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) - "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) - "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) - - "rshrn v2.8b, v2.8h, #4 \n" // 2, odd - "rshrn v1.8b, v3.8h, #4 \n" // 2, even - "rshrn v4.8b, v4.8h, #4 \n" // 1, odd - "rshrn v3.8b, v5.8h, #4 \n" // 1, even - - "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2 - "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1 - "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 2; - asm volatile( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) - "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) - "ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b) - "ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b) - - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd) - "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even) - "umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd) - "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even) - - "rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even) - "rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.2s, v2.2s}, [%2], #16 \n" // store - "st2 {v3.2s, v4.2s}, [%2], #16 \n" // store - "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v31" // Clobber List - ); -} - -void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const 
uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 2; - const uint16_t* src_temp1 = src_ptr1 + 2; - - asm volatile( - "movi v31.4h, #3 \n" - "movi v30.4s, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" - "ldr d1, [%2], #8 \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) - "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) - "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" - "ldr d1, [%3], #8 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) - "ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) - "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) - "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) - - "mov v0.16b, v4.16b \n" - "mov v1.16b, v5.16b \n" - "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) - "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) - "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) - "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) - - "rshrn v1.4h, v2.4s, #4 \n" // 2, odd - "rshrn v0.4h, v3.4s, #4 \n" // 2, even - "rshrn v3.4h, v4.4s, #4 \n" // 1, odd - "rshrn v2.4h, v5.4s, #4 \n" // 1, even - - "st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2 - "st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1 - "subs %w6, %w6, #4 \n" // 2 uv -> 4 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -// Add a row of bytes to a row of shorts. Used for box filter. -// Reads 16 bytes and accumulates to 16 shorts at a time. 
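The "9 3 3 1" comments in the 2x bilinear up-samplers above come from applying the (3 * near + far) / 4 filter in both directions, so one output sample is weighted 9/16, 3/16, 3/16, 1/16 of its 2x2 source neighbourhood. A scalar sketch of that weighting (illustrative helper only, not the NEON code):

#include <stdint.h>

// Sketch: one bilinear 2x-upsample output from its 2x2 source neighbourhood.
static inline uint8_t Bilinear9331Sketch(uint8_t nearest, uint8_t far_h,
                                         uint8_t far_v, uint8_t far_d) {
  // (3*near + far)/4 horizontally then vertically expands to 9/3/3/1;
  // +8 rounds before the final >>4, matching the rounding shifts above.
  return (uint8_t)((9 * nearest + 3 * far_h + 3 * far_v + far_d + 8) >> 4);
}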
-void ScaleAddRow_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile( - "1: \n" - "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator - "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes - "uaddw2 v2.8h, v2.8h, v0.16b \n" // add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uaddw v1.8h, v1.8h, v0.8b \n" - "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator - "subs %w2, %w2, #16 \n" // 16 processed per loop - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2" // Clobber List - ); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - "ld2 {v4.b, v5.b}[" #n "], [%6] \n" - -// The NEON version mimics this formula (from row_common.cc): -// #define BLENDER(a, b, f) (uint8_t)((int)(a) + -// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) - -void ScaleFilterCols_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int* tmp = dx_offset; - const uint8_t* src_tmp = src_ptr; - int64_t x64 = (int64_t)x; // NOLINT - int64_t dx64 = (int64_t)dx; // NOLINT - asm volatile ( - "dup v0.4s, %w3 \n" // x - "dup v1.4s, %w4 \n" // dx - "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 - "shl v3.4s, v1.4s, #2 \n" // 4 * dx - "mul v1.4s, v1.4s, v2.4s \n" - // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "add v1.4s, v1.4s, v0.4s \n" - // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "add v2.4s, v1.4s, v3.4s \n" - "shl v0.4s, v3.4s, #1 \n" // 8 * dx - "1: \n" - LOAD2_DATA8_LANE(0) - LOAD2_DATA8_LANE(1) - LOAD2_DATA8_LANE(2) - LOAD2_DATA8_LANE(3) - LOAD2_DATA8_LANE(4) - LOAD2_DATA8_LANE(5) - LOAD2_DATA8_LANE(6) - LOAD2_DATA8_LANE(7) - "mov v6.16b, v1.16b \n" - "mov v7.16b, v2.16b \n" - "uzp1 v6.8h, v6.8h, v7.8h \n" - "ushll v4.8h, v4.8b, #0 \n" - "ushll v5.8h, v5.8b, #0 \n" - "ssubl v16.4s, v5.4h, v4.4h \n" - "ssubl2 v17.4s, v5.8h, v4.8h \n" - "ushll v7.4s, v6.4h, #0 \n" - "ushll2 v6.4s, v6.8h, #0 \n" - "mul v16.4s, v16.4s, v7.4s \n" - "mul v17.4s, v17.4s, v6.4s \n" - "rshrn v6.4h, v16.4s, #16 \n" - "rshrn2 v6.8h, v17.4s, #16 \n" - "add v4.8h, v4.8h, v6.8h \n" - "xtn v4.8b, v4.8h \n" - - "st1 {v4.8b}, [%0], #8 \n" // store pixels - "add v1.4s, v1.4s, v0.4s \n" - "add v2.4s, v2.4s, v0.4s \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", - "v4", "v5", "v6", "v7", "v16", "v17" - ); -} - -#undef LOAD2_DATA8_LANE - -// 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y_fraction = 256 - source_y_fraction; - asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" - - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" - // General purpose row blend. 
- "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 25 / 75. - "25: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 75 / 25. - "75: \n" - "ld1 {v1.16b}, [%1], #16 \n" - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - "st1 {v0.b}[15], [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction), // %4 - "+r"(y_fraction) // %5 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); -} - -void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 - "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "mov v2.16b, v3.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 - "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - - "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "urhadd v1.16b, v2.16b, v3.16b \n" - "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - 
"subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 - "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. - "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "rshrn v1.8b, v1.8h, #2 \n" - "rshrn v2.8b, v2.8h, #2 \n" - "rshrn v3.8b, v3.8h, #2 \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "ld1 {v0.s}[0], [%0], %3 \n" - "ld1 {v0.s}[1], [%0], %3 \n" - "ld1 {v0.s}[2], [%0], %3 \n" - "ld1 {v0.s}[3], [%0], %3 \n" - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((int64_t)(src_stepx * 4)) // %3 - : "memory", "cc", "v0"); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -// TODO(Yang Zhang): Might be worth another optimization pass in future. -// It could be upgraded to 8 pixels at a time to start with. -void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - asm volatile( - "add %1, %1, %0 \n" - "1: \n" - "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 - "ld1 {v1.8b}, [%1], %4 \n" - "ld1 {v2.8b}, [%0], %4 \n" - "ld1 {v3.8b}, [%1], %4 \n" - "ld1 {v4.8b}, [%0], %4 \n" - "ld1 {v5.8b}, [%1], %4 \n" - "ld1 {v6.8b}, [%0], %4 \n" - "ld1 {v7.8b}, [%1], %4 \n" - "uaddl v0.8h, v0.8b, v1.8b \n" - "uaddl v2.8h, v2.8b, v3.8b \n" - "uaddl v4.8h, v4.8b, v5.8b \n" - "uaddl v6.8h, v6.8b, v7.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd - "mov v0.d[1], v2.d[0] \n" - "mov v2.d[0], v16.d[1] \n" - "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh - "mov v4.d[1], v6.d[0] \n" - "mov v6.d[0], v16.d[1] \n" - "prfm pldl1keep, [%1, 448] \n" - "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) - "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) - "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. - "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. - "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
- "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"((int64_t)(src_stepx * 4)) // %4 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD1_DATA32_LANE(vn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - "ld1 {" #vn ".s}[" #n "], [%6] \n" - -void ScaleARGBCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint8_t* src_tmp = src_argb; - int64_t x64 = (int64_t)x; // NOLINT - int64_t dx64 = (int64_t)dx; // NOLINT - int64_t tmp64; - asm volatile( - "1: \n" - // clang-format off - LOAD1_DATA32_LANE(v0, 0) - LOAD1_DATA32_LANE(v0, 1) - LOAD1_DATA32_LANE(v0, 2) - LOAD1_DATA32_LANE(v0, 3) - LOAD1_DATA32_LANE(v1, 0) - LOAD1_DATA32_LANE(v1, 1) - LOAD1_DATA32_LANE(v1, 2) - LOAD1_DATA32_LANE(v1, 3) - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - // clang-format on - "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "=&r"(tmp64), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1"); -} - -#undef LOAD1_DATA32_LANE - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD2_DATA32_LANE(vn1, vn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" - -void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int* tmp = dx_offset; - const uint8_t* src_tmp = src_argb; - int64_t x64 = (int64_t)x; // NOLINT - int64_t dx64 = (int64_t)dx; // NOLINT - asm volatile ( - "dup v0.4s, %w3 \n" // x - "dup v1.4s, %w4 \n" // dx - "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 - "shl v6.4s, v1.4s, #2 \n" // 4 * dx - "mul v1.4s, v1.4s, v2.4s \n" - "movi v3.16b, #0x7f \n" // 0x7F - "movi v4.8h, #0x7f \n" // 0x7F - // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "add v5.4s, v1.4s, v0.4s \n" - "1: \n" - // d0, d1: a - // d2, d3: b - LOAD2_DATA32_LANE(v0, v1, 0) - LOAD2_DATA32_LANE(v0, v1, 1) - LOAD2_DATA32_LANE(v0, v1, 2) - LOAD2_DATA32_LANE(v0, v1, 3) - "shrn v2.4h, v5.4s, #9 \n" - "and v2.8b, v2.8b, v4.8b \n" - "dup v16.8b, v2.b[0] \n" - "dup v17.8b, v2.b[2] \n" - "dup v18.8b, v2.b[4] \n" - "dup v19.8b, v2.b[6] \n" - "ext v2.8b, v16.8b, v17.8b, #4 \n" - "ext v17.8b, v18.8b, v19.8b, #4 \n" - "ins v2.d[1], v17.d[0] \n" // f - "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f - "umull v16.8h, v0.8b, v7.8b \n" - "umull2 v17.8h, v0.16b, v7.16b \n" - "umull v18.8h, v1.8b, v2.8b \n" - "umull2 v19.8h, v1.16b, v2.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "add v16.8h, v16.8h, v18.8h \n" - "add v17.8h, v17.8h, v19.8h \n" - "shrn v0.8b, v16.8h, #7 \n" - "shrn2 v0.16b, v17.8h, #7 \n" - "st1 {v0.4s}, [%0], #16 \n" // store pixels - "add v5.4s, v5.4s, v6.4s \n" - "subs %w2, %w2, #4 \n" // 4 processed per loop - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v16", "v17", "v18", "v19" - ); -} - -#undef LOAD2_DATA32_LANE - 
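The filtered column scalers above all step a 16.16 fixed-point x by dx per output pixel and blend the two neighbouring source pixels with the BLENDER formula quoted before ScaleFilterCols_NEON; a minimal scalar sketch of that loop (one byte channel, illustrative name, not the vectorized lane loads):

#include <stdint.h>

// Sketch: 16.16 fixed-point horizontal filtering for a single byte channel.
static void FilterColsSketch(uint8_t* dst, const uint8_t* src,
                             int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;     // integer source position
    int xf = x & 0xffff;  // 16-bit fraction
    int a = src[xi];
    int b = src[xi + 1];
    // BLENDER from row_common.cc: a + ((f * (b - a) + 0x8000) >> 16)
    dst[j] = (uint8_t)(a + ((xf * (b - a) + 0x8000) >> 16));
    x += dx;
  }
}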
-// Read 16x2 average down and write 8x1. -void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 - "1: \n" - "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #8 \n" // 8 processed per loop - "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent - "uaddlp v1.4s, v1.8h \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent - "uadalp v1.4s, v3.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "rshrn v0.4h, v0.4s, #2 \n" // round and pack - "rshrn2 v0.8h, v1.4s, #2 \n" - "st1 {v0.8h}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Read 8x2 upsample with filtering and write 16x1. -// Actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - asm volatile( - "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 - "movi v0.8h, #9 \n" // constants - "movi v1.4s, #3 \n" - - "1: \n" - "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 - "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 - "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row - "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 - "subs %w3, %w3, #16 \n" // 16 dst pixels per loop - "umull v16.4s, v3.4h, v0.4h \n" - "umull2 v7.4s, v3.8h, v0.8h \n" - "umull v18.4s, v4.4h, v0.4h \n" - "umull2 v17.4s, v4.8h, v0.8h \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uaddw v16.4s, v16.4s, v6.4h \n" - "uaddl2 v19.4s, v6.8h, v3.8h \n" - "uaddl v3.4s, v6.4h, v3.4h \n" - "uaddw2 v6.4s, v7.4s, v6.8h \n" - "uaddl2 v7.4s, v5.8h, v4.8h \n" - "uaddl v4.4s, v5.4h, v4.4h \n" - "uaddw v18.4s, v18.4s, v5.4h \n" - "prfm pldl1keep, [%1, 448] \n" - "mla v16.4s, v4.4s, v1.4s \n" - "mla v18.4s, v3.4s, v1.4s \n" - "mla v6.4s, v7.4s, v1.4s \n" - "uaddw2 v4.4s, v17.4s, v5.8h \n" - "uqrshrn v16.4h, v16.4s, #4 \n" - "mla v4.4s, v19.4s, v1.4s \n" - "uqrshrn2 v16.8h, v6.4s, #4 \n" - "uqrshrn v17.4h, v18.4s, #4 \n" - "uqrshrn2 v17.8h, v4.4s, #4 \n" - "st2 {v16.8h-v17.8h}, [%2], #32 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : "r"(2LL), // %4 - "r"(14LL) // %5 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", - "v19" // Clobber List - ); -} - -void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts. - "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 - "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts. 
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "prfm pldl1keep, [%1, 448] \n" - "rshrn v1.8b, v1.8h, #2 \n" - "st2 {v0.8b,v1.8b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v16", "v17"); -} - -// Reads 4 pixels at a time. -void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, // pixel step - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src1_ptr = src_ptr + src_stepx * 2; - const uint8_t* src2_ptr = src_ptr + src_stepx * 4; - const uint8_t* src3_ptr = src_ptr + src_stepx * 6; - (void)src_stride; - asm volatile( - "1: \n" - "ld1 {v0.h}[0], [%0], %6 \n" - "ld1 {v1.h}[0], [%1], %6 \n" - "ld1 {v2.h}[0], [%2], %6 \n" - "ld1 {v3.h}[0], [%3], %6 \n" - "subs %w5, %w5, #4 \n" // 4 pixels per loop. - "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src1_ptr), // %1 - "+r"(src2_ptr), // %2 - "+r"(src3_ptr), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_width) // %5 - : "r"((int64_t)(src_stepx * 8)) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3"); -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_uv.cc b/thirdparty/libyuv/source/scale_uv.cc deleted file mode 100644 index d9a3144..0000000 --- a/thirdparty/libyuv/source/scale_uv.cc +++ /dev/null @@ -1,1197 +0,0 @@ -/* - * Copyright 2020 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyUV -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Macros to enable specialized scalers - -#ifndef HAS_SCALEUVDOWN2 -#define HAS_SCALEUVDOWN2 1 -#endif -#ifndef HAS_SCALEUVDOWN4BOX -#define HAS_SCALEUVDOWN4BOX 1 -#endif -#ifndef HAS_SCALEUVDOWNEVEN -#define HAS_SCALEUVDOWNEVEN 1 -#endif -#ifndef HAS_SCALEUVBILINEARDOWN -#define HAS_SCALEUVBILINEARDOWN 1 -#endif -#ifndef HAS_SCALEUVBILINEARUP -#define HAS_SCALEUVBILINEARUP 1 -#endif -#ifndef HAS_UVCOPY -#define HAS_UVCOPY 1 -#endif -#ifndef HAS_SCALEPLANEVERTICAL -#define HAS_SCALEPLANEVERTICAL 1 -#endif - -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -// ScaleUV, 1/2 -// This is an optimized version for scaling down a UV to 1/2 of -// its original size. -#if HAS_SCALEUVDOWN2 -static void ScaleUVDown2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - int row_stride = src_stride * (dy >> 16); - void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, - uint8_t* dst_uv, int dst_width) = - filtering == kFilterNone - ? ScaleUVRowDown2_C - : (filtering == kFilterLinear ? 
ScaleUVRowDown2Linear_C - : ScaleUVRowDown2Box_C); - (void)src_width; - (void)src_height; - (void)dx; - assert(dx == 65536 * 2); // Test scale factor of 2. - assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. - // Advance to odd row, even column. - if (filtering == kFilterBilinear) { - src_uv += (y >> 16) * src_stride + (x >> 16) * 2; - } else { - src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2; - } - -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && filtering) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && filtering) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2BOX_NEON) - if (TestCpuFlag(kCpuHasNEON) && filtering) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; - } - } -#endif - -// This code is not enabled. Only box filter is available at this time. -#if defined(HAS_SCALEUVROWDOWN2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_SSSE3 - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3 - : ScaleUVRowDown2Box_Any_SSSE3); - if (IS_ALIGNED(dst_width, 2)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_SSSE3 - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3 - : ScaleUVRowDown2Box_SSSE3); - } - } -#endif -// This code is not enabled. Only box filter is available at this time. -#if defined(HAS_SCALEUVROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_NEON - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON - : ScaleUVRowDown2Box_Any_NEON); - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_NEON - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON - : ScaleUVRowDown2Box_NEON); - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_MMI - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MMI - : ScaleUVRowDown2Box_Any_MMI); - if (IS_ALIGNED(dst_width, 2)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_MMI - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MMI - : ScaleUVRowDown2Box_MMI); - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_MSA - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA - : ScaleUVRowDown2Box_Any_MSA); - if (IS_ALIGNED(dst_width, 2)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_MSA - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA - : ScaleUVRowDown2Box_MSA); - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (j = 0; j < dst_height; ++j) { - ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width); - src_uv += row_stride; - dst_uv += dst_stride; - } -} -#endif // HAS_SCALEUVDOWN2 - -// ScaleUV, 1/4 -// This is an optimized version for scaling down a UV to 1/4 of -// its original size. 
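As the implementation below shows, the 1/4 box path is built by running the 2x2 box row routine twice through a temporary row buffer. A scalar sketch of the underlying 2x2 UV box average it relies on (assumed to mirror the C fallback's rounded average; names are illustrative):

#include <stddef.h>
#include <stdint.h>

// Sketch: average each 2x2 block of interleaved UV pixels into one UV pair.
static void UVRowDown2BoxSketch(const uint8_t* src_uv, ptrdiff_t src_stride,
                                uint8_t* dst_uv, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    const uint8_t* s = src_uv + x * 4;  // two UV pairs from the top row
    const uint8_t* t = s + src_stride;  // the two pairs below them
    dst_uv[x * 2 + 0] = (uint8_t)((s[0] + s[2] + t[0] + t[2] + 2) >> 2);  // U
    dst_uv[x * 2 + 1] = (uint8_t)((s[1] + s[3] + t[1] + t[3] + 2) >> 2);  // V
  }
}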
-#if HAS_SCALEUVDOWN4BOX -static void ScaleUVDown4Box(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy) { - int j; - // Allocate 2 rows of UV. - const int kRowSize = (dst_width * 2 * 2 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); - int row_stride = src_stride * (dy >> 16); - void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, - uint8_t* dst_uv, int dst_width) = - ScaleUVRowDown2Box_C; - // Advance to odd row, even column. - src_uv += (y >> 16) * src_stride + (x >> 16) * 2; - (void)src_width; - (void)src_height; - (void)dx; - assert(dx == 65536 * 4); // Test scale factor of 4. - assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. - -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2BOX_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; - } - } -#endif - - for (j = 0; j < dst_height; ++j) { - ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2); - ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize, - dst_width * 2); - ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width); - src_uv += row_stride; - dst_uv += dst_stride; - } - free_aligned_buffer_64(row); -} -#endif // HAS_SCALEUVDOWN4BOX - -// ScaleUV Even -// This is an optimized version for scaling down a UV to even -// multiple of its original size. -#if HAS_SCALEUVDOWNEVEN -static void ScaleUVDownEven(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - int col_step = dx >> 16; - int row_stride = (dy >> 16) * src_stride; - void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride, - int src_step, uint8_t* dst_uv, int dst_width) = - filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C; - (void)src_width; - (void)src_height; - assert(IS_ALIGNED(src_width, 2)); - assert(IS_ALIGNED(src_height, 2)); - src_uv += (y >> 16) * src_stride + (x >> 16) * 2; -#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3 - : ScaleUVRowDownEven_Any_SSSE3; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDownEven = - filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && !filtering) { - ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDownEven = ScaleUVRowDownEven_NEON; - } - } -#endif // TODO(fbarchard): Enable Box filter -#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON - : ScaleUVRowDownEven_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDownEven = - filtering ? 
ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWNEVEN_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleUVRowDownEven = - filtering ? ScaleUVRowDownEvenBox_Any_MMI : ScaleUVRowDownEven_Any_MMI; - if (IS_ALIGNED(dst_width, 2)) { - ScaleUVRowDownEven = - filtering ? ScaleUVRowDownEvenBox_MMI : ScaleUVRowDownEven_MMI; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWNEVEN_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleUVRowDownEven = - filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDownEven = - filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA; - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (j = 0; j < dst_height; ++j) { - ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width); - src_uv += row_stride; - dst_uv += dst_stride; - } -} -#endif - -// Scale UV down with bilinear interpolation. -#if HAS_SCALEUVBILINEARDOWN -static void ScaleUVBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, - int dst_width, int x, int dx) = - (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C; - int64_t xlast = x + (int64_t)(dst_width - 1) * dx; - int64_t xl = (dx >= 0) ? x : xlast; - int64_t xr = (dx >= 0) ? xlast : x; - int clip_src_width; - xl = (xl >> 16) & ~3; // Left edge aligned. - xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. - xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. - if (xr > src_width) { - xr = src_width; - } - clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2. 
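A quick worked example of the clip computation above, with hypothetical values: for dx > 0, x = 10.5 and xlast = 22.5 (both in 16.16), xl = 10 & ~3 = 8 and xr = 22 + 1 = 23, which (23 + 1 + 3) & ~3 rounds up to 24 (then clamps to src_width if needed), giving clip_src_width = (24 - 8) * 2 = 32.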
- src_uv += xl * 2; - x -= (int)(xl << 16); -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(clip_src_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(clip_src_width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif -#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEUVFILTERCOLS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVFilterCols = ScaleUVFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEUVFILTERCOLS_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVFilterCols = ScaleUVFilterCols_MSA; - } - } -#endif - // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. - // Allocate a row of UV. - { - align_buffer_64(row, clip_src_width * 2); - - const int max_y = (src_height - 1) << 16; - if (y > max_y) { - y = max_y; - } - for (j = 0; j < dst_height; ++j) { - int yi = y >> 16; - const uint8_t* src = src_uv + yi * src_stride; - if (filtering == kFilterLinear) { - ScaleUVFilterCols(dst_uv, src, dst_width, x, dx); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(row, src, src_stride, clip_src_width, yf); - ScaleUVFilterCols(dst_uv, row, dst_width, x, dx); - } - dst_uv += dst_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - } - free_aligned_buffer_64(row); - } -} -#endif - -// Scale UV up with bilinear interpolation. -#if HAS_SCALEUVBILINEARUP -static void ScaleUVBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, - int dst_width, int x, int dx) = - filtering ? 
ScaleUVFilterCols_C : ScaleUVCols_C; - const int max_y = (src_height - 1) << 16; -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - if (src_width >= 32768) { - ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C; - } -#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEUVFILTERCOLS_NEON) - if (filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVFilterCols = ScaleUVFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEUVFILTERCOLS_MSA) - if (filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 16)) { - ScaleUVFilterCols = ScaleUVFilterCols_MSA; - } - } -#endif -#if defined(HAS_SCALEUVCOLS_SSSE3) - if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVCols_SSSE3; - } -#endif -#if defined(HAS_SCALEUVCOLS_NEON) - if (!filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleUVFilterCols = ScaleUVCols_Any_NEON; - if (IS_ALIGNED(dst_width, 16)) { - ScaleUVFilterCols = ScaleUVCols_NEON; - } - } -#endif -#if defined(HAS_SCALEUVCOLS_MMI) - if (!filtering && TestCpuFlag(kCpuHasMMI)) { - ScaleUVFilterCols = ScaleUVCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleUVFilterCols = ScaleUVCols_MMI; - } - } -#endif -#if defined(HAS_SCALEUVCOLS_MSA) - if (!filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleUVFilterCols = ScaleUVCols_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVFilterCols = ScaleUVCols_MSA; - } - } -#endif - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleUVFilterCols = ScaleUVColsUp2_C; -#if defined(HAS_SCALEUVCOLSUP2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { - ScaleUVFilterCols = ScaleUVColsUp2_SSSE3; - } -#endif -#if defined(HAS_SCALEUVCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleUVFilterCols = ScaleUVColsUp2_MMI; - } -#endif - } - - if (y > max_y) { - y = max_y; - } - - { - int yi = y >> 16; - const uint8_t* src = src_uv + yi * src_stride; - - // Allocate 2 rows of UV. 
- const int kRowSize = (dst_width * 2 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); - - uint8_t* rowptr = row; - int rowstride = kRowSize; - int lasty = yi; - - ScaleUVFilterCols(rowptr, src, dst_width, x, dx); - if (src_height > 1) { - src += src_stride; - } - ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - src = src_uv + yi * src_stride; - } - if (yi != lasty) { - ScaleUVFilterCols(rowptr, src, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - src += src_stride; - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf); - } - dst_uv += dst_stride; - y += dy; - } - free_aligned_buffer_64(row); - } -} -#endif // HAS_SCALEUVBILINEARUP - -// Scale UV, horizontally up by 2 times. -// Uses linear filter horizontally, nearest vertically. -// This is an optimized version for scaling up a plane to 2 times of -// its original width, using linear interpolation. -// This is used to scale U and V planes of NV16 to NV24. -void ScaleUVLinearUp2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv) { - void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) = - ScaleUVRowUp2_Linear_Any_C; - int i; - int y; - int dy; - - // This function can only scale up by 2 times horizontally. - assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_NEON - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON; - } -#endif - - if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width); - dst_uv += dst_stride; - y += dy; - } - } -} - -// Scale plane, up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original size, using bilinear interpolation. -// This is used to scale U and V planes of NV12 to NV24. -void ScaleUVBilinearUp2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { - void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleUVRowUp2_Bilinear_Any_C; - int x; - - // This function can only scale up by 2 times. 
- assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON - if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON; - } -#endif - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - // TODO(fbarchard): Test performance of writing one row of destination at a - // time. - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -// Scale 16 bit UV, horizontally up by 2 times. -// Uses linear filter horizontally, nearest vertically. -// This is an optimized version for scaling up a plane to 2 times of -// its original width, using linear interpolation. -// This is used to scale U and V planes of P210 to P410. -void ScaleUVLinearUp2_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_uv, - uint16_t* dst_uv) { - void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = - ScaleUVRowUp2_Linear_16_Any_C; - int i; - int y; - int dy; - - // This function can only scale up by 2 times horizontally. - assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; - } -#endif - - if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width); - dst_uv += dst_stride; - y += dy; - } - } -} - -// Scale 16 bit UV, up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original size, using bilinear interpolation. -// This is used to scale U and V planes of P010 to P410. -void ScaleUVBilinearUp2_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleUVRowUp2_Bilinear_16_Any_C; - int x; - - // This function can only scale up by 2 times. 
- assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON - if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; - } -#endif - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - // TODO(fbarchard): Test performance of writing one row of destination at a - // time. - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -// Scale UV to/from any dimensions, without interpolation. -// Fixed point math is used for performance: The upper 16 bits -// of x and dx is the integer part of the source position and -// the lower 16 bits are the fixed decimal part. - -static void ScaleUVSimple(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy) { - int j; - void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width, - int x, int dx) = - (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C; - (void)src_height; -#if defined(HAS_SCALEUVCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVCols = ScaleUVCols_SSSE3; - } -#endif -#if defined(HAS_SCALEUVCOLS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVCols = ScaleUVCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVCols = ScaleUVCols_NEON; - } - } -#endif -#if defined(HAS_SCALEUVCOLS_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleUVCols = ScaleUVCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleUVCols = ScaleUVCols_MMI; - } - } -#endif -#if defined(HAS_SCALEUVCOLS_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleUVCols = ScaleUVCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVCols = ScaleUVCols_MSA; - } - } -#endif - if (src_width * 2 == dst_width && x < 0x8000) { - ScaleUVCols = ScaleUVColsUp2_C; -#if defined(HAS_SCALEUVCOLSUP2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { - ScaleUVCols = ScaleUVColsUp2_SSSE3; - } -#endif -#if defined(HAS_SCALEUVCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleUVCols = ScaleUVColsUp2_MMI; - } -#endif - } - - for (j = 0; j < dst_height; ++j) { - ScaleUVCols(dst_uv, src_uv + (y >> 16) * src_stride, dst_width, x, dx); - dst_uv += dst_stride; - y += dy; - } -} - -// Copy UV with optional flipping -#if HAS_UVCOPY -static int UVCopy(const uint8_t* src_UV, - int src_stride_uv, - uint8_t* dst_UV, - int dst_stride_uv, - int width, - int height) { - if (!src_UV || !dst_UV || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
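The 16.16 convention described above ScaleUVSimple (upper 16 bits of x and dx are the integer source position, lower 16 bits the fraction) reduces to a very small loop in the unfiltered case; a scalar sketch, illustrative only and roughly what the ScaleUVCols_C fallback does:

#include <stdint.h>

// Sketch: nearest-neighbour UV column selection with 16.16 stepping.
static void UVColsSketch(uint8_t* dst_uv, const uint8_t* src_uv,
                         int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;                        // integer part picks the UV pair
    dst_uv[j * 2 + 0] = src_uv[xi * 2 + 0];
    dst_uv[j * 2 + 1] = src_uv[xi * 2 + 1];
    x += dx;                                 // fraction accumulates in low bits
  }
}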
- if (height < 0) { - height = -height; - src_UV = src_UV + (height - 1) * src_stride_uv; - src_stride_uv = -src_stride_uv; - } - - CopyPlane(src_UV, src_stride_uv, dst_UV, dst_stride_uv, width * 2, height); - return 0; -} - -static int UVCopy_16(const uint16_t* src_UV, - int src_stride_uv, - uint16_t* dst_UV, - int dst_stride_uv, - int width, - int height) { - if (!src_UV || !dst_UV || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_UV = src_UV + (height - 1) * src_stride_uv; - src_stride_uv = -src_stride_uv; - } - - CopyPlane_16(src_UV, src_stride_uv, dst_UV, dst_stride_uv, width * 2, height); - return 0; -} -#endif // HAS_UVCOPY - -// Scale a UV plane (from NV12) -// This function in turn calls a scaling function -// suitable for handling the desired resolutions. -static void ScaleUV(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering) { - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - // UV does not support box filter yet, but allow the user to pass it. - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative src_height means invert the image. - if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * src_stride; - src_stride = -src_stride; - } - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - if (clip_x) { - int64_t clipf = (int64_t)(clip_x)*dx; - x += (clipf & 0xffff); - src += (clipf >> 16) * 2; - dst += clip_x * 2; - } - if (clip_y) { - int64_t clipf = (int64_t)(clip_y)*dy; - y += (clipf & 0xffff); - src += (clipf >> 16) * src_stride; - dst += clip_y * dst_stride; - } - - // Special case for integer step values. - if (((dx | dy) & 0xffff) == 0) { - if (!dx || !dy) { // 1 pixel wide and/or tall. - filtering = kFilterNone; - } else { - // Optimized even scale down. ie 2, 4, 6, 8, 10x. - if (!(dx & 0x10000) && !(dy & 0x10000)) { -#if HAS_SCALEUVDOWN2 - if (dx == 0x20000) { - // Optimized 1/2 downsample. - ScaleUVDown2(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } -#endif -#if HAS_SCALEUVDOWN4BOX - if (dx == 0x40000 && filtering == kFilterBox) { - // Optimized 1/4 box downsample. - ScaleUVDown4Box(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy); - return; - } -#endif -#if HAS_SCALEUVDOWNEVEN - ScaleUVDownEven(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; -#endif - } - // Optimized odd scale down. ie 3, 5, 7, 9x. - if ((dx & 0x10000) && (dy & 0x10000)) { - filtering = kFilterNone; -#ifdef HAS_UVCOPY - if (dx == 0x10000 && dy == 0x10000) { - // Straight copy. - UVCopy(src + (y >> 16) * src_stride + (x >> 16) * 2, src_stride, dst, - dst_stride, clip_width, clip_height); - return; - } -#endif - } - } - } - // HAS_SCALEPLANEVERTICAL - if (dx == 0x10000 && (x & 0xffff) == 0) { - // Arbitrary scale vertically, but unscaled horizontally. 
- ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, - dst_stride, src, dst, x, y, dy, 4, filtering); - return; - } - if (filtering && (dst_width + 1) / 2 == src_width) { - ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride, - dst_stride, src, dst); - return; - } - if ((clip_height + 1) / 2 == src_height && - (clip_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst); - return; - } -#if HAS_SCALEUVBILINEARUP - if (filtering && dy < 65536) { - ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } -#endif -#if HAS_SCALEUVBILINEARDOWN - if (filtering) { - ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } -#endif - ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride, - dst_stride, src, dst, x, dx, y, dy); -} - -// Scale an UV image. -LIBYUV_API -int UVScale(const uint8_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint8_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 || - src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) { - return -1; - } - ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv, - dst_width, dst_height, 0, 0, dst_width, dst_height, filtering); - return 0; -} - -// Scale a 16 bit UV image. -// This function is currently incomplete, it can't handle all cases. -LIBYUV_API -int UVScale_16(const uint16_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint16_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int dy = 0; - - if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 || - src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) { - return -1; - } - - // UV does not support box filter yet, but allow the user to pass it. - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative src_height means invert the image. 
- if (src_height < 0) { - src_height = -src_height; - src_uv = src_uv + (src_height - 1) * src_stride_uv; - src_stride_uv = -src_stride_uv; - } - src_width = Abs(src_width); - -#ifdef HAS_UVCOPY - if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) { - if (dst_height == 1) { - UVCopy_16(src_uv + ((src_height - 1) / 2) * src_stride_uv, src_stride_uv, - dst_uv, dst_stride_uv, dst_width, dst_height); - } else { - dy = src_height / dst_height; - UVCopy_16(src_uv + src_stride_uv * ((dy - 1) / 2), src_stride_uv * dy, - dst_uv, dst_stride_uv, dst_width, dst_height); - } - - return 0; - } -#endif - - if (filtering && (dst_width + 1) / 2 == src_width) { - ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height, - src_stride_uv, dst_stride_uv, src_uv, dst_uv); - return 0; - } - - if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height, - src_stride_uv, dst_stride_uv, src_uv, dst_uv); - return 0; - } - - return -1; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_win.cc b/thirdparty/libyuv/source/scale_win.cc deleted file mode 100644 index ea1f95c..0000000 --- a/thirdparty/libyuv/source/scale_win.cc +++ /dev/null @@ -1,1392 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) - -// Offsets for source bytes 0 to 9 -static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 0 to 10 -static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, - 8, 9, 9, 10, 10, 11, 12, 13}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
-static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, - 10, 11, 12, 13, 13, 14, 14, 15}; - -// Coefficients for source bytes 0 to 10 -static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; - -// Coefficients for source bytes 10 to 21 -static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; - -// Coefficients for source bytes 21 to 31 -static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; - -// Coefficients for source bytes 21 to 31 -static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; - -static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, - 6, 8, 11, 14, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 0,1,2 -static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 3,4,5 -static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, - 6, 7, 12, 13, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x3 and 2x3 -static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, - 65536 / 9, 65536 / 6, 0, 0}; - -// Arrange first value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, - 11, 128, 14, 128, 128, 128, 128, 128}; - -// Arrange second value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, - 12, 128, 15, 128, 128, 128, 128, 128}; - -// Arrange third value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, - 13, 128, 128, 128, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x2 and 2x2 -static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, - 65536 / 3, 65536 / 2, 0, 0}; - -// Reads 32 pixels, throws half away and writes 16 pixels. -__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} - -// Blends 32x1 rectangle to 16x1. -__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. 
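-// Note: pmaddubsw with the 0x0101 constant sums each pair of adjacent bytes
-// into a 16 bit word; adding the two rows, halving, then pavgw against zero
-// gives a rounded 2x2 average, e.g. the block 3,4 / 4,5 produces 4.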
-__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add - paddw xmm1, xmm3 - psrlw xmm0, 1 - psrlw xmm1, 1 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - pop esi - ret - } -} - -#ifdef HAS_SCALEROWDOWN2_AVX2 -// Reads 64 pixels, throws half away and writes 32 pixels. -__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // isolate odd pixels. - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - vzeroupper - ret - } -} - -// Blends 64x1 rectangle to 32x1. -__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b - vpsrlw ymm4, ymm4, 15 - vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 - vpavgw ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - vzeroupper - ret - } -} - -// For rounding, average = (sum + 2) / 4 -// becomes average((sum >> 1), 0) -// Blends 64x2 rectangle to 32x1. 
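-// e.g. for sum = 7: (7 + 2) / 4 = 2, and average(7 >> 1, 0) = (3 + 1) / 2 = 2.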
-__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b - vpsrlw ymm4, ymm4, 15 - vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + esi] - vmovdqu ymm3, [eax + esi + 32] - lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add - vpaddw ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 - vpsrlw ymm1, ymm1, 1 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 - vpavgw ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_SCALEROWDOWN2_AVX2 - -// Point samples 32 pixels to 8 pixels. -__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 - psrld xmm5, 24 - pslld xmm5, 16 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm0, 8 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - ret - } -} - -// Blends 32x4 rectangle to 8x1. -__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - movdqa xmm5, xmm4 - packuswb xmm4, xmm4 - psllw xmm5, 3 // constant 0x0008 - - wloop: - movdqu xmm0, [eax] // average rows - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add rows 0, 1 - paddw xmm1, xmm3 - movdqu xmm2, [eax + esi * 2] - movdqu xmm3, [eax + esi * 2 + 16] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 2 - paddw xmm1, xmm3 - movdqu xmm2, [eax + edi] - movdqu xmm3, [eax + edi + 16] - lea eax, [eax + 32] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 3 - paddw xmm1, xmm3 - phaddw xmm0, xmm1 - paddw xmm0, xmm5 // + 8 for round - psrlw xmm0, 4 // /16 for average of 4 * 4 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - pop edi - pop esi - ret - } -} - -#ifdef HAS_SCALEROWDOWN4_AVX2 -// Point samples 64 pixels to 16 pixels. 
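-// The 0x00ff0000 mask comes from all ones shifted right 24 then left 16 per
-// 32 bit lane; anding with it keeps byte 2 of each 4 pixel group.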
-__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 - vpsrld ymm5, ymm5, 24 - vpslld ymm5, ymm5, 16 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - vzeroupper - ret - } -} - -// Blends 64x4 rectangle to 16x1. -__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 - vpsrlw ymm4, ymm4, 15 - vpsllw ymm5, ymm4, 3 // constant 0x0008 - vpackuswb ymm4, ymm4, ymm4 - - wloop: - vmovdqu ymm0, [eax] // average rows - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + esi] - vmovdqu ymm3, [eax + esi + 32] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 - vpaddw ymm1, ymm1, ymm3 - vmovdqu ymm2, [eax + esi * 2] - vmovdqu ymm3, [eax + esi * 2 + 32] - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 2 - vpaddw ymm1, ymm1, ymm3 - vmovdqu ymm2, [eax + edi] - vmovdqu ymm3, [eax + edi + 32] - lea eax, [eax + 64] - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 3 - vpaddw ymm1, ymm1, ymm3 - vphaddw ymm0, ymm0, ymm1 // mutates - vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw - vpaddw ymm0, ymm0, ymm5 // + 8 for round - vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_SCALEROWDOWN4_AVX2 - -// Point samples 32 pixels to 24 pixels. -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm3, xmmword ptr kShuf0 - movdqa xmm4, xmmword ptr kShuf1 - movdqa xmm5, xmmword ptr kShuf2 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm1 - palignr xmm1, xmm0, 8 - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 24x1 -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. 
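-// The kMadd tables hold the 3:1, 2:2 and 1:3 weights used to blend each
-// output pixel from its two nearest sources; kRound34 adds 2 before the
-// >> 2 so the result rounds.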
- -// Register usage: -// xmm0 src_row 0 -// xmm1 src_row 1 -// xmm2 shuf 0 -// xmm3 shuf 1 -// xmm4 shuf 2 -// xmm5 madd 0 -// xmm6 madd 1 -// xmm7 kRound34 - -// Note that movdqa+palign may be better than movdqu. -__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} - -// Note that movdqa+palign may be better than movdqu. -__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx+24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} - -// 3/8 point sampler - -// Scale 32 pixels to 12 -__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm4, xmmword ptr kShuf38a - movdqa xmm5, xmmword ptr kShuf38b - - xloop: - movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 - lea eax, [eax + 32] - pshufb xmm0, xmm4 - pshufb xmm1, xmm5 - 
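-    // combine the two shuffled halves: bytes 0..5 from xmm0, 6..11 from xmm1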
paddusb xmm0, xmm1 - - movq qword ptr [edx], xmm0 // write 12 pixels - movhlps xmm1, xmm0 - movd [edx + 8], xmm1 - lea edx, [edx + 12] - sub ecx, 12 - jg xloop - - ret - } -} - -// Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAc - movdqa xmm3, xmmword ptr kShufAc3 - movdqa xmm4, xmmword ptr kScaleAc33 - pxor xmm5, xmm5 - - xloop: - movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 - movdqu xmm6, [eax + esi] - movhlps xmm1, xmm0 - movhlps xmm7, xmm6 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - movdqu xmm6, [eax + esi * 2] - lea eax, [eax + 16] - movhlps xmm7, xmm6 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - pshufb xmm6, xmm2 - - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - pshufb xmm7, xmm3 - paddusw xmm6, xmm7 - - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 - packuswb xmm6, xmm6 - - movd [edx], xmm6 // write 6 pixels - psrlq xmm6, 16 - movd [edx + 2], xmm6 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} - -// Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAb0 - movdqa xmm3, xmmword ptr kShufAb1 - movdqa xmm4, xmmword ptr kShufAb2 - movdqa xmm5, xmmword ptr kScaleAb2 - - xloop: - movdqu xmm0, [eax] // average 2 rows into xmm0 - movdqu xmm1, [eax + esi] - lea eax, [eax + 16] - pavgb xmm0, xmm1 - - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 - pshufb xmm1, xmm2 - movdqa xmm6, xmm0 - pshufb xmm6, xmm3 - paddusw xmm1, xmm6 - pshufb xmm0, xmm4 - paddusw xmm1, xmm0 - - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 - packuswb xmm1, xmm1 - - movd [edx], xmm1 // write 6 pixels - psrlq xmm1, 16 - movd [edx + 2], xmm1 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} - -// Reads 16 bytes and accumulates to 16 shorts at a time. -__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr - mov ecx, [esp + 12] // src_width - pxor xmm5, xmm5 - - // sum rows - xloop: - movdqu xmm3, [eax] // read 16 bytes - lea eax, [eax + 16] - movdqu xmm0, [edx] // read 16 words from destination - movdqu xmm1, [edx + 16] - movdqa xmm2, xmm3 - punpcklbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - paddusw xmm0, xmm2 // sum 16 words - paddusw xmm1, xmm3 - movdqu [edx], xmm0 // write 16 words to destination - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 16 - jg xloop - ret - } -} - -#ifdef HAS_SCALEADDROW_AVX2 -// Reads 32 bytes and accumulates to 32 shorts at a time. 
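-// Same idea as the SSE2 version above: widen the bytes and add them into the
-// 16 bit sums at dst_ptr, which box filtering averages once all rows are added.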
-__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr - mov ecx, [esp + 12] // src_width - vpxor ymm5, ymm5, ymm5 - - // sum rows - xloop: - vmovdqu ymm3, [eax] // read 32 bytes - lea eax, [eax + 32] - vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck - vpunpcklbw ymm2, ymm3, ymm5 - vpunpckhbw ymm3, ymm3, ymm5 - vpaddusw ymm0, ymm2, [edx] // sum 16 words - vpaddusw ymm1, ymm3, [edx + 32] - vmovdqu [edx], ymm0 // write 32 words to destination - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 32 - jg xloop - - vzeroupper - ret - } -} -#endif // HAS_SCALEADDROW_AVX2 - -// Constant for making pixels signed to avoid pmaddubsw -// saturation. -static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Constant for making pixels unsigned and adding .5 for rounding. -static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, - 0x4040, 0x4040, 0x4040, 0x4040}; - -// Bilinear column filtering. SSSE3 version. -__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - push ebx - push esi - push edi - mov edi, [esp + 12 + 4] // dst_ptr - mov esi, [esp + 12 + 8] // src_ptr - mov ecx, [esp + 12 + 12] // dst_width - movd xmm2, [esp + 12 + 16] // x - movd xmm3, [esp + 12 + 20] // dx - mov eax, 0x04040000 // shuffle to line up fractions with pixel. - movd xmm5, eax - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pcmpeqb xmm7, xmm7 // generate 0x0001 - psrlw xmm7, 15 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm1, 9 // 7 bit fractions. - movzx ebx, word ptr [esi + edx] // 2 source x1 pixels - movd xmm4, ebx - pshufb xmm1, xmm5 // 0011 - punpcklwd xmm0, xmm4 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm1, xmm6 // 0..7f and 7f..0 - paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm1, xmm1 // 8 bits, 2 pixels. - movd ebx, xmm1 - mov [edi], bx - lea edi, [edi + 2] - sub ecx, 2 // 2 pixels - jge xloop2 - - xloop29: - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm2, 9 // 7 bit fractions. - pshufb xmm2, xmm5 // 0011 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm2, xmm6 // 0..7f and 7f..0 - paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm2, xmm0 // 16 bit - paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm2, xmm2 // 8 bits - movd ebx, xmm2 - mov [edi], bl - - xloop99: - - pop edi - pop esi - pop ebx - ret - } -} - -// Reads 16 pixels, duplicates them and writes 32 pixels. 
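-// punpcklbw/punpckhbw of a register with itself interleaves every byte with a
-// copy of itself, e.g. a,b,c,d -> a,a,b,b,c,c,d,d.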
-__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width - - wloop: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 - punpckhbw xmm1, xmm1 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - ret - } -} - -// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - shufps xmm0, xmm1, 0xdd - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} - -// Blends 8x1 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} - -// Blends 8x2 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop esi - ret - } -} - -// Reads 4 pixels at a time. -__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - wloop: - movd xmm0, [eax] - movd xmm1, [eax + ebx] - punpckldq xmm0, xmm1 - movd xmm2, [eax + ebx * 2] - movd xmm3, [eax + edi] - lea eax, [eax + ebx * 4] - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop ebx - ret - } -} - -// Blends four 2x2 to 4x1. 
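-// shufps with 0x88 gathers the even ARGB pixels and 0xdd the odd ones, so the
-// final pavgb averages each horizontal pair after the two rows were averaged.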
-__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - wloop: - movq xmm0, qword ptr [eax] // row0 4 pairs - movhps xmm0, qword ptr [eax + ebx] - movq xmm1, qword ptr [eax + ebx * 2] - movhps xmm1, qword ptr [eax + edi] - lea eax, [eax + ebx * 4] - movq xmm2, qword ptr [esi] // row1 4 pairs - movhps xmm2, qword ptr [esi + ebx] - movq xmm3, qword ptr [esi + ebx * 2] - movhps xmm3, qword ptr [esi + edi] - lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop esi - pop ebx - ret - } -} - -// Column scaling unfiltered. SSE2 version. -__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - push edi - push esi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - - pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 - pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 - paddd xmm2, xmm0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 2 - pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 - paddd xmm2, xmm0 // x3 x2 x1 x0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 4 - pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 - - pextrw eax, xmm2, 1 // get x0 integer. - pextrw edx, xmm2, 3 // get x1 integer. - - cmp ecx, 0 - jle xloop99 - sub ecx, 4 - jl xloop49 - - // 4 Pixel loop. - xloop4: - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - pextrw edx, xmm2, 7 // get x3 integer. - paddd xmm2, xmm3 // x += dx - punpckldq xmm0, xmm1 // x0 x1 - - movd xmm1, [esi + eax * 4] // 1 source x2 pixels - movd xmm4, [esi + edx * 4] // 1 source x3 pixels - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - punpckldq xmm1, xmm4 // x2 x3 - punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 // 4 pixels - jge xloop4 - - xloop49: - test ecx, 2 - je xloop29 - - // 2 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - punpckldq xmm0, xmm1 // x0 x1 - - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - - xloop29: - test ecx, 1 - je xloop99 - - // 1 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x2 pixels - movd dword ptr [edi], xmm0 - xloop99: - - pop esi - pop edi - ret - } -} - -// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
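-// Each output pixel is a weighted blend of the two nearest source pixels,
-// using the top 7 bits of the 16 bit x fraction as the weight (>> 7 rescales).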
-// TODO(fbarchard): Port to Neon - -// Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static const uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel -}; - -// Shuffle table for duplicating 2 fractions into 8 bytes each -static const uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, -}; - -__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - movdqa xmm4, xmmword ptr kShuffleColARGB - movdqa xmm5, xmmword ptr kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. - movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 2 // 2 pixels - jge xloop2 - - xloop29: - - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. - psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. - movd [edi], xmm0 - - xloop99: - - pop edi - pop esi - ret - } -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - mov edx, [esp + 4] // dst_argb - mov eax, [esp + 8] // src_argb - mov ecx, [esp + 12] // dst_width - - wloop: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpckldq xmm0, xmm0 - punpckhdq xmm1, xmm1 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg wloop - - ret - } -} - -// Divide num by div and return as 16.16 fixed point result. -__declspec(naked) int FixedDiv_X86(int num, int div) { - __asm { - mov eax, [esp + 4] // num - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 - shl eax, 16 - idiv dword ptr [esp + 8] - ret - } -} - -// Divide num by div and return as 16.16 fixed point result. 
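-// e.g. FixedDiv(1, 2) returns 0x8000, i.e. 0.5 in 16.16 fixed point. FixedDiv1
-// below effectively computes (num - 1) / (div - 1) for inclusive end points.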
-__declspec(naked) int FixedDiv1_X86(int num, int div) { - __asm { - mov eax, [esp + 4] // num - mov ecx, [esp + 8] // denom - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 - shl eax, 16 - sub eax, 0x00010001 - sbb edx, 0 - sub ecx, 1 - idiv ecx - ret - } -} -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/test.sh b/thirdparty/libyuv/source/test.sh deleted file mode 100644 index 7f12c3c..0000000 --- a/thirdparty/libyuv/source/test.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -set -x - -function runbenchmark1 { - perf record /google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test --gunit_filter=*$1 --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 - perf report | grep AVX -} - -runbenchmark1 ABGRToI420 -runbenchmark1 Android420ToI420 -runbenchmark1 ARGBToI420 -runbenchmark1 Convert16To8Plane -runbenchmark1 ConvertToARGB -runbenchmark1 ConvertToI420 -runbenchmark1 CopyPlane -runbenchmark1 H010ToAB30 -runbenchmark1 H010ToAR30 -runbenchmark1 HalfFloatPlane -runbenchmark1 I010ToAB30 -runbenchmark1 I010ToAR30 -runbenchmark1 I420Copy -runbenchmark1 I420Psnr -runbenchmark1 I420Scale -runbenchmark1 I420Ssim -runbenchmark1 I420ToARGB -runbenchmark1 I420ToNV12 -runbenchmark1 I420ToUYVY -runbenchmark1 I422ToI420 -runbenchmark1 InitCpuFlags -runbenchmark1 J420ToARGB -runbenchmark1 NV12ToARGB -runbenchmark1 NV12ToI420 -runbenchmark1 NV12ToI420Rotate -runbenchmark1 SetCpuFlags -runbenchmark1 YUY2ToI420 diff --git a/thirdparty/libyuv/source/video_common.cc b/thirdparty/libyuv/source/video_common.cc deleted file mode 100644 index 92384c0..0000000 --- a/thirdparty/libyuv/source/video_common.cc +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/video_common.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -struct FourCCAliasEntry { - uint32_t alias; - uint32_t canonical; -}; - -#define NUM_ALIASES 18 -static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { - {FOURCC_IYUV, FOURCC_I420}, - {FOURCC_YU12, FOURCC_I420}, - {FOURCC_YU16, FOURCC_I422}, - {FOURCC_YU24, FOURCC_I444}, - {FOURCC_YUYV, FOURCC_YUY2}, - {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs - {FOURCC_HDYC, FOURCC_UYVY}, - {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 - {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. - {FOURCC_DMB1, FOURCC_MJPG}, - {FOURCC_BA81, FOURCC_BGGR}, // deprecated. - {FOURCC_RGB3, FOURCC_RAW}, - {FOURCC_BGR3, FOURCC_24BG}, - {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB - {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB - {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 - {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 - {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 -}; -// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. 
-// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA - -LIBYUV_API -uint32_t CanonicalFourCC(uint32_t fourcc) { - int i; - for (i = 0; i < NUM_ALIASES; ++i) { - if (kFourCCAliases[i].alias == fourcc) { - return kFourCCAliases[i].canonical; - } - } - // Not an alias, so return it as-is. - return fourcc; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/util/Makefile b/thirdparty/libyuv/util/Makefile deleted file mode 100644 index 40e74b6..0000000 --- a/thirdparty/libyuv/util/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -psnr: psnr.cc ssim.cc psnr_main.cc -ifeq ($(CXX),icl) - $(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc -else - $(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all -endif - -# for MacOS -# /usr/local/bin/g++-7 -msse2 -O3 -fopenmp -Bstatic -o psnr psnr.cc ssim.cc psnr_main.cc diff --git a/thirdparty/libyuv/util/color.cc b/thirdparty/libyuv/util/color.cc deleted file mode 100644 index 8c3bbef..0000000 --- a/thirdparty/libyuv/util/color.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2021 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include - -// This utility computes values needed to generate yuvconstants based on -// white point values. -// The yuv formulas are tuned for 8 bit YUV channels. - -// For those MCs that can be represented as kr and kb: -// Full range -// float M[3][3] -// {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}}; -// float B[3] -// {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255}; -// Limited range -// float M[3][3] -// {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}}; -// float B[3] -// {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785}; - -// mc bt -// 1 bt.709 KR = 0.2126; KB = 0.0722 -// 4 fcc KR = 0.30; KB = 0.11 -// 6 bt.601 KR = 0.299; KB = 0.114 -// 7 SMPTE 240M KR = 0.212; KB = 0.087 -// 10 bt2020 KR = 0.2627; KB = 0.0593 - -// BT.709 full range YUV to RGB reference -// R = Y + V * 1.5748 -// G = Y - U * 0.18732 - V * 0.46812 -// B = Y + U * 1.8556 -// KR = 0.2126 -// KB = 0.0722 - -// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/ - -// // Y contribution to R,G,B. Scale and bias. -// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -// #define YB 32 /* 64 / 2 */ -// -// // U and V contributions to R,G,B. -// #define UB 113 /* round(1.77200 * 64) */ -// #define UG 22 /* round(0.34414 * 64) */ -// #define VG 46 /* round(0.71414 * 64) */ -// #define VR 90 /* round(1.40200 * 64) */ -// -// // Bias values to round, and subtract 128 from U and V. 
-// #define BB (-UB * 128 + YB) -// #define BG (UG * 128 + VG * 128 + YB) -// #define BR (-VR * 128 + YB) - -int round(float v) { - return (int)(v + 0.5); -} - -int main(int argc, const char* argv[]) { - if (argc < 2) { - printf("color kr kb\n"); - return -1; - } - float kr = atof(argv[1]); - float kb = atof(argv[2]); - float kg = 1 - kr - kb; - - float vr = 2 * (1 - kr); - float ug = 2 * ((1 - kb) * kb / kg); - float vg = 2 * ((1 - kr) * kr / kg); - float ub = 2 * (1 - kb); - - printf("Full range\n"); - printf("R = Y + V * %5f\n", vr); - printf("G = Y - U * %6f - V * %6f\n", ug, vg); - printf("B = Y + U * %5f\n", ub); - - printf("KR = %4f; ", kr); - printf("KB = %4f\n", kb); - // printf("KG = %4f\n", kg); - // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ - // #define YB 32 /* 64 / 2 */ - // - // // U and V contributions to R,G,B. - - printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub); - printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug); - printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg); - printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr); - - vr = 255.f / 224.f * 2 * (1 - kr); - ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg); - vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg); - ub = 255.f / 224.f * 2 * (1 - kb); - - printf("Limited range\n"); - printf("R = (Y - 16) * 1.164 + V * %5f\n", vr); - printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg); - printf("B = (Y - 16) * 1.164 + U * %5f\n", ub); - - // printf("KG = %4f\n", kg); - // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ - // #define YB 32 /* 64 / 2 */ - // - // // U and V contributions to R,G,B. - - printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub); - printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug); - printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg); - printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr); - - return 0; -} diff --git a/thirdparty/libyuv/util/compare.cc b/thirdparty/libyuv/util/compare.cc deleted file mode 100644 index a16613e..0000000 --- a/thirdparty/libyuv/util/compare.cc +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include -#include - -#include "libyuv/basic_types.h" -#include "libyuv/compare.h" -#include "libyuv/version.h" - -int main(int argc, char** argv) { - if (argc < 1) { - printf("libyuv compare v%d\n", LIBYUV_VERSION); - printf("compare file1.yuv file2.yuv\n"); - return -1; - } - char* name1 = argv[1]; - char* name2 = (argc > 2) ? argv[2] : NULL; - FILE* fin1 = fopen(name1, "rb"); - FILE* fin2 = name2 ? fopen(name2, "rb") : NULL; - - const int kBlockSize = 32768; - uint8_t buf1[kBlockSize]; - uint8_t buf2[kBlockSize]; - uint32_t hash1 = 5381; - uint32_t hash2 = 5381; - uint64_t sum_square_err = 0; - uint64_t size_min = 0; - int amt1 = 0; - int amt2 = 0; - do { - amt1 = static_cast(fread(buf1, 1, kBlockSize, fin1)); - if (amt1 > 0) { - hash1 = libyuv::HashDjb2(buf1, amt1, hash1); - } - if (fin2) { - amt2 = static_cast(fread(buf2, 1, kBlockSize, fin2)); - if (amt2 > 0) { - hash2 = libyuv::HashDjb2(buf2, amt2, hash2); - } - int amt_min = (amt1 < amt2) ? 
amt1 : amt2; - size_min += amt_min; - sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min); - } - } while (amt1 > 0 || amt2 > 0); - - printf("hash1 %x", hash1); - if (fin2) { - printf(", hash2 %x", hash2); - double mse = - static_cast(sum_square_err) / static_cast(size_min); - printf(", mse %.2f", mse); - double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min); - printf(", psnr %.2f\n", psnr); - fclose(fin2); - } - fclose(fin1); -} diff --git a/thirdparty/libyuv/util/cpuid.c b/thirdparty/libyuv/util/cpuid.c deleted file mode 100644 index 46f9c1b..0000000 --- a/thirdparty/libyuv/util/cpuid.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include - -#include "libyuv/cpu_id.h" - -#ifdef __cplusplus -using namespace libyuv; -#endif - -int main(int argc, const char* argv[]) { - int cpu_flags = TestCpuFlag(-1); - int has_arm = TestCpuFlag(kCpuHasARM); - int has_mips = TestCpuFlag(kCpuHasMIPS); - int has_x86 = TestCpuFlag(kCpuHasX86); - (void)argc; - (void)argv; - -#if defined(__i386__) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_X64) - if (has_x86) { - int family, model, cpu_info[4]; - // Vendor ID: - // AuthenticAMD AMD processor - // CentaurHauls Centaur processor - // CyrixInstead Cyrix processor - // GenuineIntel Intel processor - // GenuineTMx86 Transmeta processor - // Geode by NSC National Semiconductor processor - // NexGenDriven NexGen processor - // RiseRiseRise Rise Technology processor - // SiS SiS SiS SiS processor - // UMC UMC UMC UMC processor - CpuId(0, 0, &cpu_info[0]); - cpu_info[0] = cpu_info[1]; // Reorder output - cpu_info[1] = cpu_info[3]; - cpu_info[3] = 0; - printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0])); - - // CPU Family and Model - // 3:0 - Stepping - // 7:4 - Model - // 11:8 - Family - // 13:12 - Processor Type - // 19:16 - Extended Model - // 27:20 - Extended Family - CpuId(1, 0, &cpu_info[0]); - family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); - model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); - printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, - model, model); - } -#endif - printf("Cpu Flags %x\n", cpu_flags); - printf("Has ARM %x\n", has_arm); - printf("Has MIPS %x\n", has_mips); - printf("Has X86 %x\n", has_x86); - if (has_arm) { - int has_neon = TestCpuFlag(kCpuHasNEON); - printf("Has NEON %x\n", has_neon); - } - if (has_mips) { - int has_msa = TestCpuFlag(kCpuHasMSA); - printf("Has MSA %x\n", has_msa); - int has_mmi = TestCpuFlag(kCpuHasMMI); - printf("Has MMI %x\n", has_mmi); - } - if (has_x86) { - int has_sse2 = TestCpuFlag(kCpuHasSSE2); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - int has_sse41 = TestCpuFlag(kCpuHasSSE41); - int has_sse42 = TestCpuFlag(kCpuHasSSE42); - int has_avx = TestCpuFlag(kCpuHasAVX); - int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_erms = TestCpuFlag(kCpuHasERMS); - int has_fma3 = TestCpuFlag(kCpuHasFMA3); - int has_f16c = TestCpuFlag(kCpuHasF16C); - int has_gfni = TestCpuFlag(kCpuHasGFNI); - int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW); - int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL); - int has_avx512vbmi = 
 TestCpuFlag(kCpuHasAVX512VBMI);
-    int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
-    int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
-    int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ);
-    printf("Has SSE2 %x\n", has_sse2);
-    printf("Has SSSE3 %x\n", has_ssse3);
-    printf("Has SSE4.1 %x\n", has_sse41);
-    printf("Has SSE4.2 %x\n", has_sse42);
-    printf("Has AVX %x\n", has_avx);
-    printf("Has AVX2 %x\n", has_avx2);
-    printf("Has ERMS %x\n", has_erms);
-    printf("Has FMA3 %x\n", has_fma3);
-    printf("Has F16C %x\n", has_f16c);
-    printf("Has GFNI %x\n", has_gfni);
-    printf("Has AVX512BW %x\n", has_avx512bw);
-    printf("Has AVX512VL %x\n", has_avx512vl);
-    printf("Has AVX512VBMI %x\n", has_avx512vbmi);
-    printf("Has AVX512VBMI2 %x\n", has_avx512vbmi2);
-    printf("Has AVX512VBITALG %x\n", has_avx512vbitalg);
-    printf("Has AVX512VPOPCNTDQ %x\n", has_avx512vpopcntdq);
-  }
-  return 0;
-}
-
diff --git a/thirdparty/libyuv/util/i444tonv12_eg.cc b/thirdparty/libyuv/util/i444tonv12_eg.cc
deleted file mode 100644
index 0fcb409..0000000
--- a/thirdparty/libyuv/util/i444tonv12_eg.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-#include "libyuv/convert.h"
-
-#include <stdio.h>   // for printf
-#include <string.h>  // for memset
-
-int main(int, char**) {
-  unsigned char src_i444[640 * 400 * 3];
-  unsigned char dst_nv12[640 * 400 * 3 / 2];
-
-  for (size_t i = 0; i < sizeof(src_i444); ++i) {
-    src_i444[i] = i & 255;
-  }
-  memset(dst_nv12, 0, sizeof(dst_nv12));
-  libyuv::I444ToNV12(&src_i444[0], 640,             // source Y
-                     &src_i444[640 * 400], 640,      // source U
-                     &src_i444[640 * 400 * 2], 640,  // source V
-                     &dst_nv12[0], 640,              // dest Y
-                     &dst_nv12[640 * 400], 640,      // dest UV
-                     640, 400);                      // width and height
-
-  int checksum = 0;
-  for (size_t i = 0; i < sizeof(dst_nv12); ++i) {
-    checksum += dst_nv12[i];
-  }
-  printf("checksum %x %s\n", checksum, checksum == 0x2ec0c00 ? "PASS" : "FAIL");
-  return 0;
-}
\ No newline at end of file
diff --git a/thirdparty/libyuv/util/psnr.cc b/thirdparty/libyuv/util/psnr.cc
deleted file mode 100644
index c7bee7f..0000000
--- a/thirdparty/libyuv/util/psnr.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./psnr.h"  // NOLINT
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-#ifdef _MSC_VER
-#include <intrin.h>  // For __cpuid()
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef unsigned int uint32_t;  // NOLINT
-#ifdef _MSC_VER
-typedef unsigned __int64 uint64_t;
-#else  // COMPILER_MSVC
-#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long uint64_t;  // NOLINT
-#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long long uint64_t;  // NOLINT
-#endif  // __LP64__
-#endif  // _MSC_VER
-
-// libyuv provides this function when linking library for jpeg support.
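-// (SSE in this file is the sum of squared error metric, not the x86 SIMD
-// instruction set; SIMD implementations of it are selected below.)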
-#if !defined(HAVE_JPEG) - -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) -#define HAS_SUMSQUAREERROR_NEON -static uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - volatile uint32_t sse; - asm volatile( - "vmov.u8 q7, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - - "1: \n" - "vld1.u8 {q0}, [%0]! \n" - "vld1.u8 {q1}, [%1]! \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q7, d4, d4 \n" - "vmlal.s16 q8, d6, d6 \n" - "vmlal.s16 q8, d5, d5 \n" - "vmlal.s16 q10, d7, d7 \n" - "subs %2, %2, #16 \n" - "bhi 1b \n" - - "vadd.u32 q7, q7, q8 \n" - "vadd.u32 q9, q9, q10 \n" - "vadd.u32 q10, q7, q9 \n" - "vpaddl.u32 q1, q10 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10"); - return sse; -} -#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -#define HAS_SUMSQUAREERROR_NEON -static uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - volatile uint32_t sse; - asm volatile( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" - - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" - - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); - return sse; -} -#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_SUMSQUAREERROR_SSE2 -__declspec(naked) static uint32_t SumSquareError_SSE2(const uint8_t* /*src_a*/, - const uint8_t* /*src_b*/, - int /*count*/) { - __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count - pxor xmm0, xmm0 - pxor xmm5, xmm5 - sub edx, eax - - wloop: - movdqu xmm1, [eax] - movdqu xmm2, [eax + edx] - lea eax, [eax + 16] - movdqu xmm3, xmm1 - psubusb xmm1, xmm2 - psubusb xmm2, xmm3 - por xmm1, xmm2 - movdqu xmm2, xmm1 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm0, xmm1 - paddd xmm0, xmm2 - sub ecx, 16 - ja wloop - - pshufd xmm1, xmm0, 0EEh - paddd xmm0, xmm1 - pshufd xmm1, xmm0, 01h - paddd xmm0, xmm1 - movd eax, xmm0 - ret - } -} -#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -#define HAS_SUMSQUAREERROR_SSE2 -static uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( // NOLINT - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm1 \n" - "movdqu (%0,%1,1),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqu %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqu %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - - 
"pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" - - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - : - : "memory", "cc" -#if defined(__SSE2__) - , - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); // NOLINT - return sse; -} -#endif // LIBYUV_DISABLE_X86 etc - -#if defined(HAS_SUMSQUAREERROR_SSE2) -#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile( // NOLINT - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), - "=d"(cpu_info[3]) - : "a"(info_type)); -} -// For gcc/clang but not clangcl. -#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__)) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile( // NOLINT - "cpuid \n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), - "=d"(cpu_info[3]) - : "a"(info_type)); -} -#endif - -static int CpuHasSSE2() { -#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) - int cpu_info[4]; - __cpuid(cpu_info, 1); - if (cpu_info[3] & 0x04000000) { - return 1; - } -#endif - return 0; -} -#endif // HAS_SUMSQUAREERROR_SSE2 - -static uint32_t SumSquareError_C(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse = 0u; - for (int x = 0; x < count; ++x) { - int diff = src_a[x] - src_b[x]; - sse += static_cast(diff * diff); - } - return sse; -} - -double ComputeSumSquareError(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, - int count) = SumSquareError_C; -#if defined(HAS_SUMSQUAREERROR_NEON) - SumSquareError = SumSquareError_NEON; -#endif -#if defined(HAS_SUMSQUAREERROR_SSE2) - if (CpuHasSSE2()) { - SumSquareError = SumSquareError_SSE2; - } -#endif - const int kBlockSize = 1 << 15; - uint64_t sse = 0; -#ifdef _OPENMP -#pragma omp parallel for reduction(+ : sse) -#endif - for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { - sse += SumSquareError(src_a + i, src_b + i, kBlockSize); - } - src_a += count & ~(kBlockSize - 1); - src_b += count & ~(kBlockSize - 1); - int remainder = count & (kBlockSize - 1) & ~15; - if (remainder) { - sse += SumSquareError(src_a, src_b, remainder); - src_a += remainder; - src_b += remainder; - } - remainder = count & 15; - if (remainder) { - sse += SumSquareError_C(src_a, src_b, remainder); - } - return static_cast(sse); -} -#endif - -// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse) -// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match). -double ComputePSNR(double sse, double size) { - const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0); - if (sse <= kMINSSE) { - sse = kMINSSE; // Produces max PSNR of 128 - } - return 10.0 * log10(255.0 * 255.0 * size / sse); -} - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/thirdparty/libyuv/util/psnr.h b/thirdparty/libyuv/util/psnr.h deleted file mode 100644 index aac128c..0000000 --- a/thirdparty/libyuv/util/psnr.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format - -#ifndef UTIL_PSNR_H_ // NOLINT -#define UTIL_PSNR_H_ - -#include // For log10() - -#ifdef __cplusplus -extern "C" { -#endif - -#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED) -typedef unsigned char uint8_t; -#define UINT8_TYPE_DEFINED -#endif - -static const double kMaxPSNR = 128.0; - -// libyuv provides this function when linking library for jpeg support. -// TODO(fbarchard): make psnr lib compatible subset of libyuv. -#if !defined(HAVE_JPEG) -// Computer Sum of Squared Error (SSE). -// Pass this to ComputePSNR for final result. -double ComputeSumSquareError(const uint8_t* src_a, - const uint8_t* src_b, - int count); -#endif - -// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse) -// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match). -double ComputePSNR(double sse, double size); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // UTIL_PSNR_H_ // NOLINT diff --git a/thirdparty/libyuv/util/psnr_main.cc b/thirdparty/libyuv/util/psnr_main.cc deleted file mode 100644 index a930b20..0000000 --- a/thirdparty/libyuv/util/psnr_main.cc +++ /dev/null @@ -1,633 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Get PSNR or SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format -// To build: g++ -O3 -o psnr psnr.cc ssim.cc psnr_main.cc -// or VisualC: cl /Ox psnr.cc ssim.cc psnr_main.cc -// -// To enable OpenMP and SSE2 -// gcc: g++ -msse2 -O3 -fopenmp -o psnr psnr.cc ssim.cc psnr_main.cc -// vc: cl /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc -// -// Usage: psnr org_seq rec_seq -s width height [-skip skip_org skip_rec] - -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif - -#include -#include -#include -#include -#ifdef _OPENMP -#include -#endif - -#include "./psnr.h" -#include "./ssim.h" -#ifdef HAVE_JPEG -#include "libyuv/compare.h" -#include "libyuv/convert.h" -#endif - -struct metric { - double y, u, v, all; - double min_y, min_u, min_v, min_all; - double global_y, global_u, global_v, global_all; - int min_frame; -}; - -// options -bool verbose = false; -bool quiet = false; -bool show_name = false; -bool do_swap_uv = false; -bool do_psnr = false; -bool do_ssim = false; -bool do_mse = false; -bool do_lssim = false; -int image_width = 0, image_height = 0; -int fileindex_org = 0; // argv argument contains the source file name. -int fileindex_rec = 0; // argv argument contains the destination file name. -int num_rec = 0; -int num_skip_org = 0; -int num_skip_rec = 0; -int num_frames = 0; -#ifdef _OPENMP -int num_threads = 0; -#endif - -// Parse PYUV format. ie name.1920x800_24Hz_P420.yuv -bool ExtractResolutionFromFilename(const char* name, - int* width_ptr, - int* height_ptr) { - // Isolate the .width_height. section of the filename by searching for a - // dot or underscore followed by a digit. - for (int i = 0; name[i]; ++i) { - if ((name[i] == '.' 
|| name[i] == '_') && name[i + 1] >= '0' && - name[i + 1] <= '9') { - int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT - if (2 == n) { - return true; - } - } - } - -#ifdef HAVE_JPEG - // Try parsing file as a jpeg. - FILE* const file_org = fopen(name, "rb"); - if (file_org == NULL) { - fprintf(stderr, "Cannot open %s\n", name); - return false; - } - fseek(file_org, 0, SEEK_END); - size_t total_size = ftell(file_org); - fseek(file_org, 0, SEEK_SET); - uint8_t* const ch_org = new uint8_t[total_size]; - memset(ch_org, 0, total_size); - size_t bytes_org = fread(ch_org, sizeof(uint8_t), total_size, file_org); - fclose(file_org); - if (bytes_org == total_size) { - if (0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) { - delete[] ch_org; - return true; - } - } - delete[] ch_org; -#endif // HAVE_JPEG - return false; -} - -// Scale Y channel from 16..240 to 0..255. -// This can be useful when comparing codecs that are inconsistant about Y -uint8_t ScaleY(uint8_t y) { - int ny = (y - 16) * 256 / 224; - if (ny < 0) { - ny = 0; - } - if (ny > 255) { - ny = 255; - } - return static_cast(ny); -} - -// MSE = Mean Square Error -double GetMSE(double sse, double size) { - return sse / size; -} - -void PrintHelp(const char* program) { - printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program); -#ifdef HAVE_JPEG - printf("jpeg or raw YUV 420 supported.\n"); -#endif - printf("options:\n"); - printf( - " -s .... specify YUV size, mandatory if none of the " - "sequences have the\n"); - printf( - " resolution embedded in their filename (ie. " - "name.1920x800_24Hz_P420.yuv)\n"); - printf(" -psnr .................. compute PSNR (default)\n"); - printf(" -ssim .................. compute SSIM\n"); - printf(" -mse ................... compute MSE\n"); - printf(" -swap .................. Swap U and V plane\n"); - printf(" -skip ...... Number of frame to skip of org and rec\n"); - printf(" -frames .......... Number of frames to compare\n"); -#ifdef _OPENMP - printf(" -t ............... Number of threads\n"); -#endif - printf(" -n ..................... Show file name\n"); - printf(" -v ..................... verbose++\n"); - printf(" -q ..................... quiet\n"); - printf(" -h ..................... 
this help\n"); - exit(0); -} - -void ParseOptions(int argc, const char* argv[]) { - if (argc <= 1) { - PrintHelp(argv[0]); - } - for (int c = 1; c < argc; ++c) { - if (!strcmp(argv[c], "-v")) { - verbose = true; - } else if (!strcmp(argv[c], "-q")) { - quiet = true; - } else if (!strcmp(argv[c], "-n")) { - show_name = true; - } else if (!strcmp(argv[c], "-psnr")) { - do_psnr = true; - } else if (!strcmp(argv[c], "-mse")) { - do_mse = true; - } else if (!strcmp(argv[c], "-ssim")) { - do_ssim = true; - } else if (!strcmp(argv[c], "-lssim")) { - do_ssim = true; - do_lssim = true; - } else if (!strcmp(argv[c], "-swap")) { - do_swap_uv = true; - } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) { - PrintHelp(argv[0]); - } else if (!strcmp(argv[c], "-s") && c + 2 < argc) { - image_width = atoi(argv[++c]); // NOLINT - image_height = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-skip") && c + 2 < argc) { - num_skip_org = atoi(argv[++c]); // NOLINT - num_skip_rec = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) { - num_frames = atoi(argv[++c]); // NOLINT -#ifdef _OPENMP - } else if (!strcmp(argv[c], "-t") && c + 1 < argc) { - num_threads = atoi(argv[++c]); // NOLINT -#endif - } else if (argv[c][0] == '-') { - fprintf(stderr, "Unknown option. %s\n", argv[c]); - } else if (fileindex_org == 0) { - fileindex_org = c; - } else if (fileindex_rec == 0) { - fileindex_rec = c; - num_rec = 1; - } else { - ++num_rec; - } - } - if (fileindex_org == 0 || fileindex_rec == 0) { - fprintf(stderr, "Missing filenames\n"); - PrintHelp(argv[0]); - } - if (num_skip_org < 0 || num_skip_rec < 0) { - fprintf(stderr, "Skipped frames incorrect\n"); - PrintHelp(argv[0]); - } - if (num_frames < 0) { - fprintf(stderr, "Number of frames incorrect\n"); - PrintHelp(argv[0]); - } - if (image_width == 0 || image_height == 0) { - int org_width, org_height; - int rec_width, rec_height; - bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org], - &org_width, &org_height); - bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec], - &rec_width, &rec_height); - if (org_res_avail) { - if (rec_res_avail) { - if ((org_width == rec_width) && (org_height == rec_height)) { - image_width = org_width; - image_height = org_height; - } else { - fprintf(stderr, "Sequences have different resolutions.\n"); - PrintHelp(argv[0]); - } - } else { - image_width = org_width; - image_height = org_height; - } - } else if (rec_res_avail) { - image_width = rec_width; - image_height = rec_height; - } else { - fprintf(stderr, "Missing dimensions.\n"); - PrintHelp(argv[0]); - } - } -} - -bool UpdateMetrics(uint8_t* ch_org, - uint8_t* ch_rec, - const int y_size, - const int uv_size, - const size_t total_size, - int number_of_frames, - metric* cur_distortion_psnr, - metric* distorted_frame, - bool do_psnr) { - const int uv_offset = (do_swap_uv ? 
uv_size : 0); - const uint8_t* const u_org = ch_org + y_size + uv_offset; - const uint8_t* const u_rec = ch_rec + y_size; - const uint8_t* const v_org = ch_org + y_size + (uv_size - uv_offset); - const uint8_t* const v_rec = ch_rec + y_size + uv_size; - if (do_psnr) { -#ifdef HAVE_JPEG - double y_err = static_cast( - libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size)); - double u_err = static_cast( - libyuv::ComputeSumSquareError(u_org, u_rec, uv_size)); - double v_err = static_cast( - libyuv::ComputeSumSquareError(v_org, v_rec, uv_size)); -#else - double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size); - double u_err = ComputeSumSquareError(u_org, u_rec, uv_size); - double v_err = ComputeSumSquareError(v_org, v_rec, uv_size); -#endif - const double total_err = y_err + u_err + v_err; - cur_distortion_psnr->global_y += y_err; - cur_distortion_psnr->global_u += u_err; - cur_distortion_psnr->global_v += v_err; - cur_distortion_psnr->global_all += total_err; - distorted_frame->y = ComputePSNR(y_err, static_cast(y_size)); - distorted_frame->u = ComputePSNR(u_err, static_cast(uv_size)); - distorted_frame->v = ComputePSNR(v_err, static_cast(uv_size)); - distorted_frame->all = - ComputePSNR(total_err, static_cast(total_size)); - } else { - distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height); - distorted_frame->u = - CalcSSIM(u_org, u_rec, (image_width + 1) / 2, (image_height + 1) / 2); - distorted_frame->v = - CalcSSIM(v_org, v_rec, (image_width + 1) / 2, (image_height + 1) / 2); - distorted_frame->all = - (distorted_frame->y + distorted_frame->u + distorted_frame->v) / - total_size; - distorted_frame->y /= y_size; - distorted_frame->u /= uv_size; - distorted_frame->v /= uv_size; - - if (do_lssim) { - distorted_frame->all = CalcLSSIM(distorted_frame->all); - distorted_frame->y = CalcLSSIM(distorted_frame->y); - distorted_frame->u = CalcLSSIM(distorted_frame->u); - distorted_frame->v = CalcLSSIM(distorted_frame->v); - } - } - - cur_distortion_psnr->y += distorted_frame->y; - cur_distortion_psnr->u += distorted_frame->u; - cur_distortion_psnr->v += distorted_frame->v; - cur_distortion_psnr->all += distorted_frame->all; - - bool ismin = false; - if (distorted_frame->y < cur_distortion_psnr->min_y) { - cur_distortion_psnr->min_y = distorted_frame->y; - } - if (distorted_frame->u < cur_distortion_psnr->min_u) { - cur_distortion_psnr->min_u = distorted_frame->u; - } - if (distorted_frame->v < cur_distortion_psnr->min_v) { - cur_distortion_psnr->min_v = distorted_frame->v; - } - if (distorted_frame->all < cur_distortion_psnr->min_all) { - cur_distortion_psnr->min_all = distorted_frame->all; - cur_distortion_psnr->min_frame = number_of_frames; - ismin = true; - } - return ismin; -} - -int main(int argc, const char* argv[]) { - ParseOptions(argc, argv); - if (!do_psnr && !do_ssim) { - do_psnr = true; - } - -#ifdef _OPENMP - if (num_threads) { - omp_set_num_threads(num_threads); - } - if (verbose) { - printf("OpenMP %d procs\n", omp_get_num_procs()); - } -#endif - // Open original file (first file argument) - FILE* const file_org = fopen(argv[fileindex_org], "rb"); - if (file_org == NULL) { - fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]); - exit(1); - } - - // Open all files to compare to - FILE** file_rec = new FILE*[num_rec]; - memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "rb"); - if (file_rec[cur_rec] == NULL) { - fprintf(stderr, "Cannot 
open %s\n", argv[fileindex_rec + cur_rec]); - fclose(file_org); - for (int i = 0; i < cur_rec; ++i) { - fclose(file_rec[i]); - } - delete[] file_rec; - exit(1); - } - } - - const int y_size = image_width * image_height; - const int uv_size = ((image_width + 1) / 2) * ((image_height + 1) / 2); - const size_t total_size = y_size + 2 * uv_size; // NOLINT -#if defined(_MSC_VER) - _fseeki64( - file_org, - static_cast<__int64>(num_skip_org) * static_cast<__int64>(total_size), - SEEK_SET); -#else - fseek(file_org, num_skip_org * total_size, SEEK_SET); -#endif - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { -#if defined(_MSC_VER) - _fseeki64( - file_rec[cur_rec], - static_cast<__int64>(num_skip_rec) * static_cast<__int64>(total_size), - SEEK_SET); -#else - fseek(file_rec[cur_rec], num_skip_rec * total_size, SEEK_SET); -#endif - } - - uint8_t* const ch_org = new uint8_t[total_size]; - uint8_t* const ch_rec = new uint8_t[total_size]; - if (ch_org == NULL || ch_rec == NULL) { - fprintf(stderr, "No memory available\n"); - fclose(file_org); - for (int i = 0; i < num_rec; ++i) { - fclose(file_rec[i]); - } - delete[] ch_org; - delete[] ch_rec; - delete[] file_rec; - exit(1); - } - - metric* const distortion_psnr = new metric[num_rec]; - metric* const distortion_ssim = new metric[num_rec]; - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - metric* cur_distortion_psnr = &distortion_psnr[cur_rec]; - cur_distortion_psnr->y = 0.0; - cur_distortion_psnr->u = 0.0; - cur_distortion_psnr->v = 0.0; - cur_distortion_psnr->all = 0.0; - cur_distortion_psnr->min_y = kMaxPSNR; - cur_distortion_psnr->min_u = kMaxPSNR; - cur_distortion_psnr->min_v = kMaxPSNR; - cur_distortion_psnr->min_all = kMaxPSNR; - cur_distortion_psnr->min_frame = 0; - cur_distortion_psnr->global_y = 0.0; - cur_distortion_psnr->global_u = 0.0; - cur_distortion_psnr->global_v = 0.0; - cur_distortion_psnr->global_all = 0.0; - distortion_ssim[cur_rec] = cur_distortion_psnr[cur_rec]; - } - - if (verbose) { - printf("Size: %dx%d\n", image_width, image_height); - } - - if (!quiet) { - printf("Frame"); - if (do_psnr) { - printf("\t PSNR-Y \t PSNR-U \t PSNR-V \t PSNR-All \t Frame"); - } - if (do_ssim) { - printf("\t SSIM-Y\t SSIM-U\t SSIM-V\t SSIM-All\t Frame"); - } - if (show_name) { - printf("\tName\n"); - } else { - printf("\n"); - } - } - - int number_of_frames; - for (number_of_frames = 0;; ++number_of_frames) { - if (num_frames && number_of_frames >= num_frames) { - break; - } - - size_t bytes_org = fread(ch_org, sizeof(uint8_t), total_size, file_org); - if (bytes_org < total_size) { -#ifdef HAVE_JPEG - // Try parsing file as a jpeg. - uint8_t* const ch_jpeg = new uint8_t[bytes_org]; - memcpy(ch_jpeg, ch_org, bytes_org); - memset(ch_org, 0, total_size); - - if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org, ch_org, image_width, - ch_org + y_size, (image_width + 1) / 2, - ch_org + y_size + uv_size, - (image_width + 1) / 2, image_width, - image_height, image_width, image_height)) { - delete[] ch_jpeg; - break; - } - delete[] ch_jpeg; -#else - break; -#endif // HAVE_JPEG - } - - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - size_t bytes_rec = - fread(ch_rec, sizeof(uint8_t), total_size, file_rec[cur_rec]); - if (bytes_rec < total_size) { -#ifdef HAVE_JPEG - // Try parsing file as a jpeg. 
- uint8_t* const ch_jpeg = new uint8_t[bytes_rec]; - memcpy(ch_jpeg, ch_rec, bytes_rec); - memset(ch_rec, 0, total_size); - - if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec, ch_rec, image_width, - ch_rec + y_size, (image_width + 1) / 2, - ch_rec + y_size + uv_size, - (image_width + 1) / 2, image_width, - image_height, image_width, image_height)) { - delete[] ch_jpeg; - break; - } - delete[] ch_jpeg; -#else - break; -#endif // HAVE_JPEG - } - - if (verbose) { - printf("%5d", number_of_frames); - } - if (do_psnr) { - metric distorted_frame = {}; - metric* cur_distortion_psnr = &distortion_psnr[cur_rec]; - bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size, - number_of_frames, cur_distortion_psnr, - &distorted_frame, true); - if (verbose) { - printf("\t%10.6f", distorted_frame.y); - printf("\t%10.6f", distorted_frame.u); - printf("\t%10.6f", distorted_frame.v); - printf("\t%10.6f", distorted_frame.all); - printf("\t%5s", ismin ? "min" : ""); - } - } - if (do_ssim) { - metric distorted_frame = {}; - metric* cur_distortion_ssim = &distortion_ssim[cur_rec]; - bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size, - number_of_frames, cur_distortion_ssim, - &distorted_frame, false); - if (verbose) { - printf("\t%10.6f", distorted_frame.y); - printf("\t%10.6f", distorted_frame.u); - printf("\t%10.6f", distorted_frame.v); - printf("\t%10.6f", distorted_frame.all); - printf("\t%5s", ismin ? "min" : ""); - } - } - if (verbose) { - if (show_name) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - } - printf("\n"); - } - } - } - - // Final PSNR computation. - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - metric* cur_distortion_psnr = &distortion_psnr[cur_rec]; - metric* cur_distortion_ssim = &distortion_ssim[cur_rec]; - if (number_of_frames > 0) { - const double norm = 1. 
/ static_cast(number_of_frames); - cur_distortion_psnr->y *= norm; - cur_distortion_psnr->u *= norm; - cur_distortion_psnr->v *= norm; - cur_distortion_psnr->all *= norm; - cur_distortion_ssim->y *= norm; - cur_distortion_ssim->u *= norm; - cur_distortion_ssim->v *= norm; - cur_distortion_ssim->all *= norm; - } - - if (do_psnr) { - const double global_psnr_y = - ComputePSNR(cur_distortion_psnr->global_y, - static_cast(y_size) * number_of_frames); - const double global_psnr_u = - ComputePSNR(cur_distortion_psnr->global_u, - static_cast(uv_size) * number_of_frames); - const double global_psnr_v = - ComputePSNR(cur_distortion_psnr->global_v, - static_cast(uv_size) * number_of_frames); - const double global_psnr_all = - ComputePSNR(cur_distortion_psnr->global_all, - static_cast(total_size) * number_of_frames); - printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_psnr_y, - global_psnr_u, global_psnr_v, global_psnr_all, number_of_frames); - if (show_name) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - } - printf("\n"); - } - - if (!quiet) { - printf("Avg:"); - if (do_psnr) { - printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_psnr->y, - cur_distortion_psnr->u, cur_distortion_psnr->v, - cur_distortion_psnr->all, number_of_frames); - } - if (do_ssim) { - printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_ssim->y, - cur_distortion_ssim->u, cur_distortion_ssim->v, - cur_distortion_ssim->all, number_of_frames); - } - if (show_name) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - } - printf("\n"); - } - if (!quiet) { - printf("Min:"); - if (do_psnr) { - printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", - cur_distortion_psnr->min_y, cur_distortion_psnr->min_u, - cur_distortion_psnr->min_v, cur_distortion_psnr->min_all, - cur_distortion_psnr->min_frame); - } - if (do_ssim) { - printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", - cur_distortion_ssim->min_y, cur_distortion_ssim->min_u, - cur_distortion_ssim->min_v, cur_distortion_ssim->min_all, - cur_distortion_ssim->min_frame); - } - if (show_name) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - } - printf("\n"); - } - - if (do_mse) { - double global_mse_y = - GetMSE(cur_distortion_psnr->global_y, - static_cast(y_size) * number_of_frames); - double global_mse_u = - GetMSE(cur_distortion_psnr->global_u, - static_cast(uv_size) * number_of_frames); - double global_mse_v = - GetMSE(cur_distortion_psnr->global_v, - static_cast(uv_size) * number_of_frames); - double global_mse_all = - GetMSE(cur_distortion_psnr->global_all, - static_cast(total_size) * number_of_frames); - printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_mse_y, - global_mse_u, global_mse_v, global_mse_all, number_of_frames); - if (show_name) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - } - printf("\n"); - } - } - fclose(file_org); - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - fclose(file_rec[cur_rec]); - } - delete[] distortion_psnr; - delete[] distortion_ssim; - delete[] ch_org; - delete[] ch_rec; - delete[] file_rec; - return 0; -} diff --git a/thirdparty/libyuv/util/ssim.cc b/thirdparty/libyuv/util/ssim.cc deleted file mode 100644 index 096fbcf..0000000 --- a/thirdparty/libyuv/util/ssim.cc +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "../util/ssim.h" // NOLINT - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef unsigned int uint32_t; // NOLINT -typedef unsigned short uint16_t; // NOLINT - -#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \ - (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))) -#define __SSE2__ -#endif -#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) -#include -#endif - -#ifdef _OPENMP -#include -#endif - -// SSIM -enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 }; - -// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i) -// The maximum value (11 x 11) must be less than 128 to avoid sign -// problems during the calls to _mm_mullo_epi16(). -static const int K[KERNEL_SIZE] = { - 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i) -}; -static const double kiW[KERNEL + 1 + 1] = { - 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] - 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] - 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j] - 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j] - 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j] -}; - -#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) - -#define PWEIGHT(A, B) static_cast(K[(A)] * K[(B)]) // weight product -#define MAKE_WEIGHT(L) \ - { \ - { \ - { \ - PWEIGHT(L, 0) \ - , PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \ - PWEIGHT(L, 5), PWEIGHT(L, 6), 0 \ - } \ - } \ - } - -// We need this union trick to be able to initialize constant static __m128i -// values. We can't call _mm_set_epi16() for static compile-time initialization. -static const struct { - union { - uint16_t i16_[8]; - __m128i m_; - } values_; -} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2), - W3 = MAKE_WEIGHT(3); -// ... the rest is symmetric. -#undef MAKE_WEIGHT -#undef PWEIGHT -#endif - -// Common final expression for SSIM, once the weighted sums are known. -static double FinalizeSSIM(double iw, - double xm, - double ym, - double xxm, - double xym, - double yym) { - const double iwx = xm * iw; - const double iwy = ym * iw; - double sxx = xxm * iw - iwx * iwx; - double syy = yym * iw - iwy * iwy; - // small errors are possible, due to rounding. Clamp to zero. - if (sxx < 0.) { - sxx = 0.; - } - if (syy < 0.) { - syy = 0.; - } - const double sxsy = sqrt(sxx * syy); - const double sxy = xym * iw - iwx * iwy; - static const double C11 = (0.01 * 0.01) * (255 * 255); - static const double C22 = (0.03 * 0.03) * (255 * 255); - static const double C33 = (0.015 * 0.015) * (255 * 255); - const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); - const double c = (2. * sxsy + C22) / (sxx + syy + C22); - const double s = (sxy + C33) / (sxsy + C33); - return l * c * s; -} - -// GetSSIM() does clipping. GetSSIMFullKernel() does not - -// TODO(skal): use summed tables? -// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1) -// with a diff of 255, squared. The maximum error is thus 0x4388241, -// which fits into 32 bits integers. 
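As a quick cross-check of the accumulator bound quoted in the comment above: the per-axis kernel weights sum to 33 = 11 + 2 * (7 + 3 + 1), so the worst-case weighted sum of squared 8-bit differences is 33^2 * 255^2 = 70812225 = 0x4388241, comfortably inside uint32_t range. A standalone sketch (not part of the deleted ssim.cc) that verifies this:

// Verifies the worst-case accumulation bound for the 7x7 SSIM kernel
// K = {1, 3, 7, 11, 7, 3, 1}: (sum K)^2 * 255^2 must fit in 32 bits.
#include <cstdint>
#include <cstdio>

int main() {
  const int K[7] = {1, 3, 7, 11, 7, 3, 1};
  uint64_t wsum = 0;
  for (int w : K) wsum += w;                       // 33 = 11 + 2 * (7 + 3 + 1)
  const uint64_t worst = wsum * wsum * 255 * 255;  // largest possible xxm/xym/yym
  printf("worst case 0x%llx, fits in uint32_t: %s\n",
         static_cast<unsigned long long>(worst),
         worst <= UINT32_MAX ? "yes" : "no");
  return 0;
}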
-double GetSSIM(const uint8_t* org, - const uint8_t* rec, - int xo, - int yo, - int W, - int H, - int stride) { - uint32_t ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; - org += (yo - KERNEL) * stride; - org += (xo - KERNEL); - rec += (yo - KERNEL) * stride; - rec += (xo - KERNEL); - for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) { - if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) { - continue; - } - const int Wy = K[y_]; - for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) { - const int Wxy = Wy * K[x_]; - if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) { - const int org_x = org[x_]; - const int rec_x = rec[x_]; - ws += Wxy; - xm += Wxy * org_x; - ym += Wxy * rec_x; - xxm += Wxy * org_x * org_x; - xym += Wxy * org_x * rec_x; - yym += Wxy * rec_x * rec_x; - } - } - } - return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym); -} - -double GetSSIMFullKernel(const uint8_t* org, - const uint8_t* rec, - int xo, - int yo, - int stride, - double area_weight) { - uint32_t xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; - -#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__) - - org += yo * stride + xo; - rec += yo * stride + xo; - for (int y = 1; y <= KERNEL; y++) { - const int dy1 = y * stride; - const int dy2 = y * stride; - const int Wy = K[KERNEL + y]; - - for (int x = 1; x <= KERNEL; x++) { - // Compute the contributions of upper-left (ul), upper-right (ur) - // lower-left (ll) and lower-right (lr) points (see the diagram below). - // Symmetric Kernel will have same weight on those points. - // - - - - - - - - // - ul - - - ur - - // - - - - - - - - // - - - 0 - - - - // - - - - - - - - // - ll - - - lr - - // - - - - - - - - const int Wxy = Wy * K[KERNEL + x]; - const int ul1 = org[-dy1 - x]; - const int ur1 = org[-dy1 + x]; - const int ll1 = org[dy1 - x]; - const int lr1 = org[dy1 + x]; - - const int ul2 = rec[-dy2 - x]; - const int ur2 = rec[-dy2 + x]; - const int ll2 = rec[dy2 - x]; - const int lr2 = rec[dy2 + x]; - - xm += Wxy * (ul1 + ur1 + ll1 + lr1); - ym += Wxy * (ul2 + ur2 + ll2 + lr2); - xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1); - xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2); - yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2); - } - - // Compute the contributions of up (u), down (d), left (l) and right (r) - // points across the main axes (see the diagram below). - // Symmetric Kernel will have same weight on those points. - // - - - - - - - - // - - - u - - - - // - - - - - - - - // - l - 0 - r - - // - - - - - - - - // - - - d - - - - // - - - - - - - - const int Wxy = Wy * K[KERNEL]; - const int u1 = org[-dy1]; - const int d1 = org[dy1]; - const int l1 = org[-y]; - const int r1 = org[y]; - - const int u2 = rec[-dy2]; - const int d2 = rec[dy2]; - const int l2 = rec[-y]; - const int r2 = rec[y]; - - xm += Wxy * (u1 + d1 + l1 + r1); - ym += Wxy * (u2 + d2 + l2 + r2); - xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1); - xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2); - yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2); - } - - // Lastly the contribution of (x0, y0) point. 
- const int Wxy = K[KERNEL] * K[KERNEL]; - const int s1 = org[0]; - const int s2 = rec[0]; - - xm += Wxy * s1; - ym += Wxy * s2; - xxm += Wxy * s1 * s1; - xym += Wxy * s1 * s2; - yym += Wxy * s2 * s2; - -#else // __SSE2__ - - org += (yo - KERNEL) * stride + (xo - KERNEL); - rec += (yo - KERNEL) * stride + (xo - KERNEL); - - const __m128i zero = _mm_setzero_si128(); - __m128i x = zero; - __m128i y = zero; - __m128i xx = zero; - __m128i xy = zero; - __m128i yy = zero; - -// Read 8 pixels at line #L, and convert to 16bit, perform weighting -// and acccumulate. -#define LOAD_LINE_PAIR(L, WEIGHT) \ - do { \ - const __m128i v0 = \ - _mm_loadl_epi64(reinterpret_cast(org + (L)*stride)); \ - const __m128i v1 = \ - _mm_loadl_epi64(reinterpret_cast(rec + (L)*stride)); \ - const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \ - const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \ - const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \ - const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \ - x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \ - y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \ - x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \ - y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \ - xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \ - xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \ - yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \ - } while (0) - -#define ADD_AND_STORE_FOUR_EPI32(M, OUT) \ - do { \ - uint32_t tmp[4]; \ - _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \ - (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \ - } while (0) - - LOAD_LINE_PAIR(0, W0); - LOAD_LINE_PAIR(1, W1); - LOAD_LINE_PAIR(2, W2); - LOAD_LINE_PAIR(3, W3); - LOAD_LINE_PAIR(4, W2); - LOAD_LINE_PAIR(5, W1); - LOAD_LINE_PAIR(6, W0); - - ADD_AND_STORE_FOUR_EPI32(x, xm); - ADD_AND_STORE_FOUR_EPI32(y, ym); - ADD_AND_STORE_FOUR_EPI32(xx, xxm); - ADD_AND_STORE_FOUR_EPI32(xy, xym); - ADD_AND_STORE_FOUR_EPI32(yy, yym); - -#undef LOAD_LINE_PAIR -#undef ADD_AND_STORE_FOUR_EPI32 -#endif - - return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym); -} - -static int start_max(int x, int y) { - return (x > y) ? x : y; -} - -double CalcSSIM(const uint8_t* org, - const uint8_t* rec, - const int image_width, - const int image_height) { - double SSIM = 0.; - const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL; - const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL; - const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X); - const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y); - const int stride = image_width; - - for (int j = 0; j < KERNEL_Y; ++j) { - for (int i = 0; i < image_width; ++i) { - SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); - } - } - -#ifdef _OPENMP -#pragma omp parallel for reduction(+ : SSIM) -#endif - for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) { - for (int i = 0; i < KERNEL_X; ++i) { - SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); - } - for (int i = KERNEL_X; i < start_x; ++i) { - SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]); - } - if (start_x < image_width) { - // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we - // copy the 8 rightmost pixels on a cache area, and pad this area with - // zeros which won't contribute to the overall SSIM value (but we need - // to pass the correct normalizing constant!). By using this cache, we can - // still call GetSSIMFullKernel() instead of the slower GetSSIM(). 
- // NOTE: we could use similar method for the left-most pixels too. - const int kScratchWidth = 8; - const int kScratchStride = kScratchWidth + KERNEL + 1; - uint8_t scratch_org[KERNEL_SIZE * kScratchStride] = {0}; - uint8_t scratch_rec[KERNEL_SIZE * kScratchStride] = {0}; - - for (int k = 0; k < KERNEL_SIZE; ++k) { - const int offset = - (j - KERNEL + k) * stride + image_width - kScratchWidth; - memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth); - memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth); - } - for (int k = 0; k <= KERNEL_X + 1; ++k) { - SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL, - kScratchStride, kiW[k]); - } - } - } - - for (int j = start_y; j < image_height; ++j) { - for (int i = 0; i < image_width; ++i) { - SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); - } - } - return SSIM; -} - -double CalcLSSIM(double ssim) { - return -10.0 * log10(1.0 - ssim); -} - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/thirdparty/libyuv/util/ssim.h b/thirdparty/libyuv/util/ssim.h deleted file mode 100644 index a855f1d..0000000 --- a/thirdparty/libyuv/util/ssim.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format - -#ifndef UTIL_SSIM_H_ -#define UTIL_SSIM_H_ - -#include // For log10() - -#ifdef __cplusplus -extern "C" { -#endif - -#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED) -typedef unsigned char uint8_t; -#define UINT8_TYPE_DEFINED -#endif - -double CalcSSIM(const uint8_t* org, - const uint8_t* rec, - const int image_width, - const int image_height); - -double CalcLSSIM(double ssim); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // UTIL_SSIM_H_ diff --git a/thirdparty/libyuv/util/yuvconstants.c b/thirdparty/libyuv/util/yuvconstants.c deleted file mode 100644 index 037e082..0000000 --- a/thirdparty/libyuv/util/yuvconstants.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2021 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include -#include - -// This utility computes values needed to generate yuvconstants based on -// white point values. -// The yuv formulas are tuned for 8 bit YUV channels. - -// See Also -// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/ - -// BT.709 full range YUV to RGB reference -// R = Y + V * 1.5748 -// G = Y - U * 0.18732 - V * 0.46812 -// B = Y + U * 1.8556 -// KR = 0.2126 -// KB = 0.0722 - -// // Y contribution to R,G,B. Scale and bias. -// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -// #define YB 32 /* 64 / 2 */ -// -// // U and V contributions to R,G,B. 
-// #define UB 113 /* round(1.77200 * 64) */ -// #define UG 22 /* round(0.34414 * 64) */ -// #define VG 46 /* round(0.71414 * 64) */ -// #define VR 90 /* round(1.40200 * 64) */ -// -// // Bias values to round, and subtract 128 from U and V. -// #define BB (-UB * 128 + YB) -// #define BG (UG * 128 + VG * 128 + YB) -// #define BR (-VR * 128 + YB) - -int main(int argc, const char* argv[]) { - if (argc < 2) { - printf("yuvconstants Kr Kb\n"); - printf(" MC BT KR = 0.2126; KB = 0.0722\n"); - printf(" 1 BT.709 KR = 0.2126; KB = 0.0722\n"); - printf(" 4 FCC KR = 0.30; KB = 0.11\n"); - printf(" 6 BT.601 KR = 0.299; KB = 0.114\n"); - printf(" 7 SMPTE 240M KR = 0.212; KB = 0.087\n"); - printf(" 9 BT.2020 KR = 0.2627; KB = 0.0593\n"); - return -1; - } - float kr = atof(argv[1]); - float kb = atof(argv[2]); - float kg = 1 - kr - kb; - - float vr = 2 * (1 - kr); - float ug = 2 * ((1 - kb) * kb / kg); - float vg = 2 * ((1 - kr) * kr / kg); - float ub = 2 * (1 - kb); - - printf("Full range\n"); - printf("R = Y + V * %5f\n", vr); - printf("G = Y - U * %6f - V * %6f\n", ug, vg); - printf("B = Y + U * %5f\n", ub); - - printf("KR = %4f; ", kr); - printf("KB = %4f\n", kb); - // printf("KG = %4f\n", kg); - // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ - // #define YB 32 /* 64 / 2 */ - // - // // U and V contributions to R,G,B. - - printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64); - printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64); - printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64); - printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64); - - vr = 255.f / 224.f * 2 * (1 - kr); - ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg); - vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg); - ub = 255.f / 224.f * 2 * (1 - kb); - - printf("\nLimited range\n"); - printf("R = (Y - 16) * 1.164 + V * %5f\n", vr); - printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg); - printf("B = (Y - 16) * 1.164 + U * %5f\n", ub); - - // printf("KG = %4f\n", kg); - // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ - // #define YB 32 /* 64 / 2 */ - // - // // U and V contributions to R,G,B. - - printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64); - printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64); - printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64); - printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64); - - return 0; -} diff --git a/thirdparty/libyuv/util/yuvconvert.cc b/thirdparty/libyuv/util/yuvconvert.cc deleted file mode 100644 index 27cdfe9..0000000 --- a/thirdparty/libyuv/util/yuvconvert.cc +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Convert an ARGB image to YUV. 
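Circling back to the yuvconstants.c derivation above: plugging the BT.601 coefficients from its own help text (Kr = 0.299, Kb = 0.114) into the full-range formulas reproduces the fixed-point values quoted in the header comment (UB 113, UG 22, VG 46, VR 90). A standalone sketch of that arithmetic, not part of the deleted tool:

// Recomputes the BT.601 full-range U/V contributions in 6-bit fixed point.
#include <cmath>
#include <cstdio>

int main() {
  const double kr = 0.299, kb = 0.114;
  const double kg = 1.0 - kr - kb;                 // 0.587
  const double ub = 2.0 * (1.0 - kb);              // 1.772000
  const double ug = 2.0 * ((1.0 - kb) * kb / kg);  // 0.344136
  const double vg = 2.0 * ((1.0 - kr) * kr / kg);  // 0.714136
  const double vr = 2.0 * (1.0 - kr);              // 1.402000
  printf("UB %.0f UG %.0f VG %.0f VR %.0f\n",      // UB 113 UG 22 VG 46 VR 90
         std::round(ub * 64), std::round(ug * 64),
         std::round(vg * 64), std::round(vr * 64));
  return 0;
}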
-// Usage: yuvconvert src_argb.raw dst_yuv.raw - -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif - -#include -#include -#include -#include - -#include "libyuv/convert.h" -#include "libyuv/planar_functions.h" -#include "libyuv/scale_argb.h" - -// options -bool verbose = false; -bool attenuate = false; -bool unattenuate = false; -int image_width = 0, image_height = 0; // original width and height -int dst_width = 0, dst_height = 0; // new width and height -int fileindex_org = 0; // argv argument contains the original file name. -int fileindex_rec = 0; // argv argument contains the reconstructed file name. -int num_rec = 0; // Number of reconstructed images. -int num_skip_org = 0; // Number of frames to skip in original. -int num_frames = 0; // Number of frames to convert. -int filter = 1; // Bilinear filter for scaling. - -static __inline uint32_t Abs(int32_t v) { - return v >= 0 ? v : -v; -} - -// Parse PYUV format. ie name.1920x800_24Hz_P420.yuv -bool ExtractResolutionFromFilename(const char* name, - int* width_ptr, - int* height_ptr) { - // Isolate the .width_height. section of the filename by searching for a - // dot or underscore followed by a digit. - for (int i = 0; name[i]; ++i) { - if ((name[i] == '.' || name[i] == '_') && name[i + 1] >= '0' && - name[i + 1] <= '9') { - int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT - if (2 == n) { - return true; - } - } - } - return false; -} - -void PrintHelp(const char* program) { - printf("%s [-options] src_argb.raw dst_yuv.raw\n", program); - printf( - " -s .... specify source resolution. " - "Optional if name contains\n" - " resolution (ie. " - "name.1920x800_24Hz_P420.yuv)\n" - " Negative value mirrors.\n"); - printf(" -d .... specify destination resolution.\n"); - printf(" -f ............ 0 = point, 1 = bilinear (default).\n"); - printf(" -skip ....... Number of frame to skip of src_argb\n"); - printf(" -frames .......... Number of frames to convert\n"); - printf(" -attenuate ............. Attenuate the ARGB image\n"); - printf(" -unattenuate ........... Unattenuate the ARGB image\n"); - printf(" -v ..................... verbose\n"); - printf(" -h ..................... this help\n"); - exit(0); -} - -void ParseOptions(int argc, const char* argv[]) { - if (argc <= 1) { - PrintHelp(argv[0]); - } - for (int c = 1; c < argc; ++c) { - if (!strcmp(argv[c], "-v")) { - verbose = true; - } else if (!strcmp(argv[c], "-attenuate")) { - attenuate = true; - } else if (!strcmp(argv[c], "-unattenuate")) { - unattenuate = true; - } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) { - PrintHelp(argv[0]); - } else if (!strcmp(argv[c], "-s") && c + 2 < argc) { - image_width = atoi(argv[++c]); // NOLINT - image_height = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-d") && c + 2 < argc) { - dst_width = atoi(argv[++c]); // NOLINT - dst_height = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-skip") && c + 1 < argc) { - num_skip_org = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) { - num_frames = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-f") && c + 1 < argc) { - filter = atoi(argv[++c]); // NOLINT - } else if (argv[c][0] == '-') { - fprintf(stderr, "Unknown option. 
%s\n", argv[c]); - } else if (fileindex_org == 0) { - fileindex_org = c; - } else if (fileindex_rec == 0) { - fileindex_rec = c; - num_rec = 1; - } else { - ++num_rec; - } - } - if (fileindex_org == 0 || fileindex_rec == 0) { - fprintf(stderr, "Missing filenames\n"); - PrintHelp(argv[0]); - } - if (num_skip_org < 0) { - fprintf(stderr, "Skipped frames incorrect\n"); - PrintHelp(argv[0]); - } - if (num_frames < 0) { - fprintf(stderr, "Number of frames incorrect\n"); - PrintHelp(argv[0]); - } - - int org_width, org_height; - int rec_width, rec_height; - bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org], - &org_width, &org_height); - bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec], - &rec_width, &rec_height); - if (image_width == 0 || image_height == 0) { - if (org_res_avail) { - image_width = org_width; - image_height = org_height; - } else if (rec_res_avail) { - image_width = rec_width; - image_height = rec_height; - } else { - fprintf(stderr, "Missing dimensions.\n"); - PrintHelp(argv[0]); - } - } - if (dst_width == 0 || dst_height == 0) { - if (rec_res_avail) { - dst_width = rec_width; - dst_height = rec_height; - } else { - dst_width = Abs(image_width); - dst_height = Abs(image_height); - } - } -} - -static const int kTileX = 32; -static const int kTileY = 32; - -static int TileARGBScale(const uint8_t* src_argb, - int src_stride_argb, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - int dst_width, - int dst_height, - libyuv::FilterMode filtering) { - for (int y = 0; y < dst_height; y += kTileY) { - for (int x = 0; x < dst_width; x += kTileX) { - int clip_width = kTileX; - if (x + clip_width > dst_width) { - clip_width = dst_width - x; - } - int clip_height = kTileY; - if (y + clip_height > dst_height) { - clip_height = dst_height - y; - } - int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb, src_width, - src_height, dst_argb, dst_stride_argb, - dst_width, dst_height, x, y, clip_width, - clip_height, filtering); - if (r) { - return r; - } - } - } - return 0; -} - -int main(int argc, const char* argv[]) { - ParseOptions(argc, argv); - - // Open original file (first file argument) - FILE* const file_org = fopen(argv[fileindex_org], "rb"); - if (file_org == NULL) { - fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]); - exit(1); - } - - // Open all files to convert to - FILE** file_rec = new FILE*[num_rec]; - memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "wb"); - if (file_rec[cur_rec] == NULL) { - fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]); - fclose(file_org); - for (int i = 0; i < cur_rec; ++i) { - fclose(file_rec[i]); - } - delete[] file_rec; - exit(1); - } - } - - bool org_is_yuv = strstr(argv[fileindex_org], "_P420.") != NULL; - bool org_is_argb = strstr(argv[fileindex_org], "_ARGB.") != NULL; - if (!org_is_yuv && !org_is_argb) { - fprintf(stderr, "Original format unknown %s\n", argv[fileindex_org]); - exit(1); - } - int org_size = Abs(image_width) * Abs(image_height) * 4; // ARGB - // Input is YUV - if (org_is_yuv) { - const int y_size = Abs(image_width) * Abs(image_height); - const int uv_size = - ((Abs(image_width) + 1) / 2) * ((Abs(image_height) + 1) / 2); - org_size = y_size + 2 * uv_size; // YUV original. 
- } - - const int dst_size = dst_width * dst_height * 4; // ARGB scaled - const int y_size = dst_width * dst_height; - const int uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2); - const size_t total_size = y_size + 2 * uv_size; -#if defined(_MSC_VER) - _fseeki64(file_org, - static_cast<__int64>(num_skip_org) * static_cast<__int64>(org_size), - SEEK_SET); -#else - fseek(file_org, num_skip_org * total_size, SEEK_SET); -#endif - - uint8_t* const ch_org = new uint8_t[org_size]; - uint8_t* const ch_dst = new uint8_t[dst_size]; - uint8_t* const ch_rec = new uint8_t[total_size]; - if (ch_org == NULL || ch_rec == NULL) { - fprintf(stderr, "No memory available\n"); - fclose(file_org); - for (int i = 0; i < num_rec; ++i) { - fclose(file_rec[i]); - } - delete[] ch_org; - delete[] ch_dst; - delete[] ch_rec; - delete[] file_rec; - exit(1); - } - - if (verbose) { - printf("Size: %dx%d to %dx%d\n", image_width, image_height, dst_width, - dst_height); - } - - int number_of_frames; - for (number_of_frames = 0;; ++number_of_frames) { - if (num_frames && number_of_frames >= num_frames) { - break; - } - - // Load original YUV or ARGB frame. - size_t bytes_org = - fread(ch_org, sizeof(uint8_t), static_cast(org_size), file_org); - if (bytes_org < static_cast(org_size)) { - break; - } - - // TODO(fbarchard): Attenuate doesnt need to know dimensions. - // ARGB attenuate frame - if (org_is_argb && attenuate) { - libyuv::ARGBAttenuate(ch_org, 0, ch_org, 0, org_size / 4, 1); - } - // ARGB unattenuate frame - if (org_is_argb && unattenuate) { - libyuv::ARGBUnattenuate(ch_org, 0, ch_org, 0, org_size / 4, 1); - } - - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - // Scale YUV or ARGB frame. - if (org_is_yuv) { - int src_width = Abs(image_width); - int src_height = Abs(image_height); - int half_src_width = (src_width + 1) / 2; - int half_src_height = (src_height + 1) / 2; - int half_dst_width = (dst_width + 1) / 2; - int half_dst_height = (dst_height + 1) / 2; - I420Scale( - ch_org, src_width, ch_org + src_width * src_height, half_src_width, - ch_org + src_width * src_height + half_src_width * half_src_height, - half_src_width, image_width, image_height, ch_rec, dst_width, - ch_rec + dst_width * dst_height, half_dst_width, - ch_rec + dst_width * dst_height + half_dst_width * half_dst_height, - half_dst_width, dst_width, dst_height, - static_cast(filter)); - } else { - TileARGBScale(ch_org, Abs(image_width) * 4, image_width, image_height, - ch_dst, dst_width * 4, dst_width, dst_height, - static_cast(filter)); - } - bool rec_is_yuv = strstr(argv[fileindex_rec + cur_rec], "_P420.") != NULL; - bool rec_is_argb = - strstr(argv[fileindex_rec + cur_rec], "_ARGB.") != NULL; - if (!rec_is_yuv && !rec_is_argb) { - fprintf(stderr, "Output format unknown %s\n", - argv[fileindex_rec + cur_rec]); - continue; // Advance to next file. - } - - // Convert ARGB to YUV. - if (!org_is_yuv && rec_is_yuv) { - int half_width = (dst_width + 1) / 2; - int half_height = (dst_height + 1) / 2; - libyuv::ARGBToI420( - ch_dst, dst_width * 4, ch_rec, dst_width, - ch_rec + dst_width * dst_height, half_width, - ch_rec + dst_width * dst_height + half_width * half_height, - half_width, dst_width, dst_height); - } - - // Output YUV or ARGB frame. 
- if (rec_is_yuv) { - size_t bytes_rec = - fwrite(ch_rec, sizeof(uint8_t), static_cast(total_size), - file_rec[cur_rec]); - if (bytes_rec < static_cast(total_size)) { - break; - } - } else { - size_t bytes_rec = - fwrite(ch_dst, sizeof(uint8_t), static_cast(dst_size), - file_rec[cur_rec]); - if (bytes_rec < static_cast(dst_size)) { - break; - } - } - if (verbose) { - printf("%5d", number_of_frames); - } - if (verbose) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - printf("\n"); - } - } - } - - fclose(file_org); - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - fclose(file_rec[cur_rec]); - } - delete[] ch_org; - delete[] ch_dst; - delete[] ch_rec; - delete[] file_rec; - return 0; -} diff --git a/thirdparty/libyuv/winarm.mk b/thirdparty/libyuv/winarm.mk deleted file mode 100644 index b0a344a..0000000 --- a/thirdparty/libyuv/winarm.mk +++ /dev/null @@ -1,47 +0,0 @@ -# This is a generic makefile for libyuv for Windows Arm. -# call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat" -# nmake /f winarm.mk -# make -f winarm.mk -# nmake /f winarm.mk clean -# consider /arch:ARMv7VE -CC=cl -CCFLAGS=/Ox /nologo /Iinclude /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP -AR=lib -ARFLAGS=/MACHINE:ARM /NOLOGO /SUBSYSTEM:NATIVE -RM=cmd /c del - -LOCAL_OBJ_FILES = \ - source/compare.o\ - source/compare_common.o\ - source/convert.o\ - source/convert_argb.o\ - source/convert_from.o\ - source/convert_from_argb.o\ - source/convert_to_argb.o\ - source/convert_to_i420.o\ - source/cpu_id.o\ - source/planar_functions.o\ - source/rotate.o\ - source/rotate_any.o\ - source/rotate_argb.o\ - source/rotate_common.o\ - source/row_any.o\ - source/row_common.o\ - source/scale.o\ - source/scale_any.o\ - source/scale_argb.o\ - source/scale_common.o\ - source/scale_uv.o\ - source/video_common.o - -.cc.o: - $(CC) /c $(CCFLAGS) $*.cc /Fo$@ - -all: libyuv_arm.lib winarm.mk - -libyuv_arm.lib: $(LOCAL_OBJ_FILES) winarm.mk - $(AR) $(ARFLAGS) /OUT:$@ $(LOCAL_OBJ_FILES) - -clean: - $(RM) "source\*.o" libyuv_arm.lib - diff --git a/thirdparty/libyuv/xmake.lua b/thirdparty/libyuv/xmake.lua index bb6e1a9..003a8f5 100644 --- a/thirdparty/libyuv/xmake.lua +++ b/thirdparty/libyuv/xmake.lua @@ -3,19 +3,16 @@ package("libyuv") set_homepage("https://chromium.googlesource.com/libyuv/libyuv/") set_description("libyuv is an open source project that includes YUV scaling and conversion functionality.") set_license("BSD-3-Clause") - -- add_versions("20210528", "eb6e7bb63738e29efd82ea3cf2a115238a89fa51") + set_urls("https://chromium.googlesource.com/libyuv/libyuv.git") + add_versions("2024.5.21", "8e18fc93c8c07d2ba6f9671281d6f35c8c47b2f4") - -- set_urls("https://chromium.googlesource.com/libyuv/libyuv.git") - -- add_versions("2023.10.27", "31e1d6f896615342d5d5b6bde8f7b50b3fd698dc") - - set_sourcedir(os.scriptdir()) add_deps("cmake") on_install("windows", "linux", "macosx", "android", "cross", "bsd", "mingw", function (package) local configs = {"-DTEST=OFF"} table.insert(configs, "-DCMAKE_BUILD_TYPE=" .. 
(package:debug() and "Debug" or "Release")) - io.replace("CMakeLists.txt", "INSTALL ( PROGRAMS ${CMAKE_BINARY_DIR}/yuvconvert DESTINATION bin )", "", {plain = true}) + io.replace("CMakeLists.txt", "INSTALL ( PROGRAMS ${CMAKE_BINARY_DIR}/yuvconvert DESTINATION bin )", "", {plain = true}) import("package.tools.cmake").install(package, configs) if package:is_plat("macosx", "linux", "android") then @@ -26,3 +23,7 @@ package("libyuv") end end end) + + on_test(function (package) + assert(package:has_cfuncs("I420Rotate", {includes = "libyuv/rotate.h"})) + end) \ No newline at end of file diff --git a/thirdparty/xmake.lua b/thirdparty/xmake.lua index 40e3c60..288faf9 100644 --- a/thirdparty/xmake.lua +++ b/thirdparty/xmake.lua @@ -1 +1 @@ -includes("openfec", "libyuv") \ No newline at end of file +includes("openfec", "libyuv", "aom") \ No newline at end of file diff --git a/xmake.lua b/xmake.lua index f8c932a..9a6ee77 100644 --- a/xmake.lua +++ b/xmake.lua @@ -11,31 +11,28 @@ add_defines("ASIO_STANDALONE", "ASIO_HAS_STD_TYPE_TRAITS", "ASIO_HAS_STD_SHARED_ "ASIO_HAS_STD_ADDRESSOF", "ASIO_HAS_STD_ATOMIC", "ASIO_HAS_STD_CHRONO", "ASIO_HAS_CSTDINT", "ASIO_HAS_STD_ARRAY", "ASIO_HAS_STD_SYSTEM_ERROR") -add_requires("asio 1.24.0", "nlohmann_json", "spdlog 1.11.0", "openfec", "libopus 1.4", "dav1d 1.1.0", "libyuv") -add_packages("asio", "nlohmann_json", "spdlog", "openfec", "libopus", "dav1d", "libyuv") +add_requires("asio 1.24.0", "nlohmann_json", "spdlog 1.11.0", "openfec", "libopus 1.4", "dav1d 1.1.0", "libyuv", "aom") +add_packages("asio", "nlohmann_json", "spdlog", "openfec", "libopus", "dav1d", "libyuv", "aom") includes("thirdparty") if is_os("windows") then add_requires("vcpkg::libnice", {configs = {shared = false}}) add_requires("openh264 2.1.1", {configs = {shared = false}}) - add_requires("vcpkg::aom 3.8.1") - add_packages("vcpkg::libnice", "openh264", "vcpkg::aom", "cuda") + add_packages("vcpkg::libnice", "openh264", "cuda") add_defines("_WEBSOCKETPP_CPP11_INTERNAL_") add_requires("cuda") elseif is_os("linux") then add_requires("glib", {system = true}) add_requires("vcpkg::libnice", {configs = {shared = false}}) add_requires("openh264 2.1.1", {configs = {shared = false}}) - add_requires("vcpkg::aom 3.8.1") add_packages("glib", "vcpkg::libnice", "openh264", "cuda") add_cxflags("-fPIC") add_syslinks("pthread") elseif is_os("macosx") then add_requires("vcpkg::libnice", {configs = {shared = false}}) add_requires("vcpkg::openh264", {configs = {shared = false}}) - add_requires("vcpkg::aom 3.8.1") - add_packages("vcpkg::libnice", "vcpkg::openh264", "vcpkg::aom") + add_packages("vcpkg::libnice", "vcpkg::openh264") add_ldflags("-Wl,-ld_classic") end @@ -197,7 +194,7 @@ target("projectx") add_links("nice", "glib-2.0", "gio-2.0", "gmodule-2.0", "gobject-2.0", "pcre2-8", "pcre2-16", "pcre2-32", "pcre2-posix", "zlib", "ffi", "libcrypto", "libssl", "intl", "iconv", - "Shell32", "Advapi32", "Dnsapi", "Shlwapi", + "Shell32", "Advapi32", "Dnsapi", "Shlwapi", "Crypt32", "cuda", "nvencodeapi", "nvcuvid", "ws2_32", "Bcrypt", "windowsapp", "User32", "Strmiids", "Mfuuid", "Secur32", "Bcrypt")
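Since the build now consumes the source-built aom package from thirdparty/ instead of vcpkg::aom 3.8.1, the quickest way to confirm the new dependency is wired through is the same probe the package's on_test performs: compile and link a call to aom_codec_version(). A minimal standalone sketch, assuming the include and link paths come from add_packages("aom"):

// Smoke test for the in-tree aom package: compiles, links and prints the
// libaom version, mirroring the on_test check in thirdparty/aom/xmake.lua.
#include <cstdio>
#include "aom/aom_codec.h"

int main() {
  printf("libaom %s (0x%x)\n", aom_codec_version_str(), aom_codec_version());
  return 0;
}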