Use kcp as QoS module

dijunkun
2023-08-30 17:44:22 +08:00
parent a4cd77dcb0
commit 3c1f7973d0
79 changed files with 14442 additions and 3150 deletions


@@ -32,7 +32,7 @@ package("ffmpeg")
add_configs("hardcoded-tables", {description = "Enable hardcoded tables.", default = true, type = "boolean"})
end
add_links("avfilter", "avdevice", "avformat", "avcodec", "swscale", "swresample", "avutil")
-- add_links("avfilter", "avdevice", "avformat", "avcodec", "swscale", "swresample", "avutil")
if is_plat("macosx") then
add_frameworks("CoreFoundation", "Foundation", "CoreVideo", "CoreMedia", "AudioToolbox", "VideoToolbox", "Security")
elseif is_plat("linux") then

thirdparty/nvcodec/Interface/cuviddec.h vendored Normal file (1190 lines; diff suppressed because it is too large)

thirdparty/nvcodec/Interface/nvEncodeAPI.h vendored Normal file (5003 lines; diff suppressed because it is too large)

thirdparty/nvcodec/Interface/nvcuvid.h vendored Normal file (436 lines)

@@ -0,0 +1,436 @@
/*
* This copyright notice applies to this header file only:
*
* Copyright (c) 2010-2020 NVIDIA Corporation
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the software, and to permit persons to whom the
* software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/********************************************************************************************************************/
//! \file nvcuvid.h
//! NVDECODE API provides a video decoding interface to NVIDIA GPU devices.
//! \date 2015-2020
//! This file contains the interface constants, structure definitions and function prototypes.
/********************************************************************************************************************/
#if !defined(__NVCUVID_H__)
#define __NVCUVID_H__
#include "cuviddec.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/***********************************************/
//!
//! High-level helper APIs for video sources
//!
/***********************************************/
typedef void *CUvideosource;
typedef void *CUvideoparser;
typedef long long CUvideotimestamp;
/************************************************************************/
//! \enum cudaVideoState
//! Video source state enums
//! Used in cuvidSetVideoSourceState and cuvidGetVideoSourceState APIs
/************************************************************************/
typedef enum {
cudaVideoState_Error = -1, /**< Error state (invalid source) */
cudaVideoState_Stopped = 0, /**< Source is stopped (or reached end-of-stream) */
cudaVideoState_Started = 1 /**< Source is running and delivering data */
} cudaVideoState;
/************************************************************************/
//! \enum cudaAudioCodec
//! Audio compression enums
//! Used in CUAUDIOFORMAT structure
/************************************************************************/
typedef enum {
cudaAudioCodec_MPEG1=0, /**< MPEG-1 Audio */
cudaAudioCodec_MPEG2, /**< MPEG-2 Audio */
cudaAudioCodec_MP3, /**< MPEG-1 Layer III Audio */
cudaAudioCodec_AC3, /**< Dolby Digital (AC3) Audio */
cudaAudioCodec_LPCM, /**< PCM Audio */
cudaAudioCodec_AAC, /**< AAC Audio */
} cudaAudioCodec;
/************************************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDEOFORMAT
//! Video format
//! Used in cuvidGetSourceVideoFormat API
/************************************************************************************************/
typedef struct
{
cudaVideoCodec codec; /**< OUT: Compression format */
/**
* OUT: frame rate = numerator / denominator (for example: 30000/1001)
*/
struct {
/**< OUT: frame rate numerator (0 = unspecified or variable frame rate) */
unsigned int numerator;
/**< OUT: frame rate denominator (0 = unspecified or variable frame rate) */
unsigned int denominator;
} frame_rate;
unsigned char progressive_sequence; /**< OUT: 0=interlaced, 1=progressive */
unsigned char bit_depth_luma_minus8; /**< OUT: high bit depth luma, e.g. 2 for 10-bit depth, 4 for 12-bit depth */
unsigned char bit_depth_chroma_minus8; /**< OUT: high bit depth chroma, e.g. 2 for 10-bit depth, 4 for 12-bit depth */
unsigned char min_num_decode_surfaces; /**< OUT: Minimum number of decode surfaces to be allocated for correct
decoding. The client can send this value in ulNumDecodeSurfaces
(in CUVIDDECODECREATEINFO structure).
This guarantees correct functionality and optimal video memory
usage but not necessarily the best performance, which depends on
the design of the overall application. The optimal number of
decode surfaces (in terms of performance and memory utilization)
should be decided by experimentation for each application, but it
cannot go below min_num_decode_surfaces.
If this value is used for ulNumDecodeSurfaces then it must be
returned to parser during sequence callback. */
unsigned int coded_width; /**< OUT: coded frame width in pixels */
unsigned int coded_height; /**< OUT: coded frame height in pixels */
/**
* area of the frame that should be displayed
* typical example:
* coded_width = 1920, coded_height = 1088
* display_area = { 0,0,1920,1080 }
*/
struct {
int left; /**< OUT: left position of display rect */
int top; /**< OUT: top position of display rect */
int right; /**< OUT: right position of display rect */
int bottom; /**< OUT: bottom position of display rect */
} display_area;
cudaVideoChromaFormat chroma_format; /**< OUT: Chroma format */
unsigned int bitrate; /**< OUT: video bitrate (bps, 0=unknown) */
/**
* OUT: Display Aspect Ratio = x:y (4:3, 16:9, etc)
*/
struct {
int x;
int y;
} display_aspect_ratio;
/**
* Video Signal Description
* Refer to section E.2.1 (VUI parameters semantics) of the H.264 specification
*/
struct {
unsigned char video_format : 3; /**< OUT: 0-Component, 1-PAL, 2-NTSC, 3-SECAM, 4-MAC, 5-Unspecified */
unsigned char video_full_range_flag : 1; /**< OUT: indicates the black level and luma and chroma range */
unsigned char reserved_zero_bits : 4; /**< Reserved bits */
unsigned char color_primaries; /**< OUT: chromaticity coordinates of source primaries */
unsigned char transfer_characteristics; /**< OUT: opto-electronic transfer characteristic of the source picture */
unsigned char matrix_coefficients; /**< OUT: used in deriving luma and chroma signals from RGB primaries */
} video_signal_description;
unsigned int seqhdr_data_length; /**< OUT: Additional bytes following (CUVIDEOFORMATEX) */
} CUVIDEOFORMAT;
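/* A minimal illustrative sketch (not part of the original header; names are
 * assumptions): a sequence callback that honors min_num_decode_surfaces.
 * Returning a value > 1 hands the surface count back to the parser,
 * overriding CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces. */
static int CUDAAPI HandleVideoSequence(void *pUserData, CUVIDEOFORMAT *pVideoFormat)
{
    /* Use at least the parser-reported minimum for correct decoding. */
    int nDecodeSurfaces = pVideoFormat->min_num_decode_surfaces;
    /* ... (re)create the decoder here with
       CUVIDDECODECREATEINFO::ulNumDecodeSurfaces = nDecodeSurfaces ... */
    return nDecodeSurfaces;
}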
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDOPERATINGPOINTINFO
//! Operating point information of scalable bitstream
/****************************************************************/
typedef struct
{
cudaVideoCodec codec;
union
{
struct
{
unsigned char operating_points_cnt;
unsigned char reserved24_bits[3];
unsigned short operating_points_idc[32];
} av1;
unsigned char CodecReserved[1024];
};
} CUVIDOPERATINGPOINTINFO;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDAV1SEQHDR
//! AV1 specific sequence header information
/****************************************************************/
typedef struct {
unsigned int max_width;
unsigned int max_height;
unsigned char reserved[1016];
} CUVIDAV1SEQHDR;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDEOFORMATEX
//! Video format including raw sequence header information
//! Used in cuvidGetSourceVideoFormat API
/****************************************************************/
typedef struct
{
CUVIDEOFORMAT format; /**< OUT: CUVIDEOFORMAT structure */
union {
CUVIDAV1SEQHDR av1;
unsigned char raw_seqhdr_data[1024]; /**< OUT: Sequence header data */
};
} CUVIDEOFORMATEX;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUAUDIOFORMAT
//! Audio formats
//! Used in cuvidGetSourceAudioFormat API
/****************************************************************/
typedef struct
{
cudaAudioCodec codec; /**< OUT: Compression format */
unsigned int channels; /**< OUT: number of audio channels */
unsigned int samplespersec; /**< OUT: sampling frequency */
unsigned int bitrate; /**< OUT: For uncompressed, can also be used to determine bits per sample */
unsigned int reserved1; /**< Reserved for future use */
unsigned int reserved2; /**< Reserved for future use */
} CUAUDIOFORMAT;
/***************************************************************/
//! \enum CUvideopacketflags
//! Data packet flags
//! Used in CUVIDSOURCEDATAPACKET structure
/***************************************************************/
typedef enum {
CUVID_PKT_ENDOFSTREAM = 0x01, /**< Set when this is the last packet for this stream */
CUVID_PKT_TIMESTAMP = 0x02, /**< Timestamp is valid */
CUVID_PKT_DISCONTINUITY = 0x04, /**< Set when a discontinuity has to be signalled */
CUVID_PKT_ENDOFPICTURE = 0x08, /**< Set when the packet contains exactly one frame or one field */
CUVID_PKT_NOTIFY_EOS = 0x10, /**< If this flag is set along with CUVID_PKT_ENDOFSTREAM, an additional (dummy)
display callback will be invoked with null value of CUVIDPARSERDISPINFO which
should be interpreted as end of the stream. */
} CUvideopacketflags;
/*****************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDSOURCEDATAPACKET
//! Data Packet
//! Used in cuvidParseVideoData API
//! IN for cuvidParseVideoData
/*****************************************************************************/
typedef struct _CUVIDSOURCEDATAPACKET
{
unsigned long flags; /**< IN: Combination of CUVID_PKT_XXX flags */
unsigned long payload_size; /**< IN: number of bytes in the payload (may be zero if EOS flag is set) */
const unsigned char *payload; /**< IN: Pointer to packet payload data (may be NULL if EOS flag is set) */
CUvideotimestamp timestamp; /**< IN: Presentation time stamp (10MHz clock), only valid if
CUVID_PKT_TIMESTAMP flag is set */
} CUVIDSOURCEDATAPACKET;
// Callback for packet delivery
typedef int (CUDAAPI *PFNVIDSOURCECALLBACK)(void *, CUVIDSOURCEDATAPACKET *);
/**************************************************************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDSOURCEPARAMS
//! Describes parameters needed in cuvidCreateVideoSource API
//! NVDECODE API is intended for HW-accelerated video decoding, so CUvideosource doesn't have an audio demuxer for all supported
//! containers. Clients are advised to use their own or a third-party demuxer if audio support is needed.
/**************************************************************************************************************************/
typedef struct _CUVIDSOURCEPARAMS
{
unsigned int ulClockRate; /**< IN: Time stamp units in Hz (0=default=10000000Hz) */
unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */
unsigned int uReserved : 31; /**< Reserved for future use - set to zero */
unsigned int uReserved1[6]; /**< Reserved for future use - set to zero */
void *pUserData; /**< IN: User private data passed in to the data handlers */
PFNVIDSOURCECALLBACK pfnVideoDataHandler; /**< IN: Called to deliver video packets */
PFNVIDSOURCECALLBACK pfnAudioDataHandler; /**< IN: Called to deliver audio packets. */
void *pvReserved2[8]; /**< Reserved for future use - set to NULL */
} CUVIDSOURCEPARAMS;
/**********************************************/
//! \ingroup ENUMS
//! \enum CUvideosourceformat_flags
//! CUvideosourceformat_flags
//! Used in cuvidGetSourceVideoFormat API
/**********************************************/
typedef enum {
CUVID_FMT_EXTFORMATINFO = 0x100 /**< Return extended format structure (CUVIDEOFORMATEX) */
} CUvideosourceformat_flags;
#if !defined(__APPLE__)
/***************************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams)
//! Create a CUvideosource object. CUvideosource spawns a demultiplexer thread that provides two callbacks:
//! pfnVideoDataHandler() and pfnAudioDataHandler()
//! NVDECODE API is intended for HW-accelerated video decoding, so CUvideosource doesn't have an audio demuxer for all supported
//! containers. Clients are advised to use their own or a third-party demuxer if audio support is needed.
/***************************************************************************************************************************/
CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams);
/***************************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams)
//! Create video source
/***************************************************************************************************************************/
CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams);
/********************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj)
//! Destroy video source
/********************************************************************/
CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj);
/******************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state)
//! Set video source state to:
//! cudaVideoState_Started - to signal the source to run and deliver data
//! cudaVideoState_Stopped - to stop the source from delivering the data
//! cudaVideoState_Error - invalid source
/******************************************************************************************/
CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state);
/******************************************************************************************/
//! \ingroup FUNCTS
//! \fn cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj)
//! Get video source state
//! Returns:
//! cudaVideoState_Started - if Source is running and delivering data
//! cudaVideoState_Stopped - if Source is stopped or reached end-of-stream
//! cudaVideoState_Error - if Source is in error state
/******************************************************************************************/
cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj);
/******************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags)
//! Gets the video source format in pvidfmt; flags is set to a combination of CUvideosourceformat_flags as required
/******************************************************************************************************************/
CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags);
/**************************************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags)
//! Get audio source format
//! NVDECODE API is intended for HW-accelerated video decoding, so CUvideosource doesn't have an audio demuxer for all supported
//! containers. Clients are advised to use their own or a third-party demuxer if audio support is needed.
/**************************************************************************************************************************/
CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags);
#endif
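/* A minimal illustrative sketch (assumed usage, not from this commit):
 * driving a CUvideosource. The data handler normally forwards packets to
 * cuvidParseVideoData(); its return semantics are assumed here
 * (nonzero = keep delivering). Error checks are elided. */
static int CUDAAPI HandleVideoData(void *pUserData, CUVIDSOURCEDATAPACKET *pPacket)
{
    /* Forward pPacket to the parser here. */
    return 1; /* assumed: nonzero continues delivery */
}
static void RunVideoSource(const char *szFilePath)
{
    CUVIDSOURCEPARAMS params = { 0 };
    params.pfnVideoDataHandler = HandleVideoData;
    CUvideosource hSource = NULL;
    cuvidCreateVideoSource(&hSource, szFilePath, &params);
    cuvidSetVideoSourceState(hSource, cudaVideoState_Started);
    while (cuvidGetVideoSourceState(hSource) == cudaVideoState_Started)
        ; /* the demuxer thread delivers packets until end-of-stream or error */
    cuvidDestroyVideoSource(hSource);
}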
/**********************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDPARSERDISPINFO
//! Used in cuvidParseVideoData API with PFNVIDDISPLAYCALLBACK pfnDisplayPicture
/**********************************************************************************/
typedef struct _CUVIDPARSERDISPINFO
{
int picture_index; /**< OUT: Index of the current picture */
int progressive_frame; /**< OUT: 1 if progressive frame; 0 otherwise */
int top_field_first; /**< OUT: 1 if top field is displayed first; 0 otherwise */
int repeat_first_field; /**< OUT: Number of additional fields (1=ivtc, 2=frame doubling, 4=frame tripling,
-1=unpaired field) */
CUvideotimestamp timestamp; /**< OUT: Presentation time stamp */
} CUVIDPARSERDISPINFO;
/***********************************************************************************************************************/
//! Parser callbacks
//! The parser calls these synchronously from within cuvidParseVideoData() whenever there is a sequence change or a picture
//! is ready to be decoded and/or displayed. The first argument to each callback is the "void *pUserData" member of CUVIDPARSERPARAMS.
//! Return values from these callbacks are interpreted as below; a failure return is propagated by
//! cuvidParseVideoData() to the application.
//! The parser picks a default operating point of 0 and an outputAllLayers flag of 0 if PFNVIDOPPOINTCALLBACK is not set,
//! or if its return value is -1 or an invalid operating point.
//! PFNVIDSEQUENCECALLBACK : 0: fail, 1: succeeded, > 1: override dpb size of parser (set by CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces
//! while creating parser)
//! PFNVIDDECODECALLBACK : 0: fail, >=1: succeeded
//! PFNVIDDISPLAYCALLBACK : 0: fail, >=1: succeeded
//! PFNVIDOPPOINTCALLBACK : <0: fail, >=0: succeeded (bit 0-9: OperatingPoint, bit 10-10: outputAllLayers, bit 11-30: reserved)
/***********************************************************************************************************************/
typedef int (CUDAAPI *PFNVIDSEQUENCECALLBACK)(void *, CUVIDEOFORMAT *);
typedef int (CUDAAPI *PFNVIDDECODECALLBACK)(void *, CUVIDPICPARAMS *);
typedef int (CUDAAPI *PFNVIDDISPLAYCALLBACK)(void *, CUVIDPARSERDISPINFO *);
typedef int (CUDAAPI *PFNVIDOPPOINTCALLBACK)(void *, CUVIDOPERATINGPOINTINFO*);
/**************************************/
//! \ingroup STRUCTS
//! \struct CUVIDPARSERPARAMS
//! Used in cuvidCreateVideoParser API
/**************************************/
typedef struct _CUVIDPARSERPARAMS
{
cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX */
unsigned int ulMaxNumDecodeSurfaces; /**< IN: Max # of decode surfaces (parser will cycle through these) */
unsigned int ulClockRate; /**< IN: Timestamp units in Hz (0=default=10000000Hz) */
unsigned int ulErrorThreshold; /**< IN: % Error threshold (0-100) for calling pfnDecodePicture (100=always
IN: call pfnDecodePicture even if picture bitstream is fully corrupted) */
unsigned int ulMaxDisplayDelay; /**< IN: Max display queue delay (improves pipelining of decode with display)
0=no delay (recommended values: 2..4) */
unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */
unsigned int uReserved : 31; /**< Reserved for future use - set to zero */
unsigned int uReserved1[4]; /**< IN: Reserved for future use - set to 0 */
void *pUserData; /**< IN: User data for callbacks */
PFNVIDSEQUENCECALLBACK pfnSequenceCallback; /**< IN: Called before decoding frames and/or whenever there is a fmt change */
PFNVIDDECODECALLBACK pfnDecodePicture; /**< IN: Called when a picture is ready to be decoded (decode order) */
PFNVIDDISPLAYCALLBACK pfnDisplayPicture; /**< IN: Called whenever a picture is ready to be displayed (display order) */
PFNVIDOPPOINTCALLBACK pfnGetOperatingPoint; /**< IN: Called from AV1 sequence header to get operating point of a AV1
scalable bitstream */
void *pvReserved2[6]; /**< Reserved for future use - set to NULL */
CUVIDEOFORMATEX *pExtVideoInfo; /**< IN: [Optional] sequence header data from system layer */
} CUVIDPARSERPARAMS;
/************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams)
//! Create video parser object and initialize
/************************************************************************************************/
CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams);
/************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket)
//! Parse the video data from the source data packet in pPacket.
//! Extracts parameter sets (SPS, PPS, etc.) and bitstream data from pPacket, and:
//! calls back pfnDecodePicture with CUVIDPICPARAMS data to kick off HW decoding;
//! calls back pfnSequenceCallback with CUVIDEOFORMAT data for the initial sequence header or when
//! the decoder encounters a video format change;
//! calls back pfnDisplayPicture with CUVIDPARSERDISPINFO data to display a video frame
/************************************************************************************************/
CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket);
/************************************************************************************************/
//! \ingroup FUNCTS
//! \fn CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj)
//! Destroy the video parser
/************************************************************************************************/
CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj);
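/* A minimal illustrative sketch (assumed usage, not from this commit): the
 * parser create/feed/flush/destroy lifecycle. The decode/display handlers
 * are stubs; a real client decodes and maps frames there. Error checks
 * are elided. */
static int CUDAAPI HandlePictureDecode(void *pUserData, CUVIDPICPARAMS *pPicParams)
{
    return 1; /* >= 1 means success; a real client calls cuvidDecodePicture() */
}
static int CUDAAPI HandlePictureDisplay(void *pUserData, CUVIDPARSERDISPINFO *pDispInfo)
{
    return 1; /* >= 1 means success; a real client maps/presents the frame */
}
static void ParseElementaryStream(const unsigned char *pData, unsigned long nBytes)
{
    CUVIDPARSERPARAMS parserParams = { cudaVideoCodec_H264 }; /* example codec; rest zeroed */
    parserParams.ulMaxNumDecodeSurfaces = 1;  /* raised later via the sequence callback */
    parserParams.ulMaxDisplayDelay = 2;       /* recommended range is 2..4 */
    parserParams.pfnSequenceCallback = HandleVideoSequence;  /* sketch above */
    parserParams.pfnDecodePicture = HandlePictureDecode;
    parserParams.pfnDisplayPicture = HandlePictureDisplay;
    CUvideoparser hParser = NULL;
    cuvidCreateVideoParser(&hParser, &parserParams);
    CUVIDSOURCEDATAPACKET packet = { 0 };
    packet.payload = pData;
    packet.payload_size = nBytes;
    cuvidParseVideoData(hParser, &packet);
    /* Flush with an empty payload and the end-of-stream flag set. */
    packet.payload = NULL;
    packet.payload_size = 0;
    packet.flags = CUVID_PKT_ENDOFSTREAM;
    cuvidParseVideoData(hParser, &packet);
    cuvidDestroyVideoParser(hParser);
}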
/**********************************************************************************************/
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif // __NVCUVID_H__

BIN thirdparty/nvcodec/Lib/Win32/nvcuvid.lib vendored Normal file (binary file not shown; seven further unnamed binary files also not shown)

BIN thirdparty/nvcodec/Lib/x64/nvcuvid.lib vendored Normal file (binary file not shown; one further unnamed binary file also not shown)


@@ -0,0 +1,111 @@
# Copyright 2020 NVIDIA Corporation. All rights reserved.
#
# Please refer to the NVIDIA end user license agreement (EULA) associated
# with this source code for terms and conditions that govern your use of
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
# 3.7 is required for FindVulkan module support in CMake.
cmake_minimum_required(VERSION 3.7)
project(NvCodec)
# Set C++11 for all projects and disable non-standard extensions
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_INSTALL_PREFIX .)
set(NVCODEC_PUBLIC_INTERFACE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../Interface)
set(NVCODEC_UTILS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/Utils)
set(NV_CODEC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/NvCodec)
set(NV_ENC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/NvCodec/NvEncoder)
set(NV_DEC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/NvCodec/NvDecoder)
set(NV_APPENC_COMMON_DIR ${CMAKE_CURRENT_SOURCE_DIR}/AppEncode/Common)
set(NV_APPDEC_COMMON_DIR ${CMAKE_CURRENT_SOURCE_DIR}/AppDecode/Common)
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
set(NVCODEC_SAMPLES_INSTALL_DIR ${CMAKE_BINARY_DIR})
else()
set(NVCODEC_SAMPLES_INSTALL_DIR ${CMAKE_BINARY_DIR})
endif()
if(WIN32)
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
set(CUVID_LIB ${CMAKE_CURRENT_SOURCE_DIR}/../Lib/x64/nvcuvid.lib)
set(NVENCODEAPI_LIB ${CMAKE_CURRENT_SOURCE_DIR}/../Lib/x64/nvencodeapi.lib)
else()
set(CUVID_LIB ${CMAKE_CURRENT_SOURCE_DIR}/../Lib/Win32/nvcuvid.lib)
set(NVENCODEAPI_LIB ${CMAKE_CURRENT_SOURCE_DIR}/../Lib/Win32/nvencodeapi.lib)
endif()
else ()
find_library(CUVID_LIB nvcuvid)
find_library(NVENCODEAPI_LIB nvidia-encode)
endif()
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
find_package(PkgConfig REQUIRED)
pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec)
pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat)
pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil)
pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample)
set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS})
find_library(AVCODEC_LIBRARY NAMES avcodec
HINTS
${PC_AVCODEC_LIBDIR}
${PC_AVCODEC_LIBRARY_DIRS}
)
find_library(AVFORMAT_LIBRARY NAMES avformat
HINTS
${PC_AVFORMAT_LIBDIR}
${PC_AVFORMAT_LIBRARY_DIRS}
)
find_library(AVUTIL_LIBRARY NAMES avutil
HINTS
${PC_AVUTIL_LIBDIR}
${PC_AVUTIL_LIBRARY_DIRS}
)
find_library(SWRESAMPLE_LIBRARY NAMES swresample
HINTS
${PC_SWRESAMPLE_LIBDIR}
${PC_SWRESAMPLE_LIBRARY_DIRS}
)
set(AVCODEC_LIB ${AVCODEC_LIBRARY})
set(AVFORMAT_LIB ${AVFORMAT_LIBRARY})
set(AVUTIL_LIB ${AVUTIL_LIBRARY})
set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY})
endif()
if(WIN32)
add_subdirectory(AppEncode/AppEncD3D11)
add_subdirectory(AppEncode/AppEncD3D9)
add_subdirectory(AppDecode/AppDecD3D)
else ()
# Needed only for the Linux build
add_subdirectory(AppEncode/AppEncGL)
endif()
add_subdirectory(AppEncode/AppEncCuda)
add_subdirectory(AppEncode/AppEncDec)
add_subdirectory(AppEncode/AppEncLowLatency)
add_subdirectory(AppEncode/AppEncME)
add_subdirectory(AppEncode/AppEncPerf)
add_subdirectory(AppEncode/AppEncQual)
add_subdirectory(AppEncode/AppMotionEstimationVkCuda)
add_subdirectory(AppTranscode/AppTrans)
add_subdirectory(AppTranscode/AppTransOneToN)
add_subdirectory(AppTranscode/AppTransPerf)
add_subdirectory(AppDecode/AppDec)
add_subdirectory(AppDecode/AppDecGL)
add_subdirectory(AppDecode/AppDecImageProvider)
add_subdirectory(AppDecode/AppDecLowLatency)
add_subdirectory(AppDecode/AppDecMem)
add_subdirectory(AppDecode/AppDecMultiFiles)
add_subdirectory(AppDecode/AppDecMultiInput)
add_subdirectory(AppDecode/AppDecPerf)


@@ -0,0 +1,54 @@
/*
* Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>
static __global__ void ConvertUInt8ToUInt16Kernel(uint8_t *dpUInt8, uint16_t *dpUInt16, int nSrcPitch, int nDestPitch, int nWidth, int nHeight)
{
int x = blockIdx.x * blockDim.x + threadIdx.x,
y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= nWidth || y >= nHeight)
{
return;
}
int destStrideInPixels = nDestPitch / (sizeof(uint16_t));
*(uchar2 *)&dpUInt16[y * destStrideInPixels + x] = uchar2{ 0, dpUInt8[y * nSrcPitch + x] };
}
static __global__ void ConvertUInt16ToUInt8Kernel(uint16_t *dpUInt16, uint8_t *dpUInt8, int nSrcPitch, int nDestPitch, int nWidth, int nHeight)
{
int x = blockIdx.x * blockDim.x + threadIdx.x,
y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= nWidth || y >= nHeight)
{
return;
}
int srcStrideInPixels = nSrcPitch / (sizeof(uint16_t));
dpUInt8[y * nDestPitch + x] = ((uchar2 *)&dpUInt16[y * srcStrideInPixels + x])->y;
}
void ConvertUInt8ToUInt16(uint8_t *dpUInt8, uint16_t *dpUInt16, int nSrcPitch, int nDestPitch, int nWidth, int nHeight)
{
dim3 blockSize(16, 16, 1);
dim3 gridSize(((uint32_t)nWidth + blockSize.x - 1) / blockSize.x, ((uint32_t)nHeight + blockSize.y - 1) / blockSize.y, 1);
ConvertUInt8ToUInt16Kernel <<< gridSize, blockSize >>>(dpUInt8, dpUInt16, nSrcPitch, nDestPitch, nWidth, nHeight);
}
void ConvertUInt16ToUInt8(uint16_t *dpUInt16, uint8_t *dpUInt8, int nSrcPitch, int nDestPitch, int nWidth, int nHeight)
{
dim3 blockSize(16, 16, 1);
dim3 gridSize(((uint32_t)nWidth + blockSize.x - 1) / blockSize.x, ((uint32_t)nHeight + blockSize.y - 1) / blockSize.y, 1);
ConvertUInt16ToUInt8Kernel <<<gridSize, blockSize >>>(dpUInt16, dpUInt8, nSrcPitch, nDestPitch, nWidth, nHeight);
}
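/* A minimal usage sketch (illustrative, not part of the original file):
 * widening an 8-bit device plane to MSB-aligned 16 bits with the wrappers
 * above. Sizes are example values; real callers pass decoder pitches. */
void ExampleWiden(int nWidth, int nHeight)
{
    uint8_t *dpUInt8 = NULL;
    uint16_t *dpUInt16 = NULL;
    size_t srcPitch = 0, dstPitch = 0;
    cudaMallocPitch((void **)&dpUInt8, &srcPitch, nWidth, nHeight);
    cudaMallocPitch((void **)&dpUInt16, &dstPitch, nWidth * sizeof(uint16_t), nHeight);
    ConvertUInt8ToUInt16(dpUInt8, dpUInt16, (int)srcPitch, (int)dstPitch, nWidth, nHeight);
    cudaDeviceSynchronize(); /* the kernel launch above is asynchronous */
    cudaFree(dpUInt16);
    cudaFree(dpUInt8);
}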


@@ -0,0 +1,399 @@
/*
* Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "ColorSpace.h"
__constant__ float matYuv2Rgb[3][3];
__constant__ float matRgb2Yuv[3][3];
void inline GetConstants(int iMatrix, float &wr, float &wb, int &black, int &white, int &max) {
black = 16; white = 235;
max = 255;
switch (iMatrix)
{
case ColorSpaceStandard_BT709:
default:
wr = 0.2126f; wb = 0.0722f;
break;
case ColorSpaceStandard_FCC:
wr = 0.30f; wb = 0.11f;
break;
case ColorSpaceStandard_BT470:
case ColorSpaceStandard_BT601:
wr = 0.2990f; wb = 0.1140f;
break;
case ColorSpaceStandard_SMPTE240M:
wr = 0.212f; wb = 0.087f;
break;
case ColorSpaceStandard_BT2020:
case ColorSpaceStandard_BT2020C:
wr = 0.2627f; wb = 0.0593f;
// 10-bit only
black = 64 << 6; white = 940 << 6;
max = (1 << 16) - 1;
break;
}
}
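/* Note on the math below (added commentary, assuming the usual BT-style
 * definitions): with luma weights wr and wb (wg = 1 - wr - wb), YUV is
 * defined by Y = wr*R + wg*G + wb*B, U = 0.5*(B - Y)/(1 - wb) and
 * V = 0.5*(R - Y)/(1 - wr). Inverting gives the rows of SetMatYuv2Rgb:
 * R = Y + (1 - wr)/0.5 * V, B = Y + (1 - wb)/0.5 * U, and a G row that
 * subtracts both chroma contributions scaled by 1/wg. The loop in each
 * setter then rescales between the limited [black, white] range and the
 * full [0, max] range. */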
void SetMatYuv2Rgb(int iMatrix) {
float wr, wb;
int black, white, max;
GetConstants(iMatrix, wr, wb, black, white, max);
float mat[3][3] = {
1.0f, 0.0f, (1.0f - wr) / 0.5f,
1.0f, -wb * (1.0f - wb) / 0.5f / (1 - wb - wr), -wr * (1 - wr) / 0.5f / (1 - wb - wr),
1.0f, (1.0f - wb) / 0.5f, 0.0f,
};
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
mat[i][j] = (float)(1.0 * max / (white - black) * mat[i][j]);
}
}
cudaMemcpyToSymbol(matYuv2Rgb, mat, sizeof(mat));
}
void SetMatRgb2Yuv(int iMatrix) {
float wr, wb;
int black, white, max;
GetConstants(iMatrix, wr, wb, black, white, max);
float mat[3][3] = {
wr, 1.0f - wb - wr, wb,
-0.5f * wr / (1.0f - wb), -0.5f * (1 - wb - wr) / (1.0f - wb), 0.5f,
0.5f, -0.5f * (1.0f - wb - wr) / (1.0f - wr), -0.5f * wb / (1.0f - wr),
};
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
mat[i][j] = (float)(1.0 * (white - black) / max * mat[i][j]);
}
}
cudaMemcpyToSymbol(matRgb2Yuv, mat, sizeof(mat));
}
template<class T>
__device__ static T Clamp(T x, T lower, T upper) {
return x < lower ? lower : (x > upper ? upper : x);
}
template<class Rgb, class YuvUnit>
__device__ inline Rgb YuvToRgbForPixel(YuvUnit y, YuvUnit u, YuvUnit v) {
const int
low = 1 << (sizeof(YuvUnit) * 8 - 4),
mid = 1 << (sizeof(YuvUnit) * 8 - 1);
float fy = (int)y - low, fu = (int)u - mid, fv = (int)v - mid;
const float maxf = (1 << sizeof(YuvUnit) * 8) - 1.0f;
YuvUnit
r = (YuvUnit)Clamp(matYuv2Rgb[0][0] * fy + matYuv2Rgb[0][1] * fu + matYuv2Rgb[0][2] * fv, 0.0f, maxf),
g = (YuvUnit)Clamp(matYuv2Rgb[1][0] * fy + matYuv2Rgb[1][1] * fu + matYuv2Rgb[1][2] * fv, 0.0f, maxf),
b = (YuvUnit)Clamp(matYuv2Rgb[2][0] * fy + matYuv2Rgb[2][1] * fu + matYuv2Rgb[2][2] * fv, 0.0f, maxf);
Rgb rgb{};
const int nShift = abs((int)sizeof(YuvUnit) - (int)sizeof(rgb.c.r)) * 8;
if (sizeof(YuvUnit) >= sizeof(rgb.c.r)) {
rgb.c.r = r >> nShift;
rgb.c.g = g >> nShift;
rgb.c.b = b >> nShift;
} else {
rgb.c.r = r << nShift;
rgb.c.g = g << nShift;
rgb.c.b = b << nShift;
}
return rgb;
}
template<class YuvUnitx2, class Rgb, class RgbIntx2>
__global__ static void YuvToRgbKernel(uint8_t *pYuv, int nYuvPitch, uint8_t *pRgb, int nRgbPitch, int nWidth, int nHeight) {
int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
if (x + 1 >= nWidth || y + 1 >= nHeight) {
return;
}
uint8_t *pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch;
uint8_t *pDst = pRgb + x * sizeof(Rgb) + y * nRgbPitch;
YuvUnitx2 l0 = *(YuvUnitx2 *)pSrc;
YuvUnitx2 l1 = *(YuvUnitx2 *)(pSrc + nYuvPitch);
YuvUnitx2 ch = *(YuvUnitx2 *)(pSrc + (nHeight - y / 2) * nYuvPitch);
*(RgbIntx2 *)pDst = RgbIntx2 {
YuvToRgbForPixel<Rgb>(l0.x, ch.x, ch.y).d,
YuvToRgbForPixel<Rgb>(l0.y, ch.x, ch.y).d,
};
*(RgbIntx2 *)(pDst + nRgbPitch) = RgbIntx2 {
YuvToRgbForPixel<Rgb>(l1.x, ch.x, ch.y).d,
YuvToRgbForPixel<Rgb>(l1.y, ch.x, ch.y).d,
};
}
template<class YuvUnitx2, class Rgb, class RgbIntx2>
__global__ static void Yuv444ToRgbKernel(uint8_t *pYuv, int nYuvPitch, uint8_t *pRgb, int nRgbPitch, int nWidth, int nHeight) {
int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
int y = (threadIdx.y + blockIdx.y * blockDim.y);
if (x + 1 >= nWidth || y >= nHeight) {
return;
}
uint8_t *pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch;
uint8_t *pDst = pRgb + x * sizeof(Rgb) + y * nRgbPitch;
YuvUnitx2 l0 = *(YuvUnitx2 *)pSrc;
YuvUnitx2 ch1 = *(YuvUnitx2 *)(pSrc + (nHeight * nYuvPitch));
YuvUnitx2 ch2 = *(YuvUnitx2 *)(pSrc + (2 * nHeight * nYuvPitch));
*(RgbIntx2 *)pDst = RgbIntx2{
YuvToRgbForPixel<Rgb>(l0.x, ch1.x, ch2.x).d,
YuvToRgbForPixel<Rgb>(l0.y, ch1.y, ch2.y).d,
};
}
template<class YuvUnitx2, class Rgb, class RgbUnitx2>
__global__ static void YuvToRgbPlanarKernel(uint8_t *pYuv, int nYuvPitch, uint8_t *pRgbp, int nRgbpPitch, int nWidth, int nHeight) {
int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
if (x + 1 >= nWidth || y + 1 >= nHeight) {
return;
}
uint8_t *pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch;
YuvUnitx2 l0 = *(YuvUnitx2 *)pSrc;
YuvUnitx2 l1 = *(YuvUnitx2 *)(pSrc + nYuvPitch);
YuvUnitx2 ch = *(YuvUnitx2 *)(pSrc + (nHeight - y / 2) * nYuvPitch);
Rgb rgb0 = YuvToRgbForPixel<Rgb>(l0.x, ch.x, ch.y),
rgb1 = YuvToRgbForPixel<Rgb>(l0.y, ch.x, ch.y),
rgb2 = YuvToRgbForPixel<Rgb>(l1.x, ch.x, ch.y),
rgb3 = YuvToRgbForPixel<Rgb>(l1.y, ch.x, ch.y);
uint8_t *pDst = pRgbp + x * sizeof(RgbUnitx2) / 2 + y * nRgbpPitch;
*(RgbUnitx2 *)pDst = RgbUnitx2 {rgb0.v.x, rgb1.v.x};
*(RgbUnitx2 *)(pDst + nRgbpPitch) = RgbUnitx2 {rgb2.v.x, rgb3.v.x};
pDst += nRgbpPitch * nHeight;
*(RgbUnitx2 *)pDst = RgbUnitx2 {rgb0.v.y, rgb1.v.y};
*(RgbUnitx2 *)(pDst + nRgbpPitch) = RgbUnitx2 {rgb2.v.y, rgb3.v.y};
pDst += nRgbpPitch * nHeight;
*(RgbUnitx2 *)pDst = RgbUnitx2 {rgb0.v.z, rgb1.v.z};
*(RgbUnitx2 *)(pDst + nRgbpPitch) = RgbUnitx2 {rgb2.v.z, rgb3.v.z};
}
template<class YuvUnitx2, class Rgb, class RgbUnitx2>
__global__ static void Yuv444ToRgbPlanarKernel(uint8_t *pYuv, int nYuvPitch, uint8_t *pRgbp, int nRgbpPitch, int nWidth, int nHeight) {
int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
int y = (threadIdx.y + blockIdx.y * blockDim.y);
if (x + 1 >= nWidth || y >= nHeight) {
return;
}
uint8_t *pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch;
YuvUnitx2 l0 = *(YuvUnitx2 *)pSrc;
YuvUnitx2 ch1 = *(YuvUnitx2 *)(pSrc + (nHeight * nYuvPitch));
YuvUnitx2 ch2 = *(YuvUnitx2 *)(pSrc + (2 * nHeight * nYuvPitch));
Rgb rgb0 = YuvToRgbForPixel<Rgb>(l0.x, ch1.x, ch2.x),
rgb1 = YuvToRgbForPixel<Rgb>(l0.y, ch1.y, ch2.y);
uint8_t *pDst = pRgbp + x * sizeof(RgbUnitx2) / 2 + y * nRgbpPitch;
*(RgbUnitx2 *)pDst = RgbUnitx2{ rgb0.v.x, rgb1.v.x };
pDst += nRgbpPitch * nHeight;
*(RgbUnitx2 *)pDst = RgbUnitx2{ rgb0.v.y, rgb1.v.y };
pDst += nRgbpPitch * nHeight;
*(RgbUnitx2 *)pDst = RgbUnitx2{ rgb0.v.z, rgb1.v.z };
}
template <class COLOR32>
void Nv12ToColor32(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
YuvToRgbKernel<uchar2, COLOR32, uint2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2 / 2), dim3(32, 2)>>>
(dpNv12, nNv12Pitch, dpBgra, nBgraPitch, nWidth, nHeight);
}
template <class COLOR64>
void Nv12ToColor64(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
YuvToRgbKernel<uchar2, COLOR64, ulonglong2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2 / 2), dim3(32, 2)>>>
(dpNv12, nNv12Pitch, dpBgra, nBgraPitch, nWidth, nHeight);
}
template <class COLOR32>
void YUV444ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
Yuv444ToRgbKernel<uchar2, COLOR32, uint2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2), dim3(32, 2) >>>
(dpYUV444, nPitch, dpBgra, nBgraPitch, nWidth, nHeight);
}
template <class COLOR64>
void YUV444ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
Yuv444ToRgbKernel<uchar2, COLOR64, ulonglong2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2), dim3(32, 2) >>>
(dpYUV444, nPitch, dpBgra, nBgraPitch, nWidth, nHeight);
}
template <class COLOR32>
void P016ToColor32(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
YuvToRgbKernel<ushort2, COLOR32, uint2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2 / 2), dim3(32, 2)>>>
(dpP016, nP016Pitch, dpBgra, nBgraPitch, nWidth, nHeight);
}
template <class COLOR64>
void P016ToColor64(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
YuvToRgbKernel<ushort2, COLOR64, ulonglong2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2 / 2), dim3(32, 2)>>>
(dpP016, nP016Pitch, dpBgra, nBgraPitch, nWidth, nHeight);
}
template <class COLOR32>
void YUV444P16ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
Yuv444ToRgbKernel<ushort2, COLOR32, uint2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2), dim3(32, 2) >>>
(dpYUV444, nPitch, dpBgra, nBgraPitch, nWidth, nHeight);
}
template <class COLOR64>
void YUV444P16ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
Yuv444ToRgbKernel<ushort2, COLOR64, ulonglong2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2), dim3(32, 2) >>>
(dpYUV444, nPitch, dpBgra, nBgraPitch, nWidth, nHeight);
}
template <class COLOR32>
void Nv12ToColorPlanar(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
YuvToRgbPlanarKernel<uchar2, COLOR32, uchar2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2 / 2), dim3(32, 2)>>>
(dpNv12, nNv12Pitch, dpBgrp, nBgrpPitch, nWidth, nHeight);
}
template <class COLOR32>
void P016ToColorPlanar(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
YuvToRgbPlanarKernel<ushort2, COLOR32, uchar2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2 / 2), dim3(32, 2)>>>
(dpP016, nP016Pitch, dpBgrp, nBgrpPitch, nWidth, nHeight);
}
template <class COLOR32>
void YUV444ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
Yuv444ToRgbPlanarKernel<uchar2, COLOR32, uchar2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2), dim3(32, 2) >>>
(dpYUV444, nPitch, dpBgrp, nBgrpPitch, nWidth, nHeight);
}
template <class COLOR32>
void YUV444P16ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix) {
SetMatYuv2Rgb(iMatrix);
Yuv444ToRgbPlanarKernel<ushort2, COLOR32, uchar2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2), dim3(32, 2) >>>
(dpYUV444, nPitch, dpBgrp, nBgrpPitch, nWidth, nHeight);
}
// Explicit Instantiation
template void Nv12ToColor32<BGRA32>(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void Nv12ToColor32<RGBA32>(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void Nv12ToColor64<BGRA64>(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void Nv12ToColor64<RGBA64>(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444ToColor32<BGRA32>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444ToColor32<RGBA32>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444ToColor64<BGRA64>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444ToColor64<RGBA64>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void P016ToColor32<BGRA32>(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void P016ToColor32<RGBA32>(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void P016ToColor64<BGRA64>(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void P016ToColor64<RGBA64>(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444P16ToColor32<BGRA32>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444P16ToColor32<RGBA32>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444P16ToColor64<BGRA64>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444P16ToColor64<RGBA64>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix);
template void Nv12ToColorPlanar<BGRA32>(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix);
template void Nv12ToColorPlanar<RGBA32>(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix);
template void P016ToColorPlanar<BGRA32>(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix);
template void P016ToColorPlanar<RGBA32>(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444ToColorPlanar<BGRA32>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444ToColorPlanar<RGBA32>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444P16ToColorPlanar<BGRA32>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix);
template void YUV444P16ToColorPlanar<RGBA32>(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, int iMatrix);
template<class YuvUnit, class RgbUnit>
__device__ inline YuvUnit RgbToY(RgbUnit r, RgbUnit g, RgbUnit b) {
const YuvUnit low = 1 << (sizeof(YuvUnit) * 8 - 4);
return matRgb2Yuv[0][0] * r + matRgb2Yuv[0][1] * g + matRgb2Yuv[0][2] * b + low;
}
template<class YuvUnit, class RgbUnit>
__device__ inline YuvUnit RgbToU(RgbUnit r, RgbUnit g, RgbUnit b) {
const YuvUnit mid = 1 << (sizeof(YuvUnit) * 8 - 1);
return matRgb2Yuv[1][0] * r + matRgb2Yuv[1][1] * g + matRgb2Yuv[1][2] * b + mid;
}
template<class YuvUnit, class RgbUnit>
__device__ inline YuvUnit RgbToV(RgbUnit r, RgbUnit g, RgbUnit b) {
const YuvUnit mid = 1 << (sizeof(YuvUnit) * 8 - 1);
return matRgb2Yuv[2][0] * r + matRgb2Yuv[2][1] * g + matRgb2Yuv[2][2] * b + mid;
}
template<class YuvUnitx2, class Rgb, class RgbIntx2>
__global__ static void RgbToYuvKernel(uint8_t *pRgb, int nRgbPitch, uint8_t *pYuv, int nYuvPitch, int nWidth, int nHeight) {
int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
if (x + 1 >= nWidth || y + 1 >= nHeight) {
return;
}
uint8_t *pSrc = pRgb + x * sizeof(Rgb) + y * nRgbPitch;
RgbIntx2 int2a = *(RgbIntx2 *)pSrc;
RgbIntx2 int2b = *(RgbIntx2 *)(pSrc + nRgbPitch);
Rgb rgb[4] = {int2a.x, int2a.y, int2b.x, int2b.y};
decltype(Rgb::c.r)
r = (rgb[0].c.r + rgb[1].c.r + rgb[2].c.r + rgb[3].c.r) / 4,
g = (rgb[0].c.g + rgb[1].c.g + rgb[2].c.g + rgb[3].c.g) / 4,
b = (rgb[0].c.b + rgb[1].c.b + rgb[2].c.b + rgb[3].c.b) / 4;
uint8_t *pDst = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch;
*(YuvUnitx2 *)pDst = YuvUnitx2 {
RgbToY<decltype(YuvUnitx2::x)>(rgb[0].c.r, rgb[0].c.g, rgb[0].c.b),
RgbToY<decltype(YuvUnitx2::x)>(rgb[1].c.r, rgb[1].c.g, rgb[1].c.b),
};
*(YuvUnitx2 *)(pDst + nYuvPitch) = YuvUnitx2 {
RgbToY<decltype(YuvUnitx2::x)>(rgb[2].c.r, rgb[2].c.g, rgb[2].c.b),
RgbToY<decltype(YuvUnitx2::x)>(rgb[3].c.r, rgb[3].c.g, rgb[3].c.b),
};
*(YuvUnitx2 *)(pDst + (nHeight - y / 2) * nYuvPitch) = YuvUnitx2 {
RgbToU<decltype(YuvUnitx2::x)>(r, g, b),
RgbToV<decltype(YuvUnitx2::x)>(r, g, b),
};
}
void Bgra64ToP016(uint8_t *dpBgra, int nBgraPitch, uint8_t *dpP016, int nP016Pitch, int nWidth, int nHeight, int iMatrix) {
SetMatRgb2Yuv(iMatrix);
RgbToYuvKernel<ushort2, BGRA64, ulonglong2>
<<<dim3((nWidth + 63) / 32 / 2, (nHeight + 3) / 2 / 2), dim3(32, 2)>>>
(dpBgra, nBgraPitch, dpP016, nP016Pitch, nWidth, nHeight);
}
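/* A minimal usage sketch (illustrative, not part of the original file):
 * converting a decoded NV12 surface to interleaved BGRA on the device.
 * Buffers and pitches are assumed to be device allocations made by the
 * caller. */
void ExampleNv12ToBgra(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight)
{
    /* iMatrix selects the conversion standard from ColorSpace.h. */
    Nv12ToColor32<BGRA32>(dpNv12, nNv12Pitch, dpBgra, nBgraPitch, nWidth, nHeight, ColorSpaceStandard_BT709);
    cudaDeviceSynchronize(); /* the kernel launch is asynchronous */
}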


@@ -0,0 +1,48 @@
#pragma once
#include <stdint.h>
#include <cuda_runtime.h>
typedef enum ColorSpaceStandard {
ColorSpaceStandard_BT709 = 1,
ColorSpaceStandard_Unspecified = 2,
ColorSpaceStandard_Reserved = 3,
ColorSpaceStandard_FCC = 4,
ColorSpaceStandard_BT470 = 5,
ColorSpaceStandard_BT601 = 6,
ColorSpaceStandard_SMPTE240M = 7,
ColorSpaceStandard_YCgCo = 8,
ColorSpaceStandard_BT2020 = 9,
ColorSpaceStandard_BT2020C = 10
} ColorSpaceStandard;
union BGRA32 {
uint32_t d;
uchar4 v;
struct {
uint8_t b, g, r, a;
} c;
};
union RGBA32 {
uint32_t d;
uchar4 v;
struct {
uint8_t r, g, b, a;
} c;
};
union BGRA64 {
uint64_t d;
ushort4 v;
struct {
uint16_t b, g, r, a;
} c;
};
union RGBA64 {
uint64_t d;
ushort4 v;
struct {
uint16_t r, g, b, a;
} c;
};


@@ -0,0 +1,357 @@
/*
* Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#pragma once
extern "C" {
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
#include <libavcodec/avcodec.h>
}
#include "NvCodecUtils.h"
//---------------------------------------------------------------------------
//! \file FFmpegDemuxer.h
//! \brief Provides functionality for stream demuxing
//!
//! This header file is used by Decode/Transcode apps to demux input video clips before decoding frames from them.
//---------------------------------------------------------------------------
/**
* @brief libavformat wrapper class. Retrieves the elementary encoded stream from the container format.
*/
class FFmpegDemuxer {
private:
AVFormatContext *fmtc = NULL;
AVIOContext *avioc = NULL;
AVPacket pkt, pktFiltered; /*!< AVPacket stores compressed data typically exported by demuxers and then passed as input to decoders */
AVBSFContext *bsfc = NULL;
int iVideoStream;
bool bMp4H264, bMp4HEVC, bMp4MPEG4;
AVCodecID eVideoCodec;
AVPixelFormat eChromaFormat;
int nWidth, nHeight, nBitDepth, nBPP, nChromaHeight;
double timeBase = 0.0;
int64_t userTimeScale = 0;
uint8_t *pDataWithHeader = NULL;
unsigned int frameCount = 0;
public:
class DataProvider {
public:
virtual ~DataProvider() {}
virtual int GetData(uint8_t *pBuf, int nBuf) = 0;
};
private:
/**
* @brief Private constructor to initialize libavformat resources.
* @param fmtc - Pointer to AVFormatContext allocated inside avformat_open_input()
*/
FFmpegDemuxer(AVFormatContext *fmtc, int64_t timeScale = 1000 /*Hz*/) : fmtc(fmtc) {
if (!fmtc) {
LOG(ERROR) << "No AVFormatContext provided.";
return;
}
LOG(INFO) << "Media format: " << fmtc->iformat->long_name << " (" << fmtc->iformat->name << ")";
ck(avformat_find_stream_info(fmtc, NULL));
iVideoStream = av_find_best_stream(fmtc, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0);
if (iVideoStream < 0) {
LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " " << "Could not find stream in input file";
return;
}
//fmtc->streams[iVideoStream]->need_parsing = AVSTREAM_PARSE_NONE;
eVideoCodec = fmtc->streams[iVideoStream]->codecpar->codec_id;
nWidth = fmtc->streams[iVideoStream]->codecpar->width;
nHeight = fmtc->streams[iVideoStream]->codecpar->height;
eChromaFormat = (AVPixelFormat)fmtc->streams[iVideoStream]->codecpar->format;
AVRational rTimeBase = fmtc->streams[iVideoStream]->time_base;
timeBase = av_q2d(rTimeBase);
userTimeScale = timeScale;
// Set bit depth, chroma height, bits per pixel based on eChromaFormat of input
switch (eChromaFormat)
{
case AV_PIX_FMT_YUV420P10LE:
case AV_PIX_FMT_GRAY10LE: // monochrome is treated as 420 with chroma filled with 0x0
nBitDepth = 10;
nChromaHeight = (nHeight + 1) >> 1;
nBPP = 2;
break;
case AV_PIX_FMT_YUV420P12LE:
nBitDepth = 12;
nChromaHeight = (nHeight + 1) >> 1;
nBPP = 2;
break;
case AV_PIX_FMT_YUV444P10LE:
nBitDepth = 10;
nChromaHeight = nHeight << 1;
nBPP = 2;
break;
case AV_PIX_FMT_YUV444P12LE:
nBitDepth = 12;
nChromaHeight = nHeight << 1;
nBPP = 2;
break;
case AV_PIX_FMT_YUV444P:
nBitDepth = 8;
nChromaHeight = nHeight << 1;
nBPP = 1;
break;
case AV_PIX_FMT_YUV420P:
case AV_PIX_FMT_YUVJ420P:
case AV_PIX_FMT_YUVJ422P: // jpeg decoder output is subsampled to NV12 for 422/444 so treat it as 420
case AV_PIX_FMT_YUVJ444P: // jpeg decoder output is subsampled to NV12 for 422/444 so treat it as 420
case AV_PIX_FMT_GRAY8: // monochrome is treated as 420 with chroma filled with 0x0
nBitDepth = 8;
nChromaHeight = (nHeight + 1) >> 1;
nBPP = 1;
break;
default:
LOG(WARNING) << "ChromaFormat not recognized. Assuming 420";
eChromaFormat = AV_PIX_FMT_YUV420P;
nBitDepth = 8;
nChromaHeight = (nHeight + 1) >> 1;
nBPP = 1;
}
bMp4H264 = eVideoCodec == AV_CODEC_ID_H264 && (
!strcmp(fmtc->iformat->long_name, "QuickTime / MOV")
|| !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)")
|| !strcmp(fmtc->iformat->long_name, "Matroska / WebM")
);
bMp4HEVC = eVideoCodec == AV_CODEC_ID_HEVC && (
!strcmp(fmtc->iformat->long_name, "QuickTime / MOV")
|| !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)")
|| !strcmp(fmtc->iformat->long_name, "Matroska / WebM")
);
bMp4MPEG4 = eVideoCodec == AV_CODEC_ID_MPEG4 && (
!strcmp(fmtc->iformat->long_name, "QuickTime / MOV")
|| !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)")
|| !strcmp(fmtc->iformat->long_name, "Matroska / WebM")
);
//Initialize packet fields with default values
av_init_packet(&pkt);
pkt.data = NULL;
pkt.size = 0;
av_init_packet(&pktFiltered);
pktFiltered.data = NULL;
pktFiltered.size = 0;
// Initialize bitstream filter and its required resources
if (bMp4H264) {
const AVBitStreamFilter *bsf = av_bsf_get_by_name("h264_mp4toannexb");
if (!bsf) {
LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " " << "av_bsf_get_by_name() failed";
return;
}
ck(av_bsf_alloc(bsf, &bsfc));
avcodec_parameters_copy(bsfc->par_in, fmtc->streams[iVideoStream]->codecpar);
ck(av_bsf_init(bsfc));
}
if (bMp4HEVC) {
const AVBitStreamFilter *bsf = av_bsf_get_by_name("hevc_mp4toannexb");
if (!bsf) {
LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " " << "av_bsf_get_by_name() failed";
return;
}
ck(av_bsf_alloc(bsf, &bsfc));
avcodec_parameters_copy(bsfc->par_in, fmtc->streams[iVideoStream]->codecpar);
ck(av_bsf_init(bsfc));
}
}
AVFormatContext *CreateFormatContext(DataProvider *pDataProvider) {
AVFormatContext *ctx = NULL;
if (!(ctx = avformat_alloc_context())) {
LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
return NULL;
}
uint8_t *avioc_buffer = NULL;
int avioc_buffer_size = 8 * 1024 * 1024;
avioc_buffer = (uint8_t *)av_malloc(avioc_buffer_size);
if (!avioc_buffer) {
LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
return NULL;
}
avioc = avio_alloc_context(avioc_buffer, avioc_buffer_size,
0, pDataProvider, &ReadPacket, NULL, NULL);
if (!avioc) {
LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
return NULL;
}
ctx->pb = avioc;
ck(avformat_open_input(&ctx, NULL, NULL, NULL));
return ctx;
}
/**
* @brief Allocate and return AVFormatContext*.
* @param szFilePath - Filepath pointing to input stream.
* @return Pointer to AVFormatContext
*/
AVFormatContext *CreateFormatContext(const char *szFilePath) {
avformat_network_init();
AVFormatContext *ctx = NULL;
ck(avformat_open_input(&ctx, szFilePath, NULL, NULL));
return ctx;
}
public:
FFmpegDemuxer(const char *szFilePath, int64_t timescale = 1000 /*Hz*/) : FFmpegDemuxer(CreateFormatContext(szFilePath), timescale) {}
FFmpegDemuxer(DataProvider *pDataProvider) : FFmpegDemuxer(CreateFormatContext(pDataProvider)) {avioc = fmtc->pb;}
~FFmpegDemuxer() {
if (!fmtc) {
return;
}
if (pkt.data) {
av_packet_unref(&pkt);
}
if (pktFiltered.data) {
av_packet_unref(&pktFiltered);
}
if (bsfc) {
av_bsf_free(&bsfc);
}
avformat_close_input(&fmtc);
if (avioc) {
av_freep(&avioc->buffer);
av_freep(&avioc);
}
if (pDataWithHeader) {
av_free(pDataWithHeader);
}
}
AVCodecID GetVideoCodec() {
return eVideoCodec;
}
AVPixelFormat GetChromaFormat() {
return eChromaFormat;
}
int GetWidth() {
return nWidth;
}
int GetHeight() {
return nHeight;
}
int GetBitDepth() {
return nBitDepth;
}
int GetFrameSize() {
return nWidth * (nHeight + nChromaHeight) * nBPP;
}
bool Demux(uint8_t **ppVideo, int *pnVideoBytes, int64_t *pts = NULL) {
if (!fmtc) {
return false;
}
*pnVideoBytes = 0;
if (pkt.data) {
av_packet_unref(&pkt);
}
int e = 0;
while ((e = av_read_frame(fmtc, &pkt)) >= 0 && pkt.stream_index != iVideoStream) {
av_packet_unref(&pkt);
}
if (e < 0) {
return false;
}
if (bMp4H264 || bMp4HEVC) {
if (pktFiltered.data) {
av_packet_unref(&pktFiltered);
}
ck(av_bsf_send_packet(bsfc, &pkt));
ck(av_bsf_receive_packet(bsfc, &pktFiltered));
*ppVideo = pktFiltered.data;
*pnVideoBytes = pktFiltered.size;
if (pts)
*pts = (int64_t) (pktFiltered.pts * userTimeScale * timeBase);
} else {
if (bMp4MPEG4 && (frameCount == 0)) {
int extraDataSize = fmtc->streams[iVideoStream]->codecpar->extradata_size;
if (extraDataSize > 0) {
// extradata contains start codes 00 00 01. Subtract its size
pDataWithHeader = (uint8_t *)av_malloc(extraDataSize + pkt.size - 3*sizeof(uint8_t));
if (!pDataWithHeader) {
LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
return false;
}
memcpy(pDataWithHeader, fmtc->streams[iVideoStream]->codecpar->extradata, extraDataSize);
memcpy(pDataWithHeader+extraDataSize, pkt.data+3, pkt.size - 3*sizeof(uint8_t));
*ppVideo = pDataWithHeader;
*pnVideoBytes = extraDataSize + pkt.size - 3*sizeof(uint8_t);
}
} else {
*ppVideo = pkt.data;
*pnVideoBytes = pkt.size;
}
if (pts)
*pts = (int64_t)(pkt.pts * userTimeScale * timeBase);
}
frameCount++;
return true;
}
static int ReadPacket(void *opaque, uint8_t *pBuf, int nBuf) {
return ((DataProvider *)opaque)->GetData(pBuf, nBuf);
}
};
inline cudaVideoCodec FFmpeg2NvCodecId(AVCodecID id) {
switch (id) {
case AV_CODEC_ID_MPEG1VIDEO : return cudaVideoCodec_MPEG1;
case AV_CODEC_ID_MPEG2VIDEO : return cudaVideoCodec_MPEG2;
case AV_CODEC_ID_MPEG4 : return cudaVideoCodec_MPEG4;
case AV_CODEC_ID_WMV3 :
case AV_CODEC_ID_VC1 : return cudaVideoCodec_VC1;
case AV_CODEC_ID_H264 : return cudaVideoCodec_H264;
case AV_CODEC_ID_HEVC : return cudaVideoCodec_HEVC;
case AV_CODEC_ID_VP8 : return cudaVideoCodec_VP8;
case AV_CODEC_ID_VP9 : return cudaVideoCodec_VP9;
case AV_CODEC_ID_MJPEG : return cudaVideoCodec_JPEG;
case AV_CODEC_ID_AV1 : return cudaVideoCodec_AV1;
default : return cudaVideoCodec_NumCodecs;
}
}
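/* A minimal usage sketch (illustrative, not part of this header): a typical
 * demux loop; "input.mp4" is an assumed path. Each Demux() call yields one
 * compressed frame (Annex-B converted for MP4 H.264/HEVC) plus its pts. */
inline void ExampleDemuxLoop()
{
    FFmpegDemuxer demuxer("input.mp4");
    cudaVideoCodec eCodec = FFmpeg2NvCodecId(demuxer.GetVideoCodec());
    uint8_t *pVideo = NULL;
    int nVideoBytes = 0;
    int64_t pts = 0;
    while (demuxer.Demux(&pVideo, &nVideoBytes, &pts)) {
        /* Hand pVideo/nVideoBytes to the parser/decoder selected by eCodec. */
    }
}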


@@ -0,0 +1,109 @@
/*
* Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#pragma once
#include <thread>
#include <mutex>
extern "C" {
#include <libavformat/avformat.h>
#include <libavutil/opt.h>
#include <libswresample/swresample.h>
};
#include "Logger.h"
extern simplelogger::Logger *logger;
class FFmpegStreamer {
private:
AVFormatContext *oc = NULL;
AVStream *vs = NULL;
int nFps = 0;
public:
FFmpegStreamer(AVCodecID eCodecId, int nWidth, int nHeight, int nFps, const char *szInFilePath) : nFps(nFps) {
avformat_network_init();
oc = avformat_alloc_context();
if (!oc) {
LOG(ERROR) << "FFMPEG: avformat_alloc_context error";
return;
}
// Use MPEG-TS as the container format. av_guess_format() returns a shared
// format descriptor (const in newer FFmpeg), so it must not be modified;
// the codec id is set on the stream parameters below instead.
AVOutputFormat *fmt = av_guess_format("mpegts", NULL, NULL);
if (!fmt) {
LOG(ERROR) << "FFMPEG: Could not guess mpegts output format";
return;
}
oc->oformat = fmt;
oc->url = av_strdup(szInFilePath);
LOG(INFO) << "Streaming destination: " << oc->url;
// Add video stream to oc
vs = avformat_new_stream(oc, NULL);
if (!vs) {
LOG(ERROR) << "FFMPEG: Could not alloc video stream";
return;
}
vs->id = 0;
// Set video parameters
AVCodecParameters *vpar = vs->codecpar;
vpar->codec_id = eCodecId;
vpar->codec_type = AVMEDIA_TYPE_VIDEO;
vpar->width = nWidth;
vpar->height = nHeight;
// Everything is ready. Now open the output stream.
if (avio_open(&oc->pb, oc->url, AVIO_FLAG_WRITE) < 0) {
LOG(ERROR) << "FFMPEG: Could not open " << oc->url;
return;
}
// Write the container header
if (avformat_write_header(oc, NULL) < 0) {
LOG(ERROR) << "FFMPEG: avformat_write_header error!";
return;
}
}
~FFmpegStreamer() {
if (oc) {
av_write_trailer(oc);
avio_close(oc->pb);
avformat_free_context(oc);
}
}
bool Stream(uint8_t *pData, int nBytes, int nPts) {
AVPacket pkt = {0};
av_init_packet(&pkt);
// nPts is in frame units; rescale it to the stream time base
pkt.pts = av_rescale_q(nPts, AVRational {1, nFps}, vs->time_base);
// No B-frames
pkt.dts = pkt.pts;
pkt.stream_index = vs->index;
pkt.data = pData;
pkt.size = nBytes;
// Packets that begin with an SPS NAL unit (00 00 00 01 67, H.264) mark keyframes
if(!memcmp(pData, "\x00\x00\x00\x01\x67", 5)) {
pkt.flags |= AV_PKT_FLAG_KEY;
}
// Write the compressed frame into the output
int ret = av_write_frame(oc, &pkt);
av_write_frame(oc, NULL); // a NULL packet flushes the muxer's buffered data
if (ret < 0) {
LOG(ERROR) << "FFMPEG: Error while writing video frame";
}
return true;
}
};
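// --- Illustrative usage sketch (not part of the original sample) ---
// Push pre-encoded H.264 annex-B frames to an MPEG-TS destination. GetNextFrame
// is a hypothetical frame source; pts advances in frame units as Stream() expects.
inline void StreamLoopSketch(const char *szOutUrl, int nWidth, int nHeight, int nFps,
bool (*GetNextFrame)(uint8_t **ppData, int *pnBytes)) {
FFmpegStreamer streamer(AV_CODEC_ID_H264, nWidth, nHeight, nFps, szOutUrl);
uint8_t *pData = NULL;
int nBytes = 0;
for (int pts = 0; GetNextFrame(&pData, &nBytes); pts++) {
streamer.Stream(pData, nBytes, pts);
}
}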

View File

@@ -0,0 +1,490 @@
/*
* Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
//---------------------------------------------------------------------------
//! \file NvCodecUtils.h
//! \brief Miscellaneous classes and error checking functions.
//!
//! Used by Transcode/Encode samples apps for reading input files,
//! mutithreading, performance measurement or colorspace conversion while
//! decoding.
//---------------------------------------------------------------------------
#pragma once
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <sys/stat.h>
#include <chrono>
#include <condition_variable>
#include <fstream>
#include <iomanip>
#include <ios>
#include <list>
#include <sstream>
#include <thread>
#include "Logger.h"
extern simplelogger::Logger *logger;
#ifdef __cuda_cuda_h__
inline bool check(CUresult e, int iLine, const char *szFile) {
if (e != CUDA_SUCCESS) {
const char *szErrName = NULL;
cuGetErrorName(e, &szErrName);
LOG(ERROR) << "CUDA driver API error " << szErrName << " at line " << iLine << " in file " << szFile;
return false;
}
return true;
}
#endif
#ifdef __CUDA_RUNTIME_H__
inline bool check(cudaError_t e, int iLine, const char *szFile) {
if (e != cudaSuccess) {
LOG(ERROR) << "CUDA runtime API error " << cudaGetErrorName(e) << " at line " << iLine << " in file " << szFile;
return false;
}
return true;
}
#endif
#ifdef _NV_ENCODEAPI_H_
inline bool check(NVENCSTATUS e, int iLine, const char *szFile) {
const char *aszErrName[] = {
"NV_ENC_SUCCESS",
"NV_ENC_ERR_NO_ENCODE_DEVICE",
"NV_ENC_ERR_UNSUPPORTED_DEVICE",
"NV_ENC_ERR_INVALID_ENCODERDEVICE",
"NV_ENC_ERR_INVALID_DEVICE",
"NV_ENC_ERR_DEVICE_NOT_EXIST",
"NV_ENC_ERR_INVALID_PTR",
"NV_ENC_ERR_INVALID_EVENT",
"NV_ENC_ERR_INVALID_PARAM",
"NV_ENC_ERR_INVALID_CALL",
"NV_ENC_ERR_OUT_OF_MEMORY",
"NV_ENC_ERR_ENCODER_NOT_INITIALIZED",
"NV_ENC_ERR_UNSUPPORTED_PARAM",
"NV_ENC_ERR_LOCK_BUSY",
"NV_ENC_ERR_NOT_ENOUGH_BUFFER",
"NV_ENC_ERR_INVALID_VERSION",
"NV_ENC_ERR_MAP_FAILED",
"NV_ENC_ERR_NEED_MORE_INPUT",
"NV_ENC_ERR_ENCODER_BUSY",
"NV_ENC_ERR_EVENT_NOT_REGISTERD",
"NV_ENC_ERR_GENERIC",
"NV_ENC_ERR_INCOMPATIBLE_CLIENT_KEY",
"NV_ENC_ERR_UNIMPLEMENTED",
"NV_ENC_ERR_RESOURCE_REGISTER_FAILED",
"NV_ENC_ERR_RESOURCE_NOT_REGISTERED",
"NV_ENC_ERR_RESOURCE_NOT_MAPPED",
};
if (e != NV_ENC_SUCCESS) {
LOG(ERROR) << "NVENC error " << aszErrName[e] << " at line " << iLine << " in file " << szFile;
return false;
}
return true;
}
#endif
#ifdef _WINERROR_
inline bool check(HRESULT e, int iLine, const char *szFile) {
if (e != S_OK) {
std::stringstream stream;
stream << std::hex << std::uppercase << e;
LOG(ERROR) << "HRESULT error 0x" << stream.str() << " at line " << iLine << " in file " << szFile;
return false;
}
return true;
}
#endif
#if defined(__gl_h_) || defined(__GL_H__)
inline bool check(GLenum e, int iLine, const char *szFile) {
if (e != 0) {
LOG(ERROR) << "GLenum error " << e << " at line " << iLine << " in file " << szFile;
return false;
}
return true;
}
#endif
inline bool check(int e, int iLine, const char *szFile) {
if (e < 0) {
LOG(ERROR) << "General error " << e << " at line " << iLine << " in file " << szFile;
return false;
}
return true;
}
#define ck(call) check(call, __LINE__, __FILE__)
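#ifdef __cuda_cuda_h__
// Illustrative sketch (not part of the original sample): ck() forwards the
// return code plus call-site information to the matching check() overload,
// so failures can be tested inline.
inline bool CudaInitSketch() { return ck(cuInit(0)); }
#endif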
/**
* @brief Wrapper class around std::thread
*/
class NvThread {
public:
NvThread() = default;
NvThread(const NvThread &) = delete;
NvThread &operator=(const NvThread &other) = delete;
NvThread(std::thread &&thread) : t(std::move(thread)) {}
NvThread(NvThread &&thread) : t(std::move(thread.t)) {}
NvThread &operator=(NvThread &&other) {
t = std::move(other.t);
return *this;
}
~NvThread() { join(); }
void join() {
if (t.joinable()) {
t.join();
}
}
private:
std::thread t;
};
#ifndef _WIN32
#define _stricmp strcasecmp
#define _stat64 stat64
#endif
/**
* @brief Utility class to allocate buffer memory. Helps avoid I/O during the
* encode/decode loop in case of performance tests.
*/
class BufferedFileReader {
public:
/**
* @brief Constructor function to allocate appropriate memory and copy file
* contents into it
*/
BufferedFileReader(const char *szFileName, bool bPartial = false) {
struct _stat64 st;
if (_stat64(szFileName, &st) != 0) {
return;
}
nSize = st.st_size;
while (nSize) {
try {
pBuf = new uint8_t[(size_t)nSize];
if (nSize != st.st_size) {
LOG(WARNING) << "File is too large - only " << 100.0 * nSize / st.st_size << "% is loaded";
}
break;
} catch (const std::bad_alloc &) {
if (!bPartial) {
return;
}
nSize = (uint64_t)(nSize * 0.9);
}
}
std::ifstream fpIn(szFileName, std::ifstream::in | std::ifstream::binary);
if (!fpIn) {
return;
}
std::streamsize nRead =
fpIn.read(reinterpret_cast<char *>(pBuf), nSize).gcount();
fpIn.close();
assert(nRead == nSize);
}
~BufferedFileReader() {
if (pBuf) {
delete[] pBuf;
}
}
bool GetBuffer(uint8_t **ppBuf, uint64_t *pnSize) {
if (!pBuf) {
return false;
}
*ppBuf = pBuf;
*pnSize = nSize;
return true;
}
private:
uint8_t *pBuf = NULL;
uint64_t nSize = 0;
};
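// --- Illustrative usage sketch (not part of the original sample) ---
// Load a whole file up front so that later processing does no disk I/O.
inline uint64_t FileSizeSketch(const char *szFileName) {
BufferedFileReader reader(szFileName);
uint8_t *pBuf = NULL;
uint64_t nSize = 0;
return reader.GetBuffer(&pBuf, &nSize) ? nSize : 0;
}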
/**
* @brief Template class to facilitate color space conversion
*/
template <typename T>
class YuvConverter {
public:
YuvConverter(int nWidth, int nHeight) : nWidth(nWidth), nHeight(nHeight) {
pQuad = new T[((nWidth + 1) / 2) * ((nHeight + 1) / 2)];
}
~YuvConverter() { delete[] pQuad; }
void PlanarToUVInterleaved(T *pFrame, int nPitch = 0) {
if (nPitch == 0) {
nPitch = nWidth;
}
// sizes of source surface plane
int nSizePlaneY = nPitch * nHeight;
int nSizePlaneU = ((nPitch + 1) / 2) * ((nHeight + 1) / 2);
int nSizePlaneV = nSizePlaneU;
T *puv = pFrame + nSizePlaneY;
if (nPitch == nWidth) {
memcpy(pQuad, puv, nSizePlaneU * sizeof(T));
} else {
for (int i = 0; i < (nHeight + 1) / 2; i++) {
memcpy(pQuad + ((nWidth + 1) / 2) * i, puv + ((nPitch + 1) / 2) * i,
((nWidth + 1) / 2) * sizeof(T));
}
}
T *pv = puv + nSizePlaneU;
for (int y = 0; y < (nHeight + 1) / 2; y++) {
for (int x = 0; x < (nWidth + 1) / 2; x++) {
puv[y * nPitch + x * 2] = pQuad[y * ((nWidth + 1) / 2) + x];
puv[y * nPitch + x * 2 + 1] = pv[y * ((nPitch + 1) / 2) + x];
}
}
}
void UVInterleavedToPlanar(T *pFrame, int nPitch = 0) {
if (nPitch == 0) {
nPitch = nWidth;
}
// sizes of source surface plane
int nSizePlaneY = nPitch * nHeight;
int nSizePlaneU = ((nPitch + 1) / 2) * ((nHeight + 1) / 2);
int nSizePlaneV = nSizePlaneU;
T *puv = pFrame + nSizePlaneY, *pu = puv, *pv = puv + nSizePlaneU;
// split chroma from interleave to planar
for (int y = 0; y < (nHeight + 1) / 2; y++) {
for (int x = 0; x < (nWidth + 1) / 2; x++) {
pu[y * ((nPitch + 1) / 2) + x] = puv[y * nPitch + x * 2];
pQuad[y * ((nWidth + 1) / 2) + x] = puv[y * nPitch + x * 2 + 1];
}
}
if (nPitch == nWidth) {
memcpy(pv, pQuad, nSizePlaneV * sizeof(T));
} else {
for (int i = 0; i < (nHeight + 1) / 2; i++) {
memcpy(pv + ((nPitch + 1) / 2) * i, pQuad + ((nWidth + 1) / 2) * i,
((nWidth + 1) / 2) * sizeof(T));
}
}
}
private:
T *pQuad;
int nWidth, nHeight;
};
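// --- Illustrative usage sketch (not part of the original sample) ---
// Convert a tightly packed 8-bit I420 frame to NV12 in place: the Y plane is
// untouched and the U/V planes are interleaved into a single UV plane.
inline void I420ToNv12Sketch(uint8_t *pFrame, int nWidth, int nHeight) {
YuvConverter<uint8_t> converter(nWidth, nHeight);
converter.PlanarToUVInterleaved(pFrame);
}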
/**
* @brief Utility class to measure elapsed time in seconds between the block of
* executed code
*/
class StopWatch {
public:
void Start() { t0 = std::chrono::high_resolution_clock::now(); }
double Stop() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::high_resolution_clock::now().time_since_epoch() -
t0.time_since_epoch())
.count() /
1.0e9;
}
private:
std::chrono::high_resolution_clock::time_point t0;
};
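// --- Illustrative usage sketch (not part of the original sample) ---
// Time an arbitrary callable in seconds.
template <typename Fn>
inline double TimeItSketch(Fn &&fn) {
StopWatch watch;
watch.Start();
fn();
return watch.Stop();
}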
template <typename T>
class ConcurrentQueue {
public:
ConcurrentQueue() {}
ConcurrentQueue(size_t size) : maxSize(size) {}
ConcurrentQueue(const ConcurrentQueue &) = delete;
ConcurrentQueue &operator=(const ConcurrentQueue &) = delete;
void setSize(size_t s) { maxSize = s; }
void push_back(const T &value) {
// Do not use a std::lock_guard here. We will need to explicitly
// unlock before notify_one as the other waiting thread will
// automatically try to acquire mutex once it wakes up
// (which will happen on notify_one)
std::unique_lock<std::mutex> lock(m_mutex);
auto wasEmpty = m_List.empty();
while (full()) {
m_cond.wait(lock);
}
m_List.push_back(value);
if (wasEmpty && !m_List.empty()) {
lock.unlock();
m_cond.notify_one();
}
}
T pop_front() {
std::unique_lock<std::mutex> lock(m_mutex);
while (m_List.empty()) {
m_cond.wait(lock);
}
auto wasFull = full();
T data = std::move(m_List.front());
m_List.pop_front();
if (wasFull && !full()) {
lock.unlock();
m_cond.notify_one();
}
return data;
}
T front() {
std::unique_lock<std::mutex> lock(m_mutex);
while (m_List.empty()) {
m_cond.wait(lock);
}
return m_List.front();
}
size_t size() {
std::unique_lock<std::mutex> lock(m_mutex);
return m_List.size();
}
bool empty() {
std::unique_lock<std::mutex> lock(m_mutex);
return m_List.empty();
}
void clear() {
std::unique_lock<std::mutex> lock(m_mutex);
m_List.clear();
}
private:
bool full() {
if (m_List.size() == maxSize) return true;
return false;
}
private:
std::list<T> m_List;
std::mutex m_mutex;
std::condition_variable m_cond;
size_t maxSize = SIZE_MAX; // effectively unbounded until setSize() or the sizing constructor is used
};
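// --- Illustrative usage sketch (not part of the original sample) ---
// Bounded single-producer/single-consumer hand-off between two NvThreads:
// push_back() blocks while the queue is full, pop_front() while it is empty.
inline void ProducerConsumerSketch() {
ConcurrentQueue<int> queue(8);
NvThread producer(std::thread([&queue]() {
for (int i = 0; i < 64; i++) {
queue.push_back(i);
}
}));
NvThread consumer(std::thread([&queue]() {
for (int i = 0; i < 64; i++) {
(void)queue.pop_front();
}
}));
}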
inline void CheckInputFile(const char *szInFilePath) {
std::ifstream fpIn(szInFilePath, std::ios::in | std::ios::binary);
if (fpIn.fail()) {
std::ostringstream err;
err << "Unable to open input file: " << szInFilePath << std::endl;
throw std::invalid_argument(err.str());
}
}
inline void ValidateResolution(int nWidth, int nHeight) {
if (nWidth <= 0 || nHeight <= 0) {
std::ostringstream err;
err << "Please specify positive non zero resolution as -s WxH. Current "
"resolution is "
<< nWidth << "x" << nHeight << std::endl;
throw std::invalid_argument(err.str());
}
}
template <class COLOR32>
void Nv12ToColor32(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra,
int nBgraPitch, int nWidth, int nHeight, int iMatrix = 0);
template <class COLOR64>
void Nv12ToColor64(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra,
int nBgraPitch, int nWidth, int nHeight, int iMatrix = 0);
template <class COLOR32>
void P016ToColor32(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra,
int nBgraPitch, int nWidth, int nHeight, int iMatrix = 4);
template <class COLOR64>
void P016ToColor64(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra,
int nBgraPitch, int nWidth, int nHeight, int iMatrix = 4);
template <class COLOR32>
void YUV444ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra,
int nBgraPitch, int nWidth, int nHeight, int iMatrix = 0);
template <class COLOR64>
void YUV444ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra,
int nBgraPitch, int nWidth, int nHeight, int iMatrix = 0);
template <class COLOR32>
void YUV444P16ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra,
int nBgraPitch, int nWidth, int nHeight,
int iMatrix = 4);
template <class COLOR64>
void YUV444P16ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra,
int nBgraPitch, int nWidth, int nHeight,
int iMatrix = 4);
template <class COLOR32>
void Nv12ToColorPlanar(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgrp,
int nBgrpPitch, int nWidth, int nHeight,
int iMatrix = 0);
template <class COLOR32>
void P016ToColorPlanar(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgrp,
int nBgrpPitch, int nWidth, int nHeight,
int iMatrix = 4);
template <class COLOR32>
void YUV444ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp,
int nBgrpPitch, int nWidth, int nHeight,
int iMatrix = 0);
template <class COLOR32>
void YUV444P16ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp,
int nBgrpPitch, int nWidth, int nHeight,
int iMatrix = 4);
void Bgra64ToP016(uint8_t *dpBgra, int nBgraPitch, uint8_t *dpP016,
int nP016Pitch, int nWidth, int nHeight, int iMatrix = 4);
void ConvertUInt8ToUInt16(uint8_t *dpUInt8, uint16_t *dpUInt16, int nSrcPitch,
int nDestPitch, int nWidth, int nHeight);
void ConvertUInt16ToUInt8(uint16_t *dpUInt16, uint8_t *dpUInt8, int nSrcPitch,
int nDestPitch, int nWidth, int nHeight);
void ResizeNv12(unsigned char *dpDstNv12, int nDstPitch, int nDstWidth,
int nDstHeight, unsigned char *dpSrcNv12, int nSrcPitch,
int nSrcWidth, int nSrcHeight,
unsigned char *dpDstNv12UV = nullptr);
void ResizeP016(unsigned char *dpDstP016, int nDstPitch, int nDstWidth,
int nDstHeight, unsigned char *dpSrcP016, int nSrcPitch,
int nSrcWidth, int nSrcHeight,
unsigned char *dpDstP016UV = nullptr);
void ScaleYUV420(unsigned char *dpDstY, unsigned char *dpDstU,
unsigned char *dpDstV, int nDstPitch, int nDstChromaPitch,
int nDstWidth, int nDstHeight, unsigned char *dpSrcY,
unsigned char *dpSrcU, unsigned char *dpSrcV, int nSrcPitch,
int nSrcChromaPitch, int nSrcWidth, int nSrcHeight,
bool bSemiplanar);
#ifdef __cuda_cuda_h__
void ComputeCRC(uint8_t *pBuffer, uint32_t *crcValue,
CUstream_st *outputCUStream);
#endif

View File

@@ -0,0 +1,644 @@
/*
* Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#pragma once
#include <vector>
#include <string>
#include <algorithm>
#include <stdexcept>
#include <sstream>
#include <iterator>
#include <cctype>
#include <cstring>
#include <functional>
#include "../Utils/Logger.h"
extern simplelogger::Logger *logger;
#ifndef _WIN32
inline bool operator==(const GUID &guid1, const GUID &guid2) {
return !memcmp(&guid1, &guid2, sizeof(GUID));
}
inline bool operator!=(const GUID &guid1, const GUID &guid2) {
return !(guid1 == guid2);
}
#endif
/*
* Helper class for parsing generic encoder options and preparing encoder
* initialization parameters. This class also provides some utility methods
* which generate verbose descriptions of the provided set of encoder
* initialization parameters.
*/
class NvEncoderInitParam {
public:
NvEncoderInitParam(const char *szParam = "",
std::function<void(NV_ENC_INITIALIZE_PARAMS *pParams)> *pfuncInit = NULL, bool _bLowLatency = false)
: strParam(szParam), bLowLatency(_bLowLatency)
{
if (pfuncInit) {
funcInit = *pfuncInit;
}
std::transform(strParam.begin(), strParam.end(), strParam.begin(),
[](unsigned char c) { return (char)std::tolower(c); });
std::istringstream ss(strParam);
tokens = std::vector<std::string> {
std::istream_iterator<std::string>(ss),
std::istream_iterator<std::string>()
};
for (unsigned i = 0; i < tokens.size(); i++)
{
if (tokens[i] == "-codec" && ++i != tokens.size())
{
ParseString("-codec", tokens[i], vCodec, szCodecNames, &guidCodec);
continue;
}
if (tokens[i] == "-preset" && ++i != tokens.size()) {
ParseString("-preset", tokens[i], vPreset, szPresetNames, &guidPreset);
continue;
}
if (tokens[i] == "-tuninginfo" && ++i != tokens.size())
{
ParseString("-tuninginfo", tokens[i], vTuningInfo, szTuningInfoNames, &m_TuningInfo);
continue;
}
}
}
virtual ~NvEncoderInitParam() {}
virtual bool IsCodecH264() {
return GetEncodeGUID() == NV_ENC_CODEC_H264_GUID;
}
virtual bool IsCodecHEVC() {
return GetEncodeGUID() == NV_ENC_CODEC_HEVC_GUID;
}
std::string GetHelpMessage(bool bMeOnly = false, bool bUnbuffered = false, bool bHide444 = false, bool bOutputInVidMem = false)
{
std::ostringstream oss;
if (bOutputInVidMem && bMeOnly)
{
oss << "-codec Codec: " << "h264" << std::endl;
}
else
{
oss << "-codec Codec: " << szCodecNames << std::endl;
}
oss << "-preset Preset: " << szPresetNames << std::endl
<< "-profile H264: " << szH264ProfileNames;
if (bOutputInVidMem && bMeOnly)
{
oss << std::endl;
}
else
{
oss << "; HEVC: " << szHevcProfileNames << std::endl;
}
if (!bMeOnly)
{
if (bLowLatency == false)
oss << "-tuninginfo TuningInfo: " << szTuningInfoNames << std::endl;
else
oss << "-tuninginfo TuningInfo: " << szLowLatencyTuningInfoNames << std::endl;
oss << "-multipass Multipass: " << szMultipass << std::endl;
}
if (!bHide444 && !bLowLatency)
{
oss << "-444 (Only for RGB input) YUV444 encode" << std::endl;
}
if (bMeOnly) return oss.str();
oss << "-fps Frame rate" << std::endl;
if (!bUnbuffered && !bLowLatency)
{
oss << "-bf Number of consecutive B-frames" << std::endl;
}
if (!bLowLatency)
{
oss << "-rc Rate control mode: " << szRcModeNames << std::endl
<< "-gop Length of GOP (Group of Pictures)" << std::endl
<< "-bitrate Average bit rate, can be in unit of 1, K, M" << std::endl
<< "-maxbitrate Max bit rate, can be in unit of 1, K, M" << std::endl
<< "-vbvbufsize VBV buffer size in bits, can be in unit of 1, K, M" << std::endl
<< "-vbvinit VBV initial delay in bits, can be in unit of 1, K, M" << std::endl
<< "-aq Enable spatial AQ and set its stength (range 1-15, 0-auto)" << std::endl
<< "-temporalaq (No value) Enable temporal AQ" << std::endl
<< "-cq Target constant quality level for VBR mode (range 1-51, 0-auto)" << std::endl;
}
if (!bUnbuffered && !bLowLatency)
{
oss << "-lookahead Maximum depth of lookahead (range 0-(31 - number of B frames))" << std::endl;
}
oss << "-qmin Min QP value" << std::endl
<< "-qmax Max QP value" << std::endl
<< "-initqp Initial QP value" << std::endl;
if (!bLowLatency)
{
oss << "-constqp QP value for constqp rate control mode" << std::endl
<< "Note: QP value can be in the form of qp_of_P_B_I or qp_P,qp_B,qp_I (no space)" << std::endl;
}
if (bUnbuffered && !bLowLatency)
{
oss << "Note: Options -bf and -lookahead are unavailable for this app" << std::endl;
}
return oss.str();
}
/**
* @brief Generate and return a string describing the values of the main/common
* encoder initialization parameters
*/
std::string MainParamToString(const NV_ENC_INITIALIZE_PARAMS *pParams) {
std::ostringstream os;
os
<< "Encoding Parameters:"
<< std::endl << "\tcodec : " << ConvertValueToString(vCodec, szCodecNames, pParams->encodeGUID)
<< std::endl << "\tpreset : " << ConvertValueToString(vPreset, szPresetNames, pParams->presetGUID);
if (pParams->tuningInfo)
{
os << std::endl << "\ttuningInfo : " << ConvertValueToString(vTuningInfo, szTuningInfoNames, pParams->tuningInfo);
}
os
<< std::endl << "\tprofile : " << ConvertValueToString(vProfile, szProfileNames, pParams->encodeConfig->profileGUID)
<< std::endl << "\tchroma : " << ConvertValueToString(vChroma, szChromaNames, (pParams->encodeGUID == NV_ENC_CODEC_H264_GUID) ? pParams->encodeConfig->encodeCodecConfig.h264Config.chromaFormatIDC : pParams->encodeConfig->encodeCodecConfig.hevcConfig.chromaFormatIDC)
<< std::endl << "\tbitdepth : " << ((pParams->encodeGUID == NV_ENC_CODEC_H264_GUID) ? 0 : pParams->encodeConfig->encodeCodecConfig.hevcConfig.pixelBitDepthMinus8) + 8
<< std::endl << "\trc : " << ConvertValueToString(vRcMode, szRcModeNames, pParams->encodeConfig->rcParams.rateControlMode)
;
if (pParams->encodeConfig->rcParams.rateControlMode == NV_ENC_PARAMS_RC_CONSTQP) {
os << " (P,B,I=" << pParams->encodeConfig->rcParams.constQP.qpInterP << "," << pParams->encodeConfig->rcParams.constQP.qpInterB << "," << pParams->encodeConfig->rcParams.constQP.qpIntra << ")";
}
os
<< std::endl << "\tfps : " << pParams->frameRateNum << "/" << pParams->frameRateDen
<< std::endl << "\tgop : " << (pParams->encodeConfig->gopLength == NVENC_INFINITE_GOPLENGTH ? "INF" : std::to_string(pParams->encodeConfig->gopLength))
<< std::endl << "\tbf : " << pParams->encodeConfig->frameIntervalP - 1
<< std::endl << "\tmultipass : " << pParams->encodeConfig->rcParams.multiPass
<< std::endl << "\tsize : " << pParams->encodeWidth << "x" << pParams->encodeHeight
<< std::endl << "\tbitrate : " << pParams->encodeConfig->rcParams.averageBitRate
<< std::endl << "\tmaxbitrate : " << pParams->encodeConfig->rcParams.maxBitRate
<< std::endl << "\tvbvbufsize : " << pParams->encodeConfig->rcParams.vbvBufferSize
<< std::endl << "\tvbvinit : " << pParams->encodeConfig->rcParams.vbvInitialDelay
<< std::endl << "\taq : " << (pParams->encodeConfig->rcParams.enableAQ ? (pParams->encodeConfig->rcParams.aqStrength ? std::to_string(pParams->encodeConfig->rcParams.aqStrength) : "auto") : "disabled")
<< std::endl << "\ttemporalaq : " << (pParams->encodeConfig->rcParams.enableTemporalAQ ? "enabled" : "disabled")
<< std::endl << "\tlookahead : " << (pParams->encodeConfig->rcParams.enableLookahead ? std::to_string(pParams->encodeConfig->rcParams.lookaheadDepth) : "disabled")
<< std::endl << "\tcq : " << (unsigned int)pParams->encodeConfig->rcParams.targetQuality
<< std::endl << "\tqmin : P,B,I=" << (int)pParams->encodeConfig->rcParams.minQP.qpInterP << "," << (int)pParams->encodeConfig->rcParams.minQP.qpInterB << "," << (int)pParams->encodeConfig->rcParams.minQP.qpIntra
<< std::endl << "\tqmax : P,B,I=" << (int)pParams->encodeConfig->rcParams.maxQP.qpInterP << "," << (int)pParams->encodeConfig->rcParams.maxQP.qpInterB << "," << (int)pParams->encodeConfig->rcParams.maxQP.qpIntra
<< std::endl << "\tinitqp : P,B,I=" << (int)pParams->encodeConfig->rcParams.initialRCQP.qpInterP << "," << (int)pParams->encodeConfig->rcParams.initialRCQP.qpInterB << "," << (int)pParams->encodeConfig->rcParams.initialRCQP.qpIntra
;
return os.str();
}
public:
virtual GUID GetEncodeGUID() { return guidCodec; }
virtual GUID GetPresetGUID() { return guidPreset; }
virtual NV_ENC_TUNING_INFO GetTuningInfo() { return m_TuningInfo; }
/*
* @brief Set encoder initialization parameters based on input options
* This method parses the tokens formed from the command line options
* provided to the application and sets the fields from NV_ENC_INITIALIZE_PARAMS
* based on the supplied values.
*/
virtual void SetInitParams(NV_ENC_INITIALIZE_PARAMS *pParams, NV_ENC_BUFFER_FORMAT eBufferFormat)
{
NV_ENC_CONFIG &config = *pParams->encodeConfig;
for (unsigned i = 0; i < tokens.size(); i++)
{
if (
tokens[i] == "-codec" && ++i ||
tokens[i] == "-preset" && ++i ||
tokens[i] == "-tuninginfo" && ++i ||
tokens[i] == "-multipass" && ++i != tokens.size() && ParseString("-multipass", tokens[i], vMultiPass, szMultipass, &config.rcParams.multiPass) ||
tokens[i] == "-profile" && ++i != tokens.size() && (IsCodecH264() ?
ParseString("-profile", tokens[i], vH264Profile, szH264ProfileNames, &config.profileGUID) :
ParseString("-profile", tokens[i], vHevcProfile, szHevcProfileNames, &config.profileGUID)) ||
tokens[i] == "-rc" && ++i != tokens.size() && ParseString("-rc", tokens[i], vRcMode, szRcModeNames, &config.rcParams.rateControlMode) ||
tokens[i] == "-fps" && ++i != tokens.size() && ParseInt("-fps", tokens[i], &pParams->frameRateNum) ||
tokens[i] == "-bf" && ++i != tokens.size() && ParseInt("-bf", tokens[i], &config.frameIntervalP) && ++config.frameIntervalP ||
tokens[i] == "-bitrate" && ++i != tokens.size() && ParseBitRate("-bitrate", tokens[i], &config.rcParams.averageBitRate) ||
tokens[i] == "-maxbitrate" && ++i != tokens.size() && ParseBitRate("-maxbitrate", tokens[i], &config.rcParams.maxBitRate) ||
tokens[i] == "-vbvbufsize" && ++i != tokens.size() && ParseBitRate("-vbvbufsize", tokens[i], &config.rcParams.vbvBufferSize) ||
tokens[i] == "-vbvinit" && ++i != tokens.size() && ParseBitRate("-vbvinit", tokens[i], &config.rcParams.vbvInitialDelay) ||
tokens[i] == "-cq" && ++i != tokens.size() && ParseInt("-cq", tokens[i], &config.rcParams.targetQuality) ||
tokens[i] == "-initqp" && ++i != tokens.size() && ParseQp("-initqp", tokens[i], &config.rcParams.initialRCQP) && (config.rcParams.enableInitialRCQP = true) ||
tokens[i] == "-qmin" && ++i != tokens.size() && ParseQp("-qmin", tokens[i], &config.rcParams.minQP) && (config.rcParams.enableMinQP = true) ||
tokens[i] == "-qmax" && ++i != tokens.size() && ParseQp("-qmax", tokens[i], &config.rcParams.maxQP) && (config.rcParams.enableMaxQP = true) ||
tokens[i] == "-constqp" && ++i != tokens.size() && ParseQp("-constqp", tokens[i], &config.rcParams.constQP) ||
tokens[i] == "-temporalaq" && (config.rcParams.enableTemporalAQ = true)
)
{
continue;
}
if (tokens[i] == "-lookahead" && ++i != tokens.size() && ParseInt("-lookahead", tokens[i], &config.rcParams.lookaheadDepth))
{
config.rcParams.enableLookahead = config.rcParams.lookaheadDepth > 0;
continue;
}
int aqStrength;
if (tokens[i] == "-aq" && ++i != tokens.size() && ParseInt("-aq", tokens[i], &aqStrength)) {
config.rcParams.enableAQ = true;
config.rcParams.aqStrength = aqStrength;
continue;
}
if (tokens[i] == "-gop" && ++i != tokens.size() && ParseInt("-gop", tokens[i], &config.gopLength))
{
if (IsCodecH264())
{
config.encodeCodecConfig.h264Config.idrPeriod = config.gopLength;
}
else
{
config.encodeCodecConfig.hevcConfig.idrPeriod = config.gopLength;
}
continue;
}
if (tokens[i] == "-444")
{
if (IsCodecH264())
{
config.encodeCodecConfig.h264Config.chromaFormatIDC = 3;
} else
{
config.encodeCodecConfig.hevcConfig.chromaFormatIDC = 3;
}
continue;
}
std::ostringstream errmessage;
errmessage << "Incorrect parameter: " << tokens[i] << std::endl;
errmessage << "Re-run the application with the -h option to get a list of the supported options.";
errmessage << std::endl;
throw std::invalid_argument(errmessage.str());
}
if (IsCodecHEVC())
{
if (eBufferFormat == NV_ENC_BUFFER_FORMAT_YUV420_10BIT || eBufferFormat == NV_ENC_BUFFER_FORMAT_YUV444_10BIT)
{
config.encodeCodecConfig.hevcConfig.pixelBitDepthMinus8 = 2;
}
}
funcInit(pParams);
LOG(INFO) << NvEncoderInitParam().MainParamToString(pParams);
LOG(TRACE) << NvEncoderInitParam().FullParamToString(pParams);
}
private:
/*
* Helper methods for parsing tokens (generated by splitting the command line)
* and performing conversions to the appropriate target type/value.
*/
template<typename T>
bool ParseString(const std::string &strName, const std::string &strValue, const std::vector<T> &vValue, const std::string &strValueNames, T *pValue) {
std::vector<std::string> vstrValueName = split(strValueNames, ' ');
auto it = std::find(vstrValueName.begin(), vstrValueName.end(), strValue);
if (it == vstrValueName.end()) {
LOG(ERROR) << strName << " options: " << strValueNames;
return false;
}
*pValue = vValue[it - vstrValueName.begin()];
return true;
}
template<typename T>
std::string ConvertValueToString(const std::vector<T> &vValue, const std::string &strValueNames, T value) {
auto it = std::find(vValue.begin(), vValue.end(), value);
if (it == vValue.end()) {
LOG(ERROR) << "Invalid value. Can't convert to one of " << strValueNames;
return std::string();
}
return split(strValueNames, ' ')[it - vValue.begin()];
}
bool ParseBitRate(const std::string &strName, const std::string &strValue, unsigned *pBitRate) {
try {
size_t l;
double r = std::stod(strValue, &l);
char c = strValue[l];
if (c != 0 && c != 'k' && c != 'm') {
LOG(ERROR) << strName << " units: 1, K, M (lower case also allowed)";
}
*pBitRate = (unsigned)((c == 'm' ? 1000000 : (c == 'k' ? 1000 : 1)) * r);
} catch (const std::invalid_argument &) {
return false;
}
return true;
}
template<typename T>
bool ParseInt(const std::string &strName, const std::string &strValue, T *pInt) {
try {
*pInt = std::stoi(strValue);
} catch (const std::invalid_argument &) {
LOG(ERROR) << strName << " need a value of positive number";
return false;
}
return true;
}
bool ParseQp(const std::string &strName, const std::string &strValue, NV_ENC_QP *pQp) {
std::vector<std::string> vQp = split(strValue, ',');
try {
if (vQp.size() == 1) {
unsigned qp = (unsigned)std::stoi(vQp[0]);
*pQp = {qp, qp, qp};
} else if (vQp.size() == 3) {
*pQp = {(unsigned)std::stoi(vQp[0]), (unsigned)std::stoi(vQp[1]), (unsigned)std::stoi(vQp[2])};
} else {
LOG(ERROR) << strName << " qp_for_P_B_I or qp_P,qp_B,qp_I (no space is allowed)";
return false;
}
} catch (const std::invalid_argument &) {
return false;
}
return true;
}
std::vector<std::string> split(const std::string &s, char delim) {
std::stringstream ss(s);
std::string token;
std::vector<std::string> tokens;
while (getline(ss, token, delim)) {
tokens.push_back(token);
}
return tokens;
}
private:
std::string strParam;
std::function<void(NV_ENC_INITIALIZE_PARAMS *pParams)> funcInit = [](NV_ENC_INITIALIZE_PARAMS *pParams){};
std::vector<std::string> tokens;
GUID guidCodec = NV_ENC_CODEC_H264_GUID;
GUID guidPreset = NV_ENC_PRESET_P3_GUID;
NV_ENC_TUNING_INFO m_TuningInfo = NV_ENC_TUNING_INFO_HIGH_QUALITY;
bool bLowLatency = false;
const char *szCodecNames = "h264 hevc";
std::vector<GUID> vCodec = std::vector<GUID> {
NV_ENC_CODEC_H264_GUID,
NV_ENC_CODEC_HEVC_GUID
};
const char *szChromaNames = "yuv420 yuv444";
std::vector<uint32_t> vChroma = std::vector<uint32_t>
{
1, 3
};
const char *szPresetNames = "p1 p2 p3 p4 p5 p6 p7";
std::vector<GUID> vPreset = std::vector<GUID> {
NV_ENC_PRESET_P1_GUID,
NV_ENC_PRESET_P2_GUID,
NV_ENC_PRESET_P3_GUID,
NV_ENC_PRESET_P4_GUID,
NV_ENC_PRESET_P5_GUID,
NV_ENC_PRESET_P6_GUID,
NV_ENC_PRESET_P7_GUID,
};
const char *szH264ProfileNames = "baseline main high high444";
std::vector<GUID> vH264Profile = std::vector<GUID> {
NV_ENC_H264_PROFILE_BASELINE_GUID,
NV_ENC_H264_PROFILE_MAIN_GUID,
NV_ENC_H264_PROFILE_HIGH_GUID,
NV_ENC_H264_PROFILE_HIGH_444_GUID,
};
const char *szHevcProfileNames = "main main10 frext";
std::vector<GUID> vHevcProfile = std::vector<GUID> {
NV_ENC_HEVC_PROFILE_MAIN_GUID,
NV_ENC_HEVC_PROFILE_MAIN10_GUID,
NV_ENC_HEVC_PROFILE_FREXT_GUID,
};
const char *szProfileNames = "(default) auto baseline(h264) main(h264) high(h264) high444(h264)"
" stereo(h264) progressiv_high(h264) constrained_high(h264)"
" main(hevc) main10(hevc) frext(hevc)";
std::vector<GUID> vProfile = std::vector<GUID> {
GUID{},
NV_ENC_CODEC_PROFILE_AUTOSELECT_GUID,
NV_ENC_H264_PROFILE_BASELINE_GUID,
NV_ENC_H264_PROFILE_MAIN_GUID,
NV_ENC_H264_PROFILE_HIGH_GUID,
NV_ENC_H264_PROFILE_HIGH_444_GUID,
NV_ENC_H264_PROFILE_STEREO_GUID,
NV_ENC_H264_PROFILE_PROGRESSIVE_HIGH_GUID,
NV_ENC_H264_PROFILE_CONSTRAINED_HIGH_GUID,
NV_ENC_HEVC_PROFILE_MAIN_GUID,
NV_ENC_HEVC_PROFILE_MAIN10_GUID,
NV_ENC_HEVC_PROFILE_FREXT_GUID,
};
const char *szLowLatencyTuningInfoNames = "lowlatency ultralowlatency";
const char *szTuningInfoNames = "hq lowlatency ultralowlatency lossless";
std::vector<NV_ENC_TUNING_INFO> vTuningInfo = std::vector<NV_ENC_TUNING_INFO>{
NV_ENC_TUNING_INFO_HIGH_QUALITY,
NV_ENC_TUNING_INFO_LOW_LATENCY,
NV_ENC_TUNING_INFO_ULTRA_LOW_LATENCY,
NV_ENC_TUNING_INFO_LOSSLESS
};
const char *szRcModeNames = "constqp vbr cbr";
std::vector<NV_ENC_PARAMS_RC_MODE> vRcMode = std::vector<NV_ENC_PARAMS_RC_MODE> {
NV_ENC_PARAMS_RC_CONSTQP,
NV_ENC_PARAMS_RC_VBR,
NV_ENC_PARAMS_RC_CBR,
};
const char *szMultipass = "disabled qres fullres";
std::vector<NV_ENC_MULTI_PASS> vMultiPass = std::vector<NV_ENC_MULTI_PASS>{
NV_ENC_MULTI_PASS_DISABLED,
NV_ENC_TWO_PASS_QUARTER_RESOLUTION,
NV_ENC_TWO_PASS_FULL_RESOLUTION,
};
const char *szQpMapModeNames = "disabled emphasis_level_map delta_qp_map qp_map";
std::vector<NV_ENC_QP_MAP_MODE> vQpMapMode = std::vector<NV_ENC_QP_MAP_MODE> {
NV_ENC_QP_MAP_DISABLED,
NV_ENC_QP_MAP_EMPHASIS,
NV_ENC_QP_MAP_DELTA,
NV_ENC_QP_MAP,
};
public:
/*
* Generates and returns a string describing the values for each field in
* the NV_ENC_INITIALIZE_PARAMS structure (i.e. a description of the entire
* set of initialization parameters supplied to the API).
*/
std::string FullParamToString(const NV_ENC_INITIALIZE_PARAMS *pInitializeParams) {
std::ostringstream os;
os << "NV_ENC_INITIALIZE_PARAMS:" << std::endl
<< "encodeGUID: " << ConvertValueToString(vCodec, szCodecNames, pInitializeParams->encodeGUID) << std::endl
<< "presetGUID: " << ConvertValueToString(vPreset, szPresetNames, pInitializeParams->presetGUID) << std::endl;
if (pInitializeParams->tuningInfo)
{
os << "tuningInfo: " << ConvertValueToString(vTuningInfo, szTuningInfoNames, pInitializeParams->tuningInfo) << std::endl;
}
os
<< "encodeWidth: " << pInitializeParams->encodeWidth << std::endl
<< "encodeHeight: " << pInitializeParams->encodeHeight << std::endl
<< "darWidth: " << pInitializeParams->darWidth << std::endl
<< "darHeight: " << pInitializeParams->darHeight << std::endl
<< "frameRateNum: " << pInitializeParams->frameRateNum << std::endl
<< "frameRateDen: " << pInitializeParams->frameRateDen << std::endl
<< "enableEncodeAsync: " << pInitializeParams->enableEncodeAsync << std::endl
<< "reportSliceOffsets: " << pInitializeParams->reportSliceOffsets << std::endl
<< "enableSubFrameWrite: " << pInitializeParams->enableSubFrameWrite << std::endl
<< "enableExternalMEHints: " << pInitializeParams->enableExternalMEHints << std::endl
<< "enableMEOnlyMode: " << pInitializeParams->enableMEOnlyMode << std::endl
<< "enableWeightedPrediction: " << pInitializeParams->enableWeightedPrediction << std::endl
<< "maxEncodeWidth: " << pInitializeParams->maxEncodeWidth << std::endl
<< "maxEncodeHeight: " << pInitializeParams->maxEncodeHeight << std::endl
<< "maxMEHintCountsPerBlock: " << pInitializeParams->maxMEHintCountsPerBlock << std::endl
;
NV_ENC_CONFIG *pConfig = pInitializeParams->encodeConfig;
os << "NV_ENC_CONFIG:" << std::endl
<< "profile: " << ConvertValueToString(vProfile, szProfileNames, pConfig->profileGUID) << std::endl
<< "gopLength: " << pConfig->gopLength << std::endl
<< "frameIntervalP: " << pConfig->frameIntervalP << std::endl
<< "monoChromeEncoding: " << pConfig->monoChromeEncoding << std::endl
<< "frameFieldMode: " << pConfig->frameFieldMode << std::endl
<< "mvPrecision: " << pConfig->mvPrecision << std::endl
<< "NV_ENC_RC_PARAMS:" << std::endl
<< " rateControlMode: 0x" << std::hex << pConfig->rcParams.rateControlMode << std::dec << std::endl
<< " constQP: " << pConfig->rcParams.constQP.qpInterP << ", " << pConfig->rcParams.constQP.qpInterB << ", " << pConfig->rcParams.constQP.qpIntra << std::endl
<< " averageBitRate: " << pConfig->rcParams.averageBitRate << std::endl
<< " maxBitRate: " << pConfig->rcParams.maxBitRate << std::endl
<< " vbvBufferSize: " << pConfig->rcParams.vbvBufferSize << std::endl
<< " vbvInitialDelay: " << pConfig->rcParams.vbvInitialDelay << std::endl
<< " enableMinQP: " << pConfig->rcParams.enableMinQP << std::endl
<< " enableMaxQP: " << pConfig->rcParams.enableMaxQP << std::endl
<< " enableInitialRCQP: " << pConfig->rcParams.enableInitialRCQP << std::endl
<< " enableAQ: " << pConfig->rcParams.enableAQ << std::endl
<< " qpMapMode: " << ConvertValueToString(vQpMapMode, szQpMapModeNames, pConfig->rcParams.qpMapMode) << std::endl
<< " multipass: " << ConvertValueToString(vMultiPass, szMultipass, pConfig->rcParams.multiPass) << std::endl
<< " enableLookahead: " << pConfig->rcParams.enableLookahead << std::endl
<< " disableIadapt: " << pConfig->rcParams.disableIadapt << std::endl
<< " disableBadapt: " << pConfig->rcParams.disableBadapt << std::endl
<< " enableTemporalAQ: " << pConfig->rcParams.enableTemporalAQ << std::endl
<< " zeroReorderDelay: " << pConfig->rcParams.zeroReorderDelay << std::endl
<< " enableNonRefP: " << pConfig->rcParams.enableNonRefP << std::endl
<< " strictGOPTarget: " << pConfig->rcParams.strictGOPTarget << std::endl
<< " aqStrength: " << pConfig->rcParams.aqStrength << std::endl
<< " minQP: " << pConfig->rcParams.minQP.qpInterP << ", " << pConfig->rcParams.minQP.qpInterB << ", " << pConfig->rcParams.minQP.qpIntra << std::endl
<< " maxQP: " << pConfig->rcParams.maxQP.qpInterP << ", " << pConfig->rcParams.maxQP.qpInterB << ", " << pConfig->rcParams.maxQP.qpIntra << std::endl
<< " initialRCQP: " << pConfig->rcParams.initialRCQP.qpInterP << ", " << pConfig->rcParams.initialRCQP.qpInterB << ", " << pConfig->rcParams.initialRCQP.qpIntra << std::endl
<< " temporallayerIdxMask: " << pConfig->rcParams.temporallayerIdxMask << std::endl
<< " temporalLayerQP: " << (int)pConfig->rcParams.temporalLayerQP[0] << ", " << (int)pConfig->rcParams.temporalLayerQP[1] << ", " << (int)pConfig->rcParams.temporalLayerQP[2] << ", " << (int)pConfig->rcParams.temporalLayerQP[3] << ", " << (int)pConfig->rcParams.temporalLayerQP[4] << ", " << (int)pConfig->rcParams.temporalLayerQP[5] << ", " << (int)pConfig->rcParams.temporalLayerQP[6] << ", " << (int)pConfig->rcParams.temporalLayerQP[7] << std::endl
<< " targetQuality: " << pConfig->rcParams.targetQuality << std::endl
<< " lookaheadDepth: " << pConfig->rcParams.lookaheadDepth << std::endl;
if (pInitializeParams->encodeGUID == NV_ENC_CODEC_H264_GUID) {
os
<< "NV_ENC_CODEC_CONFIG (H264):" << std::endl
<< " enableStereoMVC: " << pConfig->encodeCodecConfig.h264Config.enableStereoMVC << std::endl
<< " hierarchicalPFrames: " << pConfig->encodeCodecConfig.h264Config.hierarchicalPFrames << std::endl
<< " hierarchicalBFrames: " << pConfig->encodeCodecConfig.h264Config.hierarchicalBFrames << std::endl
<< " outputBufferingPeriodSEI: " << pConfig->encodeCodecConfig.h264Config.outputBufferingPeriodSEI << std::endl
<< " outputPictureTimingSEI: " << pConfig->encodeCodecConfig.h264Config.outputPictureTimingSEI << std::endl
<< " outputAUD: " << pConfig->encodeCodecConfig.h264Config.outputAUD << std::endl
<< " disableSPSPPS: " << pConfig->encodeCodecConfig.h264Config.disableSPSPPS << std::endl
<< " outputFramePackingSEI: " << pConfig->encodeCodecConfig.h264Config.outputFramePackingSEI << std::endl
<< " outputRecoveryPointSEI: " << pConfig->encodeCodecConfig.h264Config.outputRecoveryPointSEI << std::endl
<< " enableIntraRefresh: " << pConfig->encodeCodecConfig.h264Config.enableIntraRefresh << std::endl
<< " enableConstrainedEncoding: " << pConfig->encodeCodecConfig.h264Config.enableConstrainedEncoding << std::endl
<< " repeatSPSPPS: " << pConfig->encodeCodecConfig.h264Config.repeatSPSPPS << std::endl
<< " enableVFR: " << pConfig->encodeCodecConfig.h264Config.enableVFR << std::endl
<< " enableLTR: " << pConfig->encodeCodecConfig.h264Config.enableLTR << std::endl
<< " qpPrimeYZeroTransformBypassFlag: " << pConfig->encodeCodecConfig.h264Config.qpPrimeYZeroTransformBypassFlag << std::endl
<< " useConstrainedIntraPred: " << pConfig->encodeCodecConfig.h264Config.useConstrainedIntraPred << std::endl
<< " level: " << pConfig->encodeCodecConfig.h264Config.level << std::endl
<< " idrPeriod: " << pConfig->encodeCodecConfig.h264Config.idrPeriod << std::endl
<< " separateColourPlaneFlag: " << pConfig->encodeCodecConfig.h264Config.separateColourPlaneFlag << std::endl
<< " disableDeblockingFilterIDC: " << pConfig->encodeCodecConfig.h264Config.disableDeblockingFilterIDC << std::endl
<< " numTemporalLayers: " << pConfig->encodeCodecConfig.h264Config.numTemporalLayers << std::endl
<< " spsId: " << pConfig->encodeCodecConfig.h264Config.spsId << std::endl
<< " ppsId: " << pConfig->encodeCodecConfig.h264Config.ppsId << std::endl
<< " adaptiveTransformMode: " << pConfig->encodeCodecConfig.h264Config.adaptiveTransformMode << std::endl
<< " fmoMode: " << pConfig->encodeCodecConfig.h264Config.fmoMode << std::endl
<< " bdirectMode: " << pConfig->encodeCodecConfig.h264Config.bdirectMode << std::endl
<< " entropyCodingMode: " << pConfig->encodeCodecConfig.h264Config.entropyCodingMode << std::endl
<< " stereoMode: " << pConfig->encodeCodecConfig.h264Config.stereoMode << std::endl
<< " intraRefreshPeriod: " << pConfig->encodeCodecConfig.h264Config.intraRefreshPeriod << std::endl
<< " intraRefreshCnt: " << pConfig->encodeCodecConfig.h264Config.intraRefreshCnt << std::endl
<< " maxNumRefFrames: " << pConfig->encodeCodecConfig.h264Config.maxNumRefFrames << std::endl
<< " sliceMode: " << pConfig->encodeCodecConfig.h264Config.sliceMode << std::endl
<< " sliceModeData: " << pConfig->encodeCodecConfig.h264Config.sliceModeData << std::endl
<< " NV_ENC_CONFIG_H264_VUI_PARAMETERS:" << std::endl
<< " overscanInfoPresentFlag: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.overscanInfoPresentFlag << std::endl
<< " overscanInfo: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.overscanInfo << std::endl
<< " videoSignalTypePresentFlag: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.videoSignalTypePresentFlag << std::endl
<< " videoFormat: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.videoFormat << std::endl
<< " videoFullRangeFlag: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.videoFullRangeFlag << std::endl
<< " colourDescriptionPresentFlag: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.colourDescriptionPresentFlag << std::endl
<< " colourPrimaries: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.colourPrimaries << std::endl
<< " transferCharacteristics: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.transferCharacteristics << std::endl
<< " colourMatrix: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.colourMatrix << std::endl
<< " chromaSampleLocationFlag: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.chromaSampleLocationFlag << std::endl
<< " chromaSampleLocationTop: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.chromaSampleLocationTop << std::endl
<< " chromaSampleLocationBot: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.chromaSampleLocationBot << std::endl
<< " bitstreamRestrictionFlag: " << pConfig->encodeCodecConfig.h264Config.h264VUIParameters.bitstreamRestrictionFlag << std::endl
<< " ltrNumFrames: " << pConfig->encodeCodecConfig.h264Config.ltrNumFrames << std::endl
<< " ltrTrustMode: " << pConfig->encodeCodecConfig.h264Config.ltrTrustMode << std::endl
<< " chromaFormatIDC: " << pConfig->encodeCodecConfig.h264Config.chromaFormatIDC << std::endl
<< " maxTemporalLayers: " << pConfig->encodeCodecConfig.h264Config.maxTemporalLayers << std::endl;
} else if (pInitializeParams->encodeGUID == NV_ENC_CODEC_HEVC_GUID) {
os
<< "NV_ENC_CODEC_CONFIG (HEVC):" << std::endl
<< " level: " << pConfig->encodeCodecConfig.hevcConfig.level << std::endl
<< " tier: " << pConfig->encodeCodecConfig.hevcConfig.tier << std::endl
<< " minCUSize: " << pConfig->encodeCodecConfig.hevcConfig.minCUSize << std::endl
<< " maxCUSize: " << pConfig->encodeCodecConfig.hevcConfig.maxCUSize << std::endl
<< " useConstrainedIntraPred: " << pConfig->encodeCodecConfig.hevcConfig.useConstrainedIntraPred << std::endl
<< " disableDeblockAcrossSliceBoundary: " << pConfig->encodeCodecConfig.hevcConfig.disableDeblockAcrossSliceBoundary << std::endl
<< " outputBufferingPeriodSEI: " << pConfig->encodeCodecConfig.hevcConfig.outputBufferingPeriodSEI << std::endl
<< " outputPictureTimingSEI: " << pConfig->encodeCodecConfig.hevcConfig.outputPictureTimingSEI << std::endl
<< " outputAUD: " << pConfig->encodeCodecConfig.hevcConfig.outputAUD << std::endl
<< " enableLTR: " << pConfig->encodeCodecConfig.hevcConfig.enableLTR << std::endl
<< " disableSPSPPS: " << pConfig->encodeCodecConfig.hevcConfig.disableSPSPPS << std::endl
<< " repeatSPSPPS: " << pConfig->encodeCodecConfig.hevcConfig.repeatSPSPPS << std::endl
<< " enableIntraRefresh: " << pConfig->encodeCodecConfig.hevcConfig.enableIntraRefresh << std::endl
<< " chromaFormatIDC: " << pConfig->encodeCodecConfig.hevcConfig.chromaFormatIDC << std::endl
<< " pixelBitDepthMinus8: " << pConfig->encodeCodecConfig.hevcConfig.pixelBitDepthMinus8 << std::endl
<< " idrPeriod: " << pConfig->encodeCodecConfig.hevcConfig.idrPeriod << std::endl
<< " intraRefreshPeriod: " << pConfig->encodeCodecConfig.hevcConfig.intraRefreshPeriod << std::endl
<< " intraRefreshCnt: " << pConfig->encodeCodecConfig.hevcConfig.intraRefreshCnt << std::endl
<< " maxNumRefFramesInDPB: " << pConfig->encodeCodecConfig.hevcConfig.maxNumRefFramesInDPB << std::endl
<< " ltrNumFrames: " << pConfig->encodeCodecConfig.hevcConfig.ltrNumFrames << std::endl
<< " vpsId: " << pConfig->encodeCodecConfig.hevcConfig.vpsId << std::endl
<< " spsId: " << pConfig->encodeCodecConfig.hevcConfig.spsId << std::endl
<< " ppsId: " << pConfig->encodeCodecConfig.hevcConfig.ppsId << std::endl
<< " sliceMode: " << pConfig->encodeCodecConfig.hevcConfig.sliceMode << std::endl
<< " sliceModeData: " << pConfig->encodeCodecConfig.hevcConfig.sliceModeData << std::endl
<< " maxTemporalLayersMinus1: " << pConfig->encodeCodecConfig.hevcConfig.maxTemporalLayersMinus1 << std::endl
<< " NV_ENC_CONFIG_HEVC_VUI_PARAMETERS:" << std::endl
<< " overscanInfoPresentFlag: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.overscanInfoPresentFlag << std::endl
<< " overscanInfo: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.overscanInfo << std::endl
<< " videoSignalTypePresentFlag: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.videoSignalTypePresentFlag << std::endl
<< " videoFormat: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.videoFormat << std::endl
<< " videoFullRangeFlag: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.videoFullRangeFlag << std::endl
<< " colourDescriptionPresentFlag: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.colourDescriptionPresentFlag << std::endl
<< " colourPrimaries: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.colourPrimaries << std::endl
<< " transferCharacteristics: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.transferCharacteristics << std::endl
<< " colourMatrix: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.colourMatrix << std::endl
<< " chromaSampleLocationFlag: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.chromaSampleLocationFlag << std::endl
<< " chromaSampleLocationTop: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.chromaSampleLocationTop << std::endl
<< " chromaSampleLocationBot: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.chromaSampleLocationBot << std::endl
<< " bitstreamRestrictionFlag: " << pConfig->encodeCodecConfig.hevcConfig.hevcVUIParameters.bitstreamRestrictionFlag << std::endl
<< " ltrTrustMode: " << pConfig->encodeCodecConfig.hevcConfig.ltrTrustMode << std::endl;
}
return os.str();
}
};
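// --- Illustrative usage sketch (not part of the original sample) ---
// Parse a command-line style option string and apply it to an encoder
// configuration. In a real application pParams (and its encodeConfig) would
// come from NvEncoder::CreateDefaultEncoderParams().
inline void InitParamSketch(NV_ENC_INITIALIZE_PARAMS *pParams) {
NvEncoderInitParam initParam("-codec hevc -preset p5 -bitrate 4M -fps 60");
initParam.SetInitParams(pParams, NV_ENC_BUFFER_FORMAT_NV12);
}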

View File

@@ -0,0 +1,192 @@
/*
* Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <cuda_runtime.h>
#include "NvCodecUtils.h"
template<typename YuvUnitx2>
static __global__ void Resize(cudaTextureObject_t texY, cudaTextureObject_t texUv,
uint8_t *pDst, uint8_t *pDstUV, int nPitch, int nWidth, int nHeight,
float fxScale, float fyScale)
{
int ix = blockIdx.x * blockDim.x + threadIdx.x,
iy = blockIdx.y * blockDim.y + threadIdx.y;
if (ix >= nWidth / 2 || iy >= nHeight / 2) {
return;
}
int x = ix * 2, y = iy * 2;
typedef decltype(YuvUnitx2::x) YuvUnit;
const int MAX = (1 << (sizeof(YuvUnit) * 8)) - 1;
*(YuvUnitx2 *)(pDst + y * nPitch + x * sizeof(YuvUnit)) = YuvUnitx2 {
(YuvUnit)(tex2D<float>(texY, x / fxScale, y / fyScale) * MAX),
(YuvUnit)(tex2D<float>(texY, (x + 1) / fxScale, y / fyScale) * MAX)
};
y++;
*(YuvUnitx2 *)(pDst + y * nPitch + x * sizeof(YuvUnit)) = YuvUnitx2 {
(YuvUnit)(tex2D<float>(texY, x / fxScale, y / fyScale) * MAX),
(YuvUnit)(tex2D<float>(texY, (x + 1) / fxScale, y / fyScale) * MAX)
};
float2 uv = tex2D<float2>(texUv, ix / fxScale, (nHeight + iy) / fyScale + 0.5f);
*(YuvUnitx2 *)(pDstUV + iy * nPitch + ix * 2 * sizeof(YuvUnit)) = YuvUnitx2{ (YuvUnit)(uv.x * MAX), (YuvUnit)(uv.y * MAX) };
}
template <typename YuvUnitx2>
static void Resize(unsigned char *dpDst, unsigned char* dpDstUV, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight) {
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = dpSrc;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<decltype(YuvUnitx2::x)>();
resDesc.res.pitch2D.width = nSrcWidth;
resDesc.res.pitch2D.height = nSrcHeight;
resDesc.res.pitch2D.pitchInBytes = nSrcPitch;
cudaTextureDesc texDesc = {};
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeNormalizedFloat;
cudaTextureObject_t texY=0;
ck(cudaCreateTextureObject(&texY, &resDesc, &texDesc, NULL));
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<YuvUnitx2>();
resDesc.res.pitch2D.width = nSrcWidth / 2;
resDesc.res.pitch2D.height = nSrcHeight * 3 / 2;
cudaTextureObject_t texUv=0;
ck(cudaCreateTextureObject(&texUv, &resDesc, &texDesc, NULL));
Resize<YuvUnitx2> << <dim3((nDstWidth + 31) / 32, (nDstHeight + 31) / 32), dim3(16, 16) >> >(texY, texUv, dpDst, dpDstUV,
nDstPitch, nDstWidth, nDstHeight, 1.0f * nDstWidth / nSrcWidth, 1.0f * nDstHeight / nSrcHeight);
ck(cudaDestroyTextureObject(texY));
ck(cudaDestroyTextureObject(texUv));
}
void ResizeNv12(unsigned char *dpDstNv12, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcNv12, int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char* dpDstNv12UV)
{
unsigned char* dpDstUV = dpDstNv12UV ? dpDstNv12UV : dpDstNv12 + (nDstPitch*nDstHeight);
return Resize<uchar2>(dpDstNv12, dpDstUV, nDstPitch, nDstWidth, nDstHeight, dpSrcNv12, nSrcPitch, nSrcWidth, nSrcHeight);
}
void ResizeP016(unsigned char *dpDstP016, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcP016, int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char* dpDstP016UV)
{
unsigned char* dpDstUV = dpDstP016UV ? dpDstP016UV : dpDstP016 + (nDstPitch*nDstHeight);
return Resize<ushort2>(dpDstP016, dpDstUV, nDstPitch, nDstWidth, nDstHeight, dpSrcP016, nSrcPitch, nSrcWidth, nSrcHeight);
}
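// --- Illustrative usage sketch (not part of the original sample) ---
// Downscale an NV12 frame already resident in device memory; pitches are
// assumed equal to the widths for simplicity.
void ResizeNv12Sketch(unsigned char *dpSrc, int nSrcWidth, int nSrcHeight,
unsigned char *dpDst, int nDstWidth, int nDstHeight)
{
ResizeNv12(dpDst, nDstWidth, nDstWidth, nDstHeight, dpSrc, nSrcWidth, nSrcWidth, nSrcHeight);
}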
static __global__ void Scale(cudaTextureObject_t texSrc,
uint8_t *pDst, int nPitch, int nWidth, int nHeight,
float fxScale, float fyScale)
{
int x = blockIdx.x * blockDim.x + threadIdx.x,
y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= nWidth || y >= nHeight)
{
return;
}
*(unsigned char*)(pDst + (y * nPitch) + x) = (unsigned char)(fminf((tex2D<float>(texSrc, x * fxScale, y * fyScale)) * 255.0f, 255.0f));
}
static __global__ void Scale_uv(cudaTextureObject_t texSrc,
uint8_t *pDst, int nPitch, int nWidth, int nHeight,
float fxScale, float fyScale)
{
int x = blockIdx.x * blockDim.x + threadIdx.x,
y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= nWidth || y >= nHeight)
{
return;
}
float2 uv = tex2D<float2>(texSrc, x * fxScale, y * fyScale);
uchar2 uvOut = uchar2{ (unsigned char)(fminf(uv.x * 255.0f, 255.0f)), (unsigned char)(fminf(uv.y * 255.0f, 255.0f)) };
*(uchar2*)(pDst + (y * nPitch) + 2 * x) = uvOut;
}
void ScaleKernelLaunch(unsigned char *dpDst, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight, bool bUVPlane = false)
{
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = dpSrc;
resDesc.res.pitch2D.desc = bUVPlane ? cudaCreateChannelDesc<uchar2>() : cudaCreateChannelDesc<unsigned char>();
resDesc.res.pitch2D.width = nSrcWidth;
resDesc.res.pitch2D.height = nSrcHeight;
resDesc.res.pitch2D.pitchInBytes = nSrcPitch;
cudaTextureDesc texDesc = {};
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeNormalizedFloat;
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t texSrc = 0;
ck(cudaCreateTextureObject(&texSrc, &resDesc, &texDesc, NULL));
dim3 blockSize(16, 16, 1);
dim3 gridSize(((uint32_t)nDstWidth + blockSize.x - 1) / blockSize.x, ((uint32_t)nDstHeight + blockSize.y - 1) / blockSize.y, 1);
if (bUVPlane)
{
Scale_uv << <gridSize, blockSize >> >(texSrc, dpDst,
nDstPitch, nDstWidth, nDstHeight, 1.0f * nSrcWidth / nDstWidth, 1.0f * nSrcHeight / nDstHeight);
}
else
{
Scale << <gridSize, blockSize >> >(texSrc, dpDst,
nDstPitch, nDstWidth, nDstHeight, 1.0f * nSrcWidth / nDstWidth, 1.0f * nSrcHeight / nDstHeight);
}
ck(cudaGetLastError());
ck(cudaDestroyTextureObject(texSrc));
}
void ScaleYUV420(unsigned char *dpDstY,
unsigned char* dpDstU,
unsigned char* dpDstV,
int nDstPitch,
int nDstChromaPitch,
int nDstWidth,
int nDstHeight,
unsigned char *dpSrcY,
unsigned char* dpSrcU,
unsigned char* dpSrcV,
int nSrcPitch,
int nSrcChromaPitch,
int nSrcWidth,
int nSrcHeight,
bool bSemiplanar)
{
int chromaWidthDst = (nDstWidth + 1) / 2;
int chromaHeightDst = (nDstHeight + 1) / 2;
int chromaWidthSrc = (nSrcWidth + 1) / 2;
int chromaHeightSrc = (nSrcHeight + 1) / 2;
ScaleKernelLaunch(dpDstY, nDstPitch, nDstWidth, nDstHeight, dpSrcY, nSrcPitch, nSrcWidth, nSrcHeight);
if (bSemiplanar)
{
ScaleKernelLaunch(dpDstU, nDstChromaPitch, chromaWidthDst, chromaHeightDst, dpSrcU, nSrcChromaPitch, chromaWidthSrc, chromaHeightSrc, true);
}
else
{
ScaleKernelLaunch(dpDstU, nDstChromaPitch, chromaWidthDst, chromaHeightDst, dpSrcU, nSrcChromaPitch, chromaWidthSrc, chromaHeightSrc);
ScaleKernelLaunch(dpDstV, nDstChromaPitch, chromaWidthDst, chromaHeightDst, dpSrcV, nSrcChromaPitch, chromaWidthSrc, chromaHeightSrc);
}
}
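// --- Illustrative usage sketch (not part of the original sample) ---
// Scale a planar I420 frame with tightly packed planes (luma pitch = width,
// chroma pitch = (width + 1) / 2); all three planes live in one allocation.
void ScaleI420Sketch(unsigned char *dpSrc, int nSrcWidth, int nSrcHeight,
unsigned char *dpDst, int nDstWidth, int nDstHeight)
{
unsigned char *dpSrcU = dpSrc + nSrcWidth * nSrcHeight;
unsigned char *dpSrcV = dpSrcU + ((nSrcWidth + 1) / 2) * ((nSrcHeight + 1) / 2);
unsigned char *dpDstU = dpDst + nDstWidth * nDstHeight;
unsigned char *dpDstV = dpDstU + ((nDstWidth + 1) / 2) * ((nDstHeight + 1) / 2);
ScaleYUV420(dpDst, dpDstU, dpDstV, nDstWidth, (nDstWidth + 1) / 2,
nDstWidth, nDstHeight, dpSrc, dpSrcU, dpSrcV, nSrcWidth,
(nSrcWidth + 1) / 2, nSrcWidth, nSrcHeight, false);
}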

126
thirdparty/nvcodec/Samples/Utils/crc.cu vendored Normal file
View File

@@ -0,0 +1,126 @@
/*
* Copyright 2018-2020 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <cuda_runtime.h>
#include "NvCodecUtils.h"
/*
* CRC32 lookup table
* Generated by the following routine
* int i, j;
* U032 crc;
* for (i = 0; i < 256; i++)
* {
* crc = i;
* for (j = 0; j < 8; j++) { // 8 reduction
* crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320L : 0);
* }
* Crc32Table[i] = crc;
* }
*/
__device__ __constant__ uint32_t Crc32Table[256] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};
typedef struct _NV_ENC_ENCODE_OUT_PARAMS
{
uint32_t version; /**< [out]: Struct version. */
uint32_t bitstreamSizeInBytes; /**< [out]: Encoded bitstream size in bytes */
uint32_t cycleCount; /**< [out]: Cycle count */
uint32_t firstPassCycleCount; /**< [out]: First pass cycle count */
uint32_t reserved[60]; /**< [out]: Reserved and must be set to 0 */
} NV_ENC_ENCODE_OUT_PARAMS;
static __global__ void ComputeCRCKernel(uint8_t *pBuffer, uint32_t *crcValue)
{
NV_ENC_ENCODE_OUT_PARAMS *outParams = (NV_ENC_ENCODE_OUT_PARAMS *)pBuffer;
uint32_t bitstreamSize = outParams->bitstreamSizeInBytes;
uint8_t *pEncStream = pBuffer + sizeof(NV_ENC_ENCODE_OUT_PARAMS);
uint32_t crc=~0;
for(uint32_t i = 0; i < bitstreamSize; i++)
{
crc = (crc >> 8) ^ Crc32Table[((uint8_t)(crc)) ^ (*pEncStream++)];
}
*crcValue = ~crc;
}
void ComputeCRC(uint8_t *pBuffer, uint32_t *crcValue, cudaStream_t outputCUStream)
{
dim3 blockSize(1, 1, 1);
dim3 gridSize(1, 1, 1);
ComputeCRCKernel <<<gridSize, blockSize, 0, outputCUStream >>>(pBuffer, crcValue);
}
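// --- Illustrative usage sketch (not part of the original sample) ---
// Compute the CRC of an NVENC output buffer in device memory (headed by
// NV_ENC_ENCODE_OUT_PARAMS) and copy the 32-bit result back to the host.
void ComputeCRCSketch(uint8_t *dpEncOutput, cudaStream_t stream, uint32_t *pHostCrc)
{
uint32_t *dpCrc = NULL;
ck(cudaMalloc(&dpCrc, sizeof(uint32_t)));
ComputeCRC(dpEncOutput, dpCrc, stream);
ck(cudaMemcpyAsync(pHostCrc, dpCrc, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
ck(cudaStreamSynchronize(stream));
ck(cudaFree(dpCrc));
}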