From 4624d4f27f63691cf96ab7b7c22cbdff6841fa22 Mon Sep 17 00:00:00 2001 From: dijunkun Date: Tue, 13 Aug 2024 16:10:00 +0800 Subject: [PATCH] [fix] fix load NvCodec API failed on Windows --- src/media/nvcodec/nvcodec_api.cpp | 313 ++++ src/media/nvcodec/nvcodec_api.h | 118 ++ src/pc/peer_connection.cpp | 7 +- thirdparty/nvcodec/Interface/cuda.h | 1490 +++----------------- thirdparty/nvcodec/Interface/nvcodec_api.h | 367 ----- xmake.lua | 10 +- 6 files changed, 633 insertions(+), 1672 deletions(-) create mode 100644 src/media/nvcodec/nvcodec_api.cpp create mode 100644 src/media/nvcodec/nvcodec_api.h delete mode 100644 thirdparty/nvcodec/Interface/nvcodec_api.h diff --git a/src/media/nvcodec/nvcodec_api.cpp b/src/media/nvcodec/nvcodec_api.cpp new file mode 100644 index 0000000..3c3481c --- /dev/null +++ b/src/media/nvcodec/nvcodec_api.cpp @@ -0,0 +1,313 @@ +#include "nvcodec_api.h" + +#include "log.h" + +TcuInit cuInit_ld = NULL; +TcuDeviceGet cuDeviceGet_ld = NULL; +TcuDeviceGetCount cuDeviceGetCount_ld = NULL; +TcuCtxCreate cuCtxCreate_ld = NULL; +TcuGetErrorName cuGetErrorName_ld = NULL; +TcuCtxPushCurrent cuCtxPushCurrent_ld = NULL; +TcuCtxPopCurrent cuCtxPopCurrent_ld = NULL; +TcuMemAlloc cuMemAlloc_ld = NULL; +TcuMemAllocPitch cuMemAllocPitch_ld = NULL; +TcuMemFree cuMemFree_ld = NULL; +TcuMemcpy2DAsync cuMemcpy2DAsync_ld = NULL; +TcuStreamSynchronize cuStreamSynchronize_ld = NULL; +TcuMemcpy2D cuMemcpy2D_ld = NULL; +TcuMemcpy2DUnaligned cuMemcpy2DUnaligned_ld = NULL; + +TcuvidCtxLockCreate cuvidCtxLockCreate_ld = NULL; +TcuvidGetDecoderCaps cuvidGetDecoderCaps_ld = NULL; +TcuvidCreateDecoder cuvidCreateDecoder_ld = NULL; +TcuvidDestroyDecoder cuvidDestroyDecoder_ld = NULL; +TcuvidDecodePicture cuvidDecodePicture_ld = NULL; +TcuvidGetDecodeStatus cuvidGetDecodeStatus_ld = NULL; +TcuvidReconfigureDecoder cuvidReconfigureDecoder_ld = NULL; +TcuvidMapVideoFrame64 cuvidMapVideoFrame64_ld = NULL; +TcuvidUnmapVideoFrame64 cuvidUnmapVideoFrame64_ld = NULL; +TcuvidCtxLockDestroy cuvidCtxLockDestroy_ld = NULL; +TcuvidCreateVideoParser cuvidCreateVideoParser_ld = NULL; +TcuvidParseVideoData cuvidParseVideoData_ld = NULL; +TcuvidDestroyVideoParser cuvidDestroyVideoParser_ld = NULL; + +TNvEncodeAPICreateInstance NvEncodeAPICreateInstance_ld = NULL; +TNvEncodeAPIGetMaxSupportedVersion NvEncodeAPIGetMaxSupportedVersion_ld = NULL; + +static HMODULE nvcuda_dll = NULL; +static HMODULE nvcuvid_dll = NULL; +static HMODULE nvEncodeAPI64_dll = NULL; + +int LoadNvCodecDll() { + // Load library + nvcuda_dll = LoadLibrary(TEXT("nvcuda.dll")); + if (nvcuda_dll == NULL) { + LOG_ERROR("Unable to load nvcuda.dll"); + return -1; + } + + cuInit_ld = (TcuInit)GetProcAddress(nvcuda_dll, "cuInit"); + if (cuInit_ld == NULL) { + LOG_ERROR("Unable to find function cuInit()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuDeviceGet_ld = (TcuDeviceGet)GetProcAddress(nvcuda_dll, "cuDeviceGet"); + if (cuDeviceGet_ld == NULL) { + LOG_ERROR("Unable to find function cuDeviceGet()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuDeviceGetCount_ld = + (TcuDeviceGetCount)GetProcAddress(nvcuda_dll, "cuDeviceGetCount"); + if (cuDeviceGetCount_ld == NULL) { + LOG_ERROR("Unable to find function cuDeviceGetCount()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuCtxCreate_ld = (TcuCtxCreate)GetProcAddress(nvcuda_dll, "cuCtxCreate_v2"); + if (cuCtxCreate_ld == NULL) { + LOG_ERROR("Unable to find function cuCtxCreate()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuGetErrorName_ld = + 
(TcuGetErrorName)GetProcAddress(nvcuda_dll, "cuGetErrorName"); + if (cuGetErrorName_ld == NULL) { + LOG_ERROR("Unable to find function cuGetErrorName()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuCtxPushCurrent_ld = + (TcuCtxPushCurrent)GetProcAddress(nvcuda_dll, "cuCtxPushCurrent_v2"); + if (cuCtxPushCurrent_ld == NULL) { + LOG_ERROR("Unable to find function cuCtxPushCurrent()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuCtxPopCurrent_ld = + (TcuCtxPopCurrent)GetProcAddress(nvcuda_dll, "cuCtxPopCurrent_v2"); + if (cuCtxPopCurrent_ld == NULL) { + LOG_ERROR("Unable to find function cuCtxPopCurrent()"); + FreeLibrary(nvcuda_dll); + return -1; + } + cuMemAlloc_ld = (TcuMemAlloc)GetProcAddress(nvcuda_dll, "cuMemAlloc_v2"); + if (cuMemAlloc_ld == NULL) { + LOG_ERROR("Unable to find function cuMemAlloc()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuMemAllocPitch_ld = + (TcuMemAllocPitch)GetProcAddress(nvcuda_dll, "cuMemAllocPitch_v2"); + if (cuMemAllocPitch_ld == NULL) { + LOG_ERROR("Unable to find function cuMemAllocPitch()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuMemFree_ld = (TcuMemFree)GetProcAddress(nvcuda_dll, "cuMemFree_v2"); + if (cuMemFree_ld == NULL) { + LOG_ERROR("Unable to find function cuMemFree()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuMemcpy2DAsync_ld = + (TcuMemcpy2DAsync)GetProcAddress(nvcuda_dll, "cuMemcpy2DAsync_v2"); + if (cuMemcpy2DAsync_ld == NULL) { + LOG_ERROR("Unable to find function cuMemcpy2DAsync()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuStreamSynchronize_ld = + (TcuStreamSynchronize)GetProcAddress(nvcuda_dll, "cuStreamSynchronize"); + if (cuStreamSynchronize_ld == NULL) { + LOG_ERROR("Unable to find function cuStreamSynchronize()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuMemcpy2D_ld = (TcuMemcpy2D)GetProcAddress(nvcuda_dll, "cuMemcpy2D_v2"); + if (cuMemcpy2D_ld == NULL) { + LOG_ERROR("Unable to find function cuMemcpy2D()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + cuMemcpy2DUnaligned_ld = (TcuMemcpy2DUnaligned)GetProcAddress( + nvcuda_dll, "cuMemcpy2DUnaligned_v2"); + if (cuMemcpy2DUnaligned_ld == NULL) { + LOG_ERROR("Unable to find function cuMemcpy2DUnaligned()"); + FreeLibrary(nvcuda_dll); + return -1; + } + + // + nvcuvid_dll = LoadLibrary(TEXT("nvcuvid.dll")); + if (nvcuvid_dll == NULL) { + LOG_ERROR("Unable to load nvcuvid.dll"); + return -1; + } + + cuvidCtxLockCreate_ld = + (TcuvidCtxLockCreate)GetProcAddress(nvcuvid_dll, "cuvidCtxLockCreate"); + if (cuvidCtxLockCreate_ld == NULL) { + LOG_ERROR("Unable to find function cuvidCtxLockCreate()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidGetDecoderCaps_ld = + (TcuvidGetDecoderCaps)GetProcAddress(nvcuvid_dll, "cuvidGetDecoderCaps"); + if (cuvidGetDecoderCaps_ld == NULL) { + LOG_ERROR("Unable to find function cuvidGetDecoderCaps()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidCreateDecoder_ld = + (TcuvidCreateDecoder)GetProcAddress(nvcuvid_dll, "cuvidCreateDecoder"); + if (cuvidCreateDecoder_ld == NULL) { + LOG_ERROR("Unable to find function cuvidCreateDecoder()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidDestroyDecoder_ld = + (TcuvidDestroyDecoder)GetProcAddress(nvcuvid_dll, "cuvidDestroyDecoder"); + if (cuvidDestroyDecoder_ld == NULL) { + LOG_ERROR("Unable to find function cuvidDestroyDecoder()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidDecodePicture_ld = + (TcuvidDecodePicture)GetProcAddress(nvcuvid_dll, "cuvidDecodePicture"); + if (cuvidDecodePicture_ld == NULL) { + 
LOG_ERROR("Unable to find function cuvidDecodePicture()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidGetDecodeStatus_ld = (TcuvidGetDecodeStatus)GetProcAddress( + nvcuvid_dll, "cuvidGetDecodeStatus"); + if (cuvidGetDecodeStatus_ld == NULL) { + LOG_ERROR("Unable to find function cuvidGetDecodeStatus()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidReconfigureDecoder_ld = (TcuvidReconfigureDecoder)GetProcAddress( + nvcuvid_dll, "cuvidReconfigureDecoder"); + if (cuvidReconfigureDecoder_ld == NULL) { + LOG_ERROR("Unable to find function cuvidReconfigureDecoder()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidMapVideoFrame64_ld = (TcuvidMapVideoFrame64)GetProcAddress( + nvcuvid_dll, "cuvidMapVideoFrame64"); + if (cuvidMapVideoFrame64_ld == NULL) { + LOG_ERROR("Unable to find function cuvidMapVideoFrame64()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidUnmapVideoFrame64_ld = (TcuvidUnmapVideoFrame64)GetProcAddress( + nvcuvid_dll, "cuvidUnmapVideoFrame64"); + if (cuvidUnmapVideoFrame64_ld == NULL) { + LOG_ERROR("Unable to find function cuvidUnmapVideoFrame64()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidCtxLockDestroy_ld = + (TcuvidCtxLockDestroy)GetProcAddress(nvcuvid_dll, "cuvidCtxLockDestroy"); + if (cuvidCtxLockDestroy_ld == NULL) { + LOG_ERROR("Unable to find function cuvidCtxLockDestroy()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidCreateVideoParser_ld = (TcuvidCreateVideoParser)GetProcAddress( + nvcuvid_dll, "cuvidCreateVideoParser"); + if (cuvidCreateVideoParser_ld == NULL) { + LOG_ERROR("Unable to find function cuvidCreateVideoParser()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidParseVideoData_ld = + (TcuvidParseVideoData)GetProcAddress(nvcuvid_dll, "cuvidParseVideoData"); + if (cuvidParseVideoData_ld == NULL) { + LOG_ERROR("Unable to find function cuvidParseVideoData()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + cuvidDestroyVideoParser_ld = (TcuvidDestroyVideoParser)GetProcAddress( + nvcuvid_dll, "cuvidDestroyVideoParser"); + if (cuvidDestroyVideoParser_ld == NULL) { + LOG_ERROR("Unable to find function cuvidDestroyVideoParser()"); + FreeLibrary(nvcuvid_dll); + return -1; + } + + // + nvEncodeAPI64_dll = LoadLibrary(TEXT("nvEncodeAPI64.dll")); + if (nvEncodeAPI64_dll == NULL) { + LOG_ERROR("Unable to load nvEncodeAPI64.dll"); + return -1; + } + + NvEncodeAPICreateInstance_ld = (TNvEncodeAPICreateInstance)GetProcAddress( + nvEncodeAPI64_dll, "NvEncodeAPICreateInstance"); + if (NvEncodeAPICreateInstance_ld == NULL) { + LOG_ERROR("Unable to find function NvEncodeAPICreateInstance()"); + FreeLibrary(nvEncodeAPI64_dll); + return -1; + } + + NvEncodeAPIGetMaxSupportedVersion_ld = + (TNvEncodeAPIGetMaxSupportedVersion)GetProcAddress( + nvEncodeAPI64_dll, "NvEncodeAPIGetMaxSupportedVersion"); + if (NvEncodeAPIGetMaxSupportedVersion_ld == NULL) { + LOG_ERROR("Unable to find function NvEncodeAPIGetMaxSupportedVersion()"); + FreeLibrary(nvEncodeAPI64_dll); + return -1; + } + + LOG_INFO("Load NvCodec API success"); + + return 0; +} + +int ReleaseNvCodecDll() { + if (nvcuda_dll != NULL) { + FreeLibrary(nvcuda_dll); + nvcuda_dll = NULL; + } + + if (nvcuvid_dll != NULL) { + FreeLibrary(nvcuvid_dll); + nvcuvid_dll = NULL; + } + + if (nvEncodeAPI64_dll != NULL) { + FreeLibrary(nvEncodeAPI64_dll); + nvEncodeAPI64_dll = NULL; + } + + LOG_INFO("Release NvCodec API success"); + + return 0; +} \ No newline at end of file diff --git a/src/media/nvcodec/nvcodec_api.h b/src/media/nvcodec/nvcodec_api.h new file mode 100644 
index 0000000..5506235 --- /dev/null +++ b/src/media/nvcodec/nvcodec_api.h @@ -0,0 +1,118 @@ +/* + * @Author: DI JUNKUN + * @Date: 2024-08-12 + * Copyright (c) 2024 by DI JUNKUN, All Rights Reserved. + */ + +#ifndef _NVCODEC_API_H_ +#define _NVCODEC_API_H_ + +#include + +#include + +#include "cuda.h" +#include "cuviddec.h" +#include "nvEncodeAPI.h" +#include "nvcuvid.h" + +// nvcuda.dll +typedef CUresult (*TcuInit)(unsigned int Flags); + +typedef CUresult (*TcuDeviceGet)(CUdevice *device, int ordinal); + +typedef CUresult (*TcuDeviceGetCount)(int *count); + +typedef CUresult (*TcuCtxCreate)(CUcontext *pctx, unsigned int flags, + CUdevice dev); + +typedef CUresult (*TcuGetErrorName)(CUresult error, const char **pStr); + +typedef CUresult (*TcuCtxPushCurrent)(CUcontext ctx); + +typedef CUresult (*TcuCtxPopCurrent)(CUcontext *pctx); + +typedef CUresult (*TcuMemAlloc)(CUdeviceptr *dptr, size_t bytesize); + +typedef CUresult (*TcuMemAllocPitch)(CUdeviceptr *dptr, size_t *pPitch, + size_t WidthInBytes, size_t Height, + unsigned int ElementSizeBytes); + +typedef CUresult (*TcuMemFree)(CUdeviceptr dptr); + +typedef CUresult (*TcuMemcpy2DAsync)(const CUDA_MEMCPY2D *pCopy, + CUstream hStream); + +typedef CUresult (*TcuStreamSynchronize)(CUstream hStream); + +typedef CUresult (*TcuMemcpy2D)(const CUDA_MEMCPY2D *pCopy); + +typedef CUresult (*TcuMemcpy2DUnaligned)(const CUDA_MEMCPY2D *pCopy); + +extern TcuInit cuInit_ld; +extern TcuDeviceGet cuDeviceGet_ld; +extern TcuDeviceGetCount cuDeviceGetCount_ld; +extern TcuCtxCreate cuCtxCreate_ld; +extern TcuGetErrorName cuGetErrorName_ld; +extern TcuCtxPushCurrent cuCtxPushCurrent_ld; +extern TcuCtxPopCurrent cuCtxPopCurrent_ld; +extern TcuMemAlloc cuMemAlloc_ld; +extern TcuMemAllocPitch cuMemAllocPitch_ld; +extern TcuMemFree cuMemFree_ld; +extern TcuMemcpy2DAsync cuMemcpy2DAsync_ld; +extern TcuStreamSynchronize cuStreamSynchronize_ld; +extern TcuMemcpy2D cuMemcpy2D_ld; +extern TcuMemcpy2DUnaligned cuMemcpy2DUnaligned_ld; + +// nvcuvid.dll +typedef CUresult (*TcuvidCtxLockCreate)(CUvideoctxlock *pLock, CUcontext ctx); +typedef CUresult (*TcuvidGetDecoderCaps)(CUVIDDECODECAPS *pdc); +typedef CUresult (*TcuvidCreateDecoder)(CUvideodecoder *phDecoder, + CUVIDDECODECREATEINFO *pdci); +typedef CUresult (*TcuvidDestroyDecoder)(CUvideodecoder hDecoder); +typedef CUresult (*TcuvidDecodePicture)(CUvideodecoder hDecoder, + CUVIDPICPARAMS *pPicParams); +typedef CUresult (*TcuvidGetDecodeStatus)(CUvideodecoder hDecoder, int nPicIdx, + CUVIDGETDECODESTATUS *pDecodeStatus); +typedef CUresult (*TcuvidReconfigureDecoder)( + CUvideodecoder hDecoder, CUVIDRECONFIGUREDECODERINFO *pDecReconfigParams); +typedef CUresult (*TcuvidMapVideoFrame64)(CUvideodecoder hDecoder, int nPicIdx, + unsigned long long *pDevPtr, + unsigned int *pPitch, + CUVIDPROCPARAMS *pVPP); +typedef CUresult (*TcuvidUnmapVideoFrame64)(CUvideodecoder hDecoder, + unsigned long long DevPtr); +typedef CUresult (*TcuvidCtxLockDestroy)(CUvideoctxlock lck); +typedef CUresult (*TcuvidCreateVideoParser)(CUvideoparser *pObj, + CUVIDPARSERPARAMS *pParams); +typedef CUresult (*TcuvidParseVideoData)(CUvideoparser obj, + CUVIDSOURCEDATAPACKET *pPacket); +typedef CUresult (*TcuvidDestroyVideoParser)(CUvideoparser obj); + +extern TcuvidCtxLockCreate cuvidCtxLockCreate_ld; +extern TcuvidGetDecoderCaps cuvidGetDecoderCaps_ld; +extern TcuvidCreateDecoder cuvidCreateDecoder_ld; +extern TcuvidDestroyDecoder cuvidDestroyDecoder_ld; +extern TcuvidDecodePicture cuvidDecodePicture_ld; +extern TcuvidGetDecodeStatus 
cuvidGetDecodeStatus_ld; +extern TcuvidReconfigureDecoder cuvidReconfigureDecoder_ld; +extern TcuvidMapVideoFrame64 cuvidMapVideoFrame64_ld; +extern TcuvidUnmapVideoFrame64 cuvidUnmapVideoFrame64_ld; +extern TcuvidCtxLockDestroy cuvidCtxLockDestroy_ld; +extern TcuvidCreateVideoParser cuvidCreateVideoParser_ld; +extern TcuvidParseVideoData cuvidParseVideoData_ld; +extern TcuvidDestroyVideoParser cuvidDestroyVideoParser_ld; + +// nvEncodeAPI64.dll +typedef NVENCSTATUS (*TNvEncodeAPICreateInstance)( + NV_ENCODE_API_FUNCTION_LIST *functionList); +typedef NVENCSTATUS (*TNvEncodeAPIGetMaxSupportedVersion)(uint32_t *version); + +extern TNvEncodeAPICreateInstance NvEncodeAPICreateInstance_ld; +extern TNvEncodeAPIGetMaxSupportedVersion NvEncodeAPIGetMaxSupportedVersion_ld; + +int LoadNvCodecDll(); + +int ReleaseNvCodecDll(); + +#endif \ No newline at end of file diff --git a/src/pc/peer_connection.cpp b/src/pc/peer_connection.cpp index fbc3a21..ca551c9 100644 --- a/src/pc/peer_connection.cpp +++ b/src/pc/peer_connection.cpp @@ -228,7 +228,6 @@ int PeerConnection::CreateVideoCodec(bool hardware_acceleration) { "MacOS not support hardware acceleration, use default software codec"); } #else - InitNvCodecApi(); #endif if (av1_encoding_) { @@ -237,7 +236,7 @@ int PeerConnection::CreateVideoCodec(bool hardware_acceleration) { LOG_WARN("Only support software codec for AV1"); } else { if (hardware_acceleration_) { - if (0 == InitNvCodecApi()) { + if (0 == LoadNvCodecDll()) { video_encoder_ = VideoEncoderFactory::CreateVideoEncoder(true, false); video_decoder_ = VideoDecoderFactory::CreateVideoDecoder(true, false); } else { @@ -614,6 +613,10 @@ int PeerConnection::Destroy() { delete nv12_data_; nv12_data_ = nullptr; } + + if (hardware_acceleration_) { + ReleaseNvCodecDll(); + } return 0; } diff --git a/thirdparty/nvcodec/Interface/cuda.h b/thirdparty/nvcodec/Interface/cuda.h index 00e2aec..4cf4058 100644 --- a/thirdparty/nvcodec/Interface/cuda.h +++ b/thirdparty/nvcodec/Interface/cuda.h @@ -179,7 +179,6 @@ typedef uint64_t cuuint64_t; #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) #define cuEventRecordWithFlags __CUDA_API_PTSZ(cuEventRecordWithFlags) #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) - #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) @@ -190,12 +189,6 @@ typedef uint64_t cuuint64_t; #define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64) #define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp) -#define cuStreamWriteValue32_v2 __CUDA_API_PTSZ(cuStreamWriteValue32_v2) -#define cuStreamWaitValue32_v2 __CUDA_API_PTSZ(cuStreamWaitValue32_v2) -#define cuStreamWriteValue64_v2 __CUDA_API_PTSZ(cuStreamWriteValue64_v2) -#define cuStreamWaitValue64_v2 __CUDA_API_PTSZ(cuStreamWaitValue64_v2) -#define cuStreamBatchMemOp_v2 __CUDA_API_PTSZ(cuStreamBatchMemOp_v2) - #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) #define cuSignalExternalSemaphoresAsync \ @@ -223,11 +216,9 @@ typedef uint64_t cuuint64_t; * \brief Header file for the OpenGL interoperability functions of the * low-level CUDA driver application programming interface. * - * \file cudaD3D9.h * \brief Header file for the Direct3D 9 interoperability functions of the * low-level CUDA driver application programming interface. 
- */ /** @@ -238,7 +229,7 @@ typedef uint64_t cuuint64_t; /** * CUDA API version number */ -#define CUDA_VERSION 11070 +#define CUDA_VERSION 11040 #ifdef __cplusplus extern "C" { @@ -447,7 +438,7 @@ typedef enum CUstreamWaitValue_flags_enum { by the second write, and downstream work needs to observe the first write. Support for this operation is restricted to selected platforms and can be queried with - ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.*/ + ::CU_DEVICE_ATTRIBUTE_CAN_USE_WAIT_VALUE_FLUSH.*/ } CUstreamWaitValue_flags; /** @@ -461,8 +452,7 @@ typedef enum CUstreamWriteValue_flags_enum { ::cuStreamWriteValue32 will provide a memory fence before the write, which has similar semantics to __threadfence_system() but is scoped to the stream - rather than a CUDA thread. - This flag is not supported in the v2 API. */ + rather than a CUDA thread. */ } CUstreamWriteValue_flags; /** @@ -477,24 +467,11 @@ typedef enum CUstreamBatchMemOpType_enum { 4, /**< Represents a ::cuStreamWaitValue64 operation */ CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, /**< Represents a ::cuStreamWriteValue64 operation */ - - CU_STREAM_MEM_OP_BARRIER = - 6, /**< Insert a memory barrier of the specified type */ - CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a standalone operation. */ } CUstreamBatchMemOpType; -/** - * Flags for ::cuStreamMemoryBarrier - */ -typedef enum CUstreamMemoryBarrier_flags_enum { - CU_STREAM_MEMORY_BARRIER_TYPE_SYS = 0x0, /**< System-wide memory barrier. */ - CU_STREAM_MEMORY_BARRIER_TYPE_GPU = - 0x1 /**< Limit memory barrier scope to the GPU. */ -} CUstreamMemoryBarrier_flags; - /** * Per-operation parameters for ::cuStreamBatchMemOp */ @@ -526,24 +503,10 @@ typedef union CUstreamBatchMemOpParams_union { CUstreamBatchMemOpType operation; unsigned int flags; } flushRemoteWrites; - - struct CUstreamMemOpMemoryBarrierParams_st { /**< Only supported in the _v2 - API */ - CUstreamBatchMemOpType operation; - unsigned int flags; - } memoryBarrier; - cuuint64_t pad[6]; } CUstreamBatchMemOpParams_v1; typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams; -typedef struct CUDA_BATCH_MEM_OP_NODE_PARAMS_st { - CUcontext ctx; - unsigned int count; - CUstreamBatchMemOpParams *paramArray; - unsigned int flags; -} CUDA_BATCH_MEM_OP_NODE_PARAMS; - /** * Occupancy calculator flag */ @@ -576,65 +539,7 @@ typedef enum CUarray_format_enum { CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */ - CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */ - CU_AD_FORMAT_UNORM_INT8X1 = - 0xc0, /**< 1 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT8X2 = - 0xc1, /**< 2 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT8X4 = - 0xc2, /**< 4 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT16X1 = - 0xc3, /**< 1 channel unsigned 16-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT16X2 = - 0xc4, /**< 2 channel unsigned 16-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT16X4 = - 0xc5, /**< 4 channel unsigned 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X1 = - 0xc6, /**< 1 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X2 = - 0xc7, /**< 2 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X4 = - 0xc8, /**< 4 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X1 = - 0xc9, /**< 1 channel 
signed 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X2 = - 0xca, /**< 2 channel signed 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X4 = - 0xcb, /**< 4 channel signed 16-bit normalized integer */ - CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized - block-compressed (BC1 compression) format */ - CU_AD_FORMAT_BC1_UNORM_SRGB = - 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 - compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized - block-compressed (BC2 compression) format */ - CU_AD_FORMAT_BC2_UNORM_SRGB = - 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 - compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized - block-compressed (BC3 compression) format */ - CU_AD_FORMAT_BC3_UNORM_SRGB = - 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 - compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized - block-compressed (BC4 compression) format */ - CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized - block-compressed (BC4 compression) format */ - CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized - block-compressed (BC5 compression) format */ - CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized - block-compressed (BC5 compression) format */ - CU_AD_FORMAT_BC6H_UF16 = - 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H - compression) format */ - CU_AD_FORMAT_BC6H_SF16 = - 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) - format */ - CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized - block-compressed (BC7 compression) format */ - CU_AD_FORMAT_BC7_UNORM_SRGB = - 0x9e /**< 4 channel unsigned normalized block-compressed (BC7 compression) - format with sRGB encoding */ + CU_AD_FORMAT_NV12 = 0xb0 } CUarray_format; /** @@ -895,7 +800,7 @@ typedef enum CUdevice_attribute_enum { ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with - ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ + ::cuMemExportToShareableHandle, if requested ::cuMemCreate */ CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, /**< Maximum number of blocks per multiprocessor */ CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = @@ -914,7 +819,7 @@ typedef enum CUdevice_attribute_enum { arrays */ CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, /**< Device supports using the ::cuMemHostRegister flag - ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be + CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */ CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, /**< External timeline semaphore interop is supported on the device @@ -937,20 +842,6 @@ typedef enum CUdevice_attribute_enum { values returned here. */ CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, /**< Handle types supported with mempool based IPC */ - - CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = - 121, /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped - arrays */ - - CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2 = - 122, /**< 64-bit operations are supported in ::cuStreamBatchMemOp_v2 and - related v2 MemOp APIs. 
*/ - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2 = - 123, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported by v2 MemOp APIs. */ - - CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = - 124, /**< Device supports buffer sharing with dma_buf mechanism. */ - CU_DEVICE_ATTRIBUTE_MAX } CUdevice_attribute; @@ -1020,17 +911,6 @@ typedef enum CUpointer_attribute_enum { CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17 /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. **/ - - , - CU_POINTER_ATTRIBUTE_MAPPING_SIZE = - 18, /**< Size of the actual underlying mapping that the pointer belongs to - **/ - CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = - 19, /**< The start address of the mapping that the pointer belongs to **/ - CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = - 20 /**< A process-wide unique id corresponding to the physical allocation - the pointer belongs to **/ - } CUpointer_attribute; /** @@ -1340,14 +1220,10 @@ typedef enum CUjit_option_enum { CU_JIT_CACHE_MODE, /** - * \deprecated - * This jit option is deprecated and should not be used. + * The below jit options are used for internal purposes only, in this version + * of CUDA */ CU_JIT_NEW_SM3X_OPT, - - /** - * This jit option is used for internal purpose only. - */ CU_JIT_FAST_COMPILE, /** @@ -1382,9 +1258,9 @@ typedef enum CUjit_option_enum { CU_JIT_GLOBAL_SYMBOL_COUNT, /** - * Enable link-time optimization (-dlto) for device code (Disabled by - * default).\n This option is not supported on 32-bit platforms.\n Option - * type: int\n Applies to: compiler and linker + * Enable link-time optimization (-dlto) for device code (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker */ CU_JIT_LTO, @@ -1426,64 +1302,6 @@ typedef enum CUjit_option_enum { */ CU_JIT_FMA, - /** - * Array of kernel names that should be preserved at link time while others - * can be removed.\n - * Must contain ::CU_JIT_REFERENCED_KERNEL_COUNT entries.\n - * Note that kernel names can be mangled by the compiler in which case the - * mangled name needs to be specified.\n - * Wildcard "*" can be used to represent zero or more characters instead of - * specifying the full or mangled name.\n - * It is important to note that the wildcard "*" is also added implicitly. - * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" - * and thus preserve all kernels with those names. This can be avoided by - * providing a more specific name like "barfoobaz".\n Option type: const char - * **\n Applies to: dynamic linker only - */ - CU_JIT_REFERENCED_KERNEL_NAMES, - - /** - * Number of entries in ::CU_JIT_REFERENCED_KERNEL_NAMES array.\n - * Option type: unsigned int\n - * Applies to: dynamic linker only - */ - CU_JIT_REFERENCED_KERNEL_COUNT, - - /** - * Array of variable names (__device__ and/or __constant__) that should be - * preserved at link time while others can be removed.\n - * Must contain ::CU_JIT_REFERENCED_VARIABLE_COUNT entries.\n - * Note that variable names can be mangled by the compiler in which case the - * mangled name needs to be specified.\n - * Wildcard "*" can be used to represent zero or more characters instead of - * specifying the full or mangled name.\n - * It is important to note that the wildcard "*" is also added implicitly. - * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" - * and thus preserve all variables with those names. 
This can be avoided by - * providing a more specific name like "barfoobaz".\n Option type: const char - * **\n Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_REFERENCED_VARIABLE_NAMES, - - /** - * Number of entries in ::CU_JIT_REFERENCED_VARIABLE_NAMES array.\n - * Option type: unsigned int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_REFERENCED_VARIABLE_COUNT, - - /** - * This option serves as a hint to enable the JIT compiler/linker - * to remove constant (__constant__) and device (__device__) variables - * unreferenced in device code (Disabled by default).\n - * Note that host references to constant and device variables using APIs like - * ::cuModuleGetGlobal() with this option specified may result in undefined - * behavior unless the variables are explicitly specified using - * ::CU_JIT_REFERENCED_VARIABLE_NAMES.\n Option type: int\n Applies to: - * link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES, - CU_JIT_NUM_OPTIONS } CUjit_option; @@ -1492,32 +1310,23 @@ typedef enum CUjit_option_enum { * Online compilation targets */ typedef enum CUjit_target_enum { - CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ - CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ - CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ - CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ - CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/ CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/ - CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/ - CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/ - CU_TARGET_COMPUTE_86 = 86, /**< Compute device class 8.6.*/ - CU_TARGET_COMPUTE_87 = 87, /**< Compute device class 8.7.*/ - + CU_TARGET_COMPUTE_86 = 86 /**< Compute device class 8.6.*/ } CUjit_target; /** @@ -1755,10 +1564,6 @@ typedef enum CUgraphNodeType_enum { CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9, /**< External semaphore wait node */ CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10, /**< Memory Allocation Node */ CU_GRAPH_NODE_TYPE_MEM_FREE = 11 /**< Memory Free Node */ - - , - CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12 /**< Batch MemOp Node */ - } CUgraphNodeType; typedef enum CUsynchronizationPolicy_enum { @@ -1771,35 +1576,24 @@ typedef enum CUsynchronizationPolicy_enum { /** * Graph kernel node Attributes */ - typedef enum CUkernelNodeAttrID_enum { CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW = - 1 /**< Identifier for ::CUkernelNodeAttrValue::accessPolicyWindow. */ - , + 1, /**< Identifier for ::CUkernelNodeAttrValue::accessPolicyWindow. */ CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE = 2 /**< Allows a kernel node to be cooperative (see ::cuLaunchCooperativeKernel). */ - - , - CU_KERNEL_NODE_ATTRIBUTE_PRIORITY = 8 /**< Sets the priority of the kernel. 
*/ - } CUkernelNodeAttrID; /** * Graph kernel node attributes union, used with * ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute */ - typedef union CUkernelNodeAttrValue_union { CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ int cooperative; /**< Nonzero indicates a cooperative kernel (see ::cuLaunchCooperativeKernel). */ - - int priority; /**< Execution priority of the kernel. */ - } CUkernelNodeAttrValue_v1; - typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue; /** @@ -1826,7 +1620,6 @@ typedef enum CUstreamCaptureMode_enum { /** * Stream Attributes */ - typedef enum CUstreamAttrID_enum { CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, /**< Identifier for ::CUstreamAttrValue::accessPolicyWindow. */ @@ -1838,7 +1631,6 @@ typedef enum CUstreamAttrID_enum { * Stream attributes union, used with * ::cuStreamSetAttribute/::cuStreamGetAttribute */ - typedef union CUstreamAttrValue_union { CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ @@ -1846,7 +1638,6 @@ typedef union CUstreamAttrValue_union { syncPolicy; /**< Value for ::CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY. */ } CUstreamAttrValue_v1; - typedef CUstreamAttrValue_v1 CUstreamAttrValue; /** @@ -1959,13 +1750,6 @@ typedef enum cudaError_enum { */ CUDA_ERROR_STUB_LIBRARY = 34, - /** - * This indicates that requested CUDA device is unavailable at the current - * time. Devices are often unavailable due to use of - * ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED. - */ - CUDA_ERROR_DEVICE_UNAVAILABLE = 46, - /** * This indicates that no CUDA-capable devices were detected by the installed * CUDA driver. @@ -2125,8 +1909,7 @@ typedef enum cudaError_enum { CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224, /** - * This indicates that the device kernel source is invalid. This includes - * compilation/linker errors encountered in device code or user error. + * This indicates that the device kernel source is invalid. */ CUDA_ERROR_INVALID_SOURCE = 300, @@ -2560,9 +2343,9 @@ typedef size_t(CUDA_CB *CUoccupancyB2DSize)(int blockSize); * On Windows the flag is a no-op. * On Linux that memory is marked as non cache-coherent for the GPU and * is expected to be physically contiguous. It may return - * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, - * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. - * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED + * CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, + * CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. + * On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED * is returned. * Flag for ::cuMemHostRegister() */ @@ -2571,13 +2354,12 @@ typedef size_t(CUDA_CB *CUoccupancyB2DSize)(int blockSize); /** * If set, the passed memory pointer is treated as pointing to memory that is * considered read-only by the device. On platforms without - * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag + * CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag * is required in order to register memory mapped to the CPU as read-only. * Support for the use of this flag can be queried from the device attribute - * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag - * with a current context associated with a device that does not have this - * attribute set will cause ::cuMemHostRegister to error with - * ::CUDA_ERROR_NOT_SUPPORTED. 
+ * CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with + * a current context associated with a device that does not have this attribute + * set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED. */ #define CU_MEMHOSTREGISTER_READ_ONLY 0x08 @@ -2743,16 +2525,6 @@ typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st { } CUDA_ARRAY_SPARSE_PROPERTIES_v1; typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES; -/** - * CUDA array memory requirements - */ -typedef struct CUDA_ARRAY_MEMORY_REQUIREMENTS_st { - size_t size; /**< Total required memory size */ - size_t alignment; /**< alignment requirement */ - unsigned int reserved[4]; -} CUDA_ARRAY_MEMORY_REQUIREMENTS_v1; -typedef CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 CUDA_ARRAY_MEMORY_REQUIREMENTS; - /** * CUDA Resource descriptor */ @@ -3386,14 +3158,6 @@ typedef enum CUmemAllocationGranularity_flags_enum { 0x1 /**< Recommended granularity for allocation for best performance */ } CUmemAllocationGranularity_flags; -/** - * Specifies the handle type for address range - */ -typedef enum CUmemRangeHandleType_enum { - CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD = 0x1, - CU_MEM_RANGE_HANDLE_TYPE_MAX = 0x7FFFFFFF -} CUmemRangeHandleType; - /** * Sparse subresource types */ @@ -3556,11 +3320,8 @@ typedef enum CUgraphExecUpdateResult_enum { 0x6, /**< The update failed because something about the node is not supported */ CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = - 0x7, /**< The update failed because the function of a kernel node changed - in an unsupported way */ - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED = - 0x8 /**< The update failed because the node attributes changed in a way - that is not supported */ + 0x7 /**< The update failed because the function of a kernel node changed + in an unsupported way */ } CUgraphExecUpdateResult; /** @@ -3773,12 +3534,6 @@ typedef enum CUgraphMem_attribute_enum { */ #define CUDA_ARRAY3D_SPARSE 0x40 -/** - * This flag if set indicates that the CUDA array or CUDA mipmapped array - * will allow deferred memory mapping - */ -#define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80 - /** * Override the texref format with a format inferred from the array. * Flag for ::cuTexRefSetArray() @@ -3810,27 +3565,11 @@ typedef enum CUgraphMem_attribute_enum { */ #define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION 0x20 -/** - * Enable seamless cube map filtering. - * Flag for ::cuTexObjectCreate() - */ -#define CU_TRSF_SEAMLESS_CUBEMAP 0x40 - -/** - * C++ compile time constant for CU_LAUNCH_PARAM_END - */ -#define CU_LAUNCH_PARAM_END_AS_INT 0x00 - /** * End of array terminator for the \p extra parameter to * ::cuLaunchKernel */ -#define CU_LAUNCH_PARAM_END ((void *)CU_LAUNCH_PARAM_END_AS_INT) - -/** - * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_POINTER - */ -#define CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT 0x01 +#define CU_LAUNCH_PARAM_END ((void *)0x00) /** * Indicator that the next value in the \p extra parameter to @@ -3841,13 +3580,7 @@ typedef enum CUgraphMem_attribute_enum { * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no * effect. 
*/ -#define CU_LAUNCH_PARAM_BUFFER_POINTER \ - ((void *)CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT) - -/** - * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_SIZE - */ -#define CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT 0x02 +#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void *)0x01) /** * Indicator that the next value in the \p extra parameter to @@ -3857,7 +3590,7 @@ typedef enum CUgraphMem_attribute_enum { * in the \p extra array if the value associated with * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. */ -#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void *)CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT) +#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void *)0x02) /** * For texture references loaded into the module, use default texunit from @@ -3955,11 +3688,6 @@ typedef enum CUgraphDebugDot_flags_enum { 1 << 11, /** Adds memory alloc node parameters to output */ CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1 << 12 /** Adds memory free node parameters to output */ - - , - CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS = - 1 << 13 /** Adds batch mem op node parameters to output */ - } CUgraphDebugDot_flags; /** @@ -3986,12 +3714,6 @@ typedef enum CUgraphInstantiate_flags_enum { CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1 /**< Automatically free memory allocated in a graph before relaunching. */ - - , - CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY = - 8 /**< Run the graph using the per-node priority attributes rather than - the priority of the stream it is launched into. */ - } CUgraphInstantiate_flags; /** @} */ /* END CUDA_TYPES */ @@ -4402,117 +4124,117 @@ CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, * \p dev. The supported attributes are: * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per * block; - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of - * shared memory available to a thread block in bytes + * shared memory available to a thread block in bytes; * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for - * __constant__ variables in a CUDA C kernel in bytes - * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads + * __constant__ variables in a CUDA C kernel in bytes; + * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the * memory copy functions that involve memory regions allocated through - * ::cuMemAllocPitch() + * ::cuMemAllocPitch(); * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D - * texture width + * texture width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width - * for a 1D 
texture bound to linear memory + * for a 1D texture bound to linear memory; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum - * mipmapped 1D texture width + * mipmapped 1D texture width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D - * texture width + * texture width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D - * texture height + * texture height; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width - * for a 2D texture bound to linear memory + * for a 2D texture bound to linear memory; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height - * for a 2D texture bound to linear memory + * for a 2D texture bound to linear memory; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch - * in bytes for a 2D texture bound to linear memory + * in bytes for a 2D texture bound to linear memory; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum - * mipmapped 2D texture width + * mipmapped 2D texture width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum - * mipmapped 2D texture height + * mipmapped 2D texture height; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D - * texture width + * texture width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D - * texture height + * texture height; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D - * texture depth + * texture depth; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: * Alternate maximum 3D texture width, 0 if no alternate - * maximum 3D texture size is supported + * maximum 3D texture size is supported; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: * Alternate maximum 3D texture height, 0 if no alternate - * maximum 3D texture size is supported + * maximum 3D texture size is supported; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: * Alternate maximum 3D texture depth, 0 if no alternate - * maximum 3D texture size is supported + * maximum 3D texture size is supported; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: - * Maximum cubemap texture width or height + * Maximum cubemap texture width or height; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: - * Maximum 1D layered texture width + * Maximum 1D layered texture width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: - * Maximum layers in a 1D layered texture + * Maximum layers in a 1D layered texture; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: - * Maximum 2D layered texture width + * Maximum 2D layered texture width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: - * Maximum 2D layered texture height + * Maximum 2D layered texture height; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: - * Maximum layers in a 2D layered texture + * Maximum layers in a 2D layered texture; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: - * Maximum cubemap layered texture width or height + * Maximum cubemap layered texture width or height; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: - * Maximum layers in a cubemap layered texture + * Maximum layers in a cubemap layered texture; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: - * Maximum 1D surface width + * Maximum 1D surface width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: - * Maximum 2D surface width + * Maximum 2D surface width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: - * Maximum 2D surface 
height + * Maximum 2D surface height; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: - * Maximum 3D surface width + * Maximum 3D surface width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: - * Maximum 3D surface height + * Maximum 3D surface height; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: - * Maximum 3D surface depth + * Maximum 3D surface depth; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: - * Maximum 1D layered surface width + * Maximum 1D layered surface width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: - * Maximum layers in a 1D layered surface + * Maximum layers in a 1D layered surface; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: - * Maximum 2D layered surface width + * Maximum 2D layered surface width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: - * Maximum 2D layered surface height + * Maximum 2D layered surface height; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: - * Maximum layers in a 2D layered surface + * Maximum layers in a 2D layered surface; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: - * Maximum cubemap surface width + * Maximum cubemap surface width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: - * Maximum cubemap layered surface width + * Maximum cubemap layered surface width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: - * Maximum layers in a cubemap layered surface + * Maximum layers in a cubemap layered surface; * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit - * registers available to a thread block - * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz + * registers available to a thread block; + * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz; * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture * base addresses aligned to ::textureAlign bytes do not need an offset - * applied to texture fetches + * applied to texture fetches; * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement - * for 2D texture references bound to pitched memory + * for 2D texture references bound to pitched memory; * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy - * memory between host and device while executing a kernel, or 0 if not + * memory between host and device while executing a kernel, or 0 if not; * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on - * the device + * the device; * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit - * for kernels executed on the device, or 0 if not + * for kernels executed on the device, or 0 if not; * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the - * memory subsystem, or 0 if not + * memory subsystem, or 0 if not; * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host - * memory into the CUDA address space, or 0 if not + * memory into the CUDA address space, or 0 if not; * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently * in. Available modes are as follows: * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and @@ -4520,139 +4242,102 @@ CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is * prohibited from creating new CUDA contexts. 
* - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - - Device - * can have only one context used by a single process at a time. + * Device can have only one context used by a single process at a time. * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports * executing multiple kernels within the same context simultaneously, or 0 if * not. It is not guaranteed that multiple kernels will be resident * on the device concurrently so this feature should not be relied upon for - * correctness. + * correctness; * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the - * device, 0 if error correction is disabled or not supported by the device - * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device + * device, 0 if error correction is disabled or not supported by the device; + * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device; * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) - identifier - * of the device + * identifier of the device; * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. - TCC - * is only available on Tesla hardware running Windows Vista or later + * TCC is only available on Tesla hardware running Windows Vista or later; * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in - kilohertz + * kilohertz; * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in - bits + * bits; * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the - device doesn't have L2 cache + * device doesn't have L2 cache; * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident - threads per multiprocessor + * threads per multiprocessor; * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified - address space with - * the host, or 0 if not + * address space with the host, or 0 if not; * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability - version number + * version number; * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability - version number + * version number; * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports - caching globals - * in L1 cache, 0 if caching globals in L1 cache is not supported by the - device + * caching globals in L1 cache, 0 if caching globals in L1 cache is not + * supported by the device; * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports - caching locals - * in L1 cache, 0 if caching locals in L1 cache is not supported by the - device + * caching locals in L1 cache, 0 if caching locals in L1 cache is not supported + * by the device; * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount - of - * shared memory available to a multiprocessor in bytes; this amount is shared - * by all thread blocks simultaneously resident on a multiprocessor + * of shared memory available to a multiprocessor in bytes; this amount is + * shared by all thread blocks simultaneously resident on a multiprocessor; * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of - 32-bit - * registers available to a multiprocessor; this number is shared by all - thread - * blocks simultaneously resident on a multiprocessor + * 32-bit registers available to a multiprocessor; this number is shared by all + * thread blocks simultaneously resident on a multiprocessor; * - 
::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating - managed memory - * on this system, 0 if allocating managed memory is not supported by the - device on this system. + * managed memory on this system, 0 if allocating managed memory is not + * supported by the device on this system. * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, - 0 if not. + * 0 if not. * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a - group of devices - * associated with the same board. Devices on the same multi-GPU board will - share the same identifier. + * group of devices associated with the same board. Devices on the same + * multi-GPU board will share the same identifier. * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the - device and the host - * supports native atomic operations. + * device and the host supports native atomic operations. * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of - single precision performance - * (in floating-point operations per second) to double precision performance. + * single precision performance (in floating-point operations per second) to + * double precision performance. * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device suppports coherently - accessing - * pageable memory without calling cudaHostRegister on it. + * accessing pageable memory without calling cudaHostRegister on it. * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently - access managed memory - * concurrently with the CPU. + * access managed memory concurrently with the CPU. * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute - Preemption. + * Preemption. * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can - access host registered - * memory at the same virtual address as the CPU. + * access host registered memory at the same virtual address as the CPU. * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per - block shared memory size - * suported on this device. This is the maximum value that can be opted into - when using the cuFuncSetAttribute() call. - * For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES + * block shared memory size suported on this device. This is the maximum value + * that can be opted into when using the cuFuncSetAttribute() call. For more + * details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device - accesses pageable memory via the host's - * page tables. + * accesses pageable memory via the host's page tables. * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can - directly access managed memory on the device without migration. + * directly access managed memory on the device without migration. 
* - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED: Device supports - virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, - ::cuMemMap and related APIs + * virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, + * ::cuMemMap and related APIs * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device - supports exporting memory to a posix file descriptor with - ::cuMemExportToShareableHandle, if requested via ::cuMemCreate + * supports exporting memory to a posix file descriptor with + * ::cuMemExportToShareableHandle, if requested via ::cuMemCreate * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: Device supports - exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if - requested via ::cuMemCreate + * exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if + * requested via ::cuMemCreate * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device - supports exporting memory to a Win32 KMT handle with - ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of - thread blocks that can reside on a multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports - compressible memory allocation via ::cuMemCreate + * supports exporting memory to a Win32 KMT handle with + * ::cuMemExportToShareableHandle, if requested ::cuMemCreate * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting - lines capacity setting in bytes + * lines capacity setting in bytes. * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of - CUaccessPolicyWindow::num_bytes - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device - supports specifying the GPUDirect RDMA flag with ::cuMemCreate. + * CUaccessPolicyWindow::num_bytes. + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of + * thread blocks that can reside on a multiprocessor. + * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports + * compressible memory allocation via ::cuMemCreate * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared - memory per block reserved by CUDA driver in bytes - * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse - CUDA arrays and sparse CUDA mipmapped arrays. + * memory per block reserved by CUDA driver in bytes. 
* - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports - using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register - memory that must be mapped as read-only to the GPU + * using the ::cuMemHostRegister flag CU_MEMHOSTERGISTER_READ_ONLY to register + * memory that must be mapped as read-only to the GPU * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the - ::cuMemAllocAsync and ::cuMemPool family of APIs - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect - RDMA APIs, like nvidia_p2p_get_pages (see - https://docs.nvidia.com/cuda/gpudirect-rdma for more information) - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned - attribute shall be interpreted as a bitmask, where the individual bits are - described by the ::CUflushGPUDirectRDMAWritesOptions enum - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA - writes to the device do not need to be flushed for consumers within the scope - indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for - the numerical values returned here. - * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle - types supported with mempool based IPC - - * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device - supports deferred mapping CUDA arrays and CUDA mipmapped arrays. - + * ::cuMemAllocAsync and ::cuMemPool family of APIs * * \param pi - Returned device attribute value * \param attrib - Device attribute to query @@ -5443,16 +5128,6 @@ CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, * It is the responsibility of the calling function to ensure that no API * call issues using \p ctx while ::cuCtxDestroy() is executing. * - * Destroys and cleans up all resources associated with the context. - * It is the caller's responsibility to ensure that the context or its resources - * are not accessed or passed in subsequent API calls and doing so will result - * in undefined behavior. These resources include CUDA types such as ::CUmodule, - * ::CUfunction, ::CUstream, ::CUevent, - * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, - * ::CUsurfref, - * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and - * ::CUexternalSemaphore. - * * If \p ctx is current to the calling thread then \p ctx will also be * popped from the current thread's context stack (as though ::cuCtxPopCurrent() * were called). If \p ctx is current to other threads, then \p ctx will @@ -5528,7 +5203,7 @@ CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); * ::cuCtxPushCurrent() was called, this function makes that context current to * the CPU thread again. * - * \param pctx - Returned popped context handle + * \param pctx - Returned new context handle * * \return * ::CUDA_SUCCESS, @@ -6437,7 +6112,6 @@ CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr - * \note_destroy_ub * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, @@ -6763,21 +6437,8 @@ CUresult CUDAAPI cuLinkDestroy(CUlinkState state); /** * \brief Gets free and total memory * - * Returns in \p *total the total amount of memory available to the the current - * context. Returns in \p *free the amount of memory on the device that is free - * according to the OS. CUDA is not guaranteed to be able to allocate all of the - * memory that the OS reports as free. 
In a multi-tenet situation, free estimate - * returned is prone to race condition where a new allocation/free done by a - * different process or a different thread in the same process between the time - * when free memory was estimated and reported, will result in deviation in free - * value reported and actual free memory. - * - * The integrated GPU on Tegra shares memory with CPU and other component - * of the SoC. The free and total values returned by the API excludes - * the SWAP memory space maintained by the OS on some platforms. - * The OS may move some of the memory pages into swap area as the GPU or - * CPU allocate or access memory. See Tegra app note on how to calculate - * total and free memory on Tegra. + * Returns in \p *free and \p *total respectively, the free and total amount of + * memory available for allocation by the CUDA context, in bytes. * * \param free - Returned free memory in bytes * \param total - Returned total memory in bytes @@ -6909,17 +6570,7 @@ CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, * \brief Frees device memory * * Frees the memory space pointed to by \p dptr, which must have been returned - * by a previous call to one of the following memory allocation APIs - - * ::cuMemAlloc(), - * ::cuMemAllocPitch(), ::cuMemAllocManaged(), ::cuMemAllocAsync(), - * ::cuMemAllocFromPoolAsync() - * - * Note - This API will not perform any implict synchronization when the pointer - * was allocated with - * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all - * accesses to the pointer have completed before invoking ::cuMemFree. For best - * performance and memory reuse, users should use ::cuMemFreeAsync to free - * memory allocated via the stream ordered memory allocator. + * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). * * \param dptr - Pointer to memory to free * @@ -6933,16 +6584,13 @@ CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemAllocManaged, ::cuMemAllocAsync, - * ::cuMemAllocFromPoolAsync, - * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, ::cuMemcpy3D, - * ::cuMemcpy3DAsync, - * ::cuMemcpyAtoA, ::cuMemcpyAtoD, ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, - * ::cuMemcpyDtoA, - * ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, - * ::cuMemcpyHtoA, - * ::cuMemcpyHtoAAsync, ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, ::cuMemFreeAsync, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, + * ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaFree @@ -7170,7 +6818,7 @@ CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); * Addressing is enabled. In such systems, it is valid to access the memory * using either pointer on devices that have a non-zero value for the device * attribute. 
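To illustrate the allocation and free pairing documented above, here is a minimal sketch that checks available memory with cuMemGetInfo() before allocating and releasing a device buffer; the helper name is hypothetical and a current CUDA context is assumed.

    #include <cuda.h>

    // Hypothetical helper: check free memory, then allocate and release a buffer.
    // Assumes cuInit() has run and a CUDA context is current on this thread.
    static CUresult AllocateScratch(size_t bytes) {
      size_t freeBytes = 0, totalBytes = 0;
      CUresult rc = cuMemGetInfo(&freeBytes, &totalBytes);
      if (rc != CUDA_SUCCESS) return rc;
      if (bytes > freeBytes) return CUDA_ERROR_OUT_OF_MEMORY;

      CUdeviceptr dptr = 0;
      rc = cuMemAlloc(&dptr, bytes);
      if (rc != CUDA_SUCCESS) return rc;

      // ... use the buffer ...

      return cuMemFree(dptr);  // pair every cuMemAlloc/cuMemAllocPitch with cuMemFree
    }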
Note however that such devices should access the memory using only - * one of the two pointers and not both. + * of the two pointers and not both. * * \p Flags provides for future releases. For now, it must be set to 0. * @@ -7657,6 +7305,8 @@ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); * a result, this function is best used sparingly to register staging areas for * data exchange between host and device. * + * This function has limited support on Mac OS X. OS 10.7 or higher is required. + * * The \p Flags parameter enables different options to be specified that * affect the allocation, as follows. * @@ -7673,10 +7323,10 @@ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); * * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to * memory that is considered read-only by the device. On platforms without - * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this - * flag is required in order to register memory mapped to the CPU as read-only. + * CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag + * is required in order to register memory mapped to the CPU as read-only. * Support for the use of this flag can be queried from the device attribute - * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag + * CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag * with a current context associated with a device that does not have this * attribute set will cause ::cuMemHostRegister to error with * CUDA_ERROR_NOT_SUPPORTED. @@ -9886,7 +9536,7 @@ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, * float16's: * \code CUDA_ARRAY_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_HALF; + desc.FormatFlags = CU_AD_FORMAT_HALF; desc.NumChannels = 4; desc.Width = width; desc.Height = height; @@ -9896,7 +9546,7 @@ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, * of which is two 8-bit unsigned chars: * \code CUDA_ARRAY_DESCRIPTOR arrayDesc; - desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; + desc.FormatFlags = CU_AD_FORMAT_UNSIGNED_INT8; desc.NumChannels = 2; desc.Width = width; desc.Height = height; @@ -10027,58 +9677,6 @@ CUresult CUDAAPI cuArrayGetSparseProperties( CUresult CUDAAPI cuMipmappedArrayGetSparseProperties( CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap); -/** - * \brief Returns the memory requirements of a CUDA array - * - * Returns the memory requirements of a CUDA array in \p memoryRequirements - * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING - * ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size - * represents the total size of the CUDA array. - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment - * represents the alignment necessary for mapping the CUDA array. 
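Building on the descriptor snippets above, a minimal sketch for creating the 4x16-bit float16 array could look like the following; the helper name is hypothetical, a current context is assumed, and the member names follow the CUDA_ARRAY_DESCRIPTOR declaration in this header.

    #include <cuda.h>

    // Hypothetical helper: 2D array of 4 x 16-bit float16 texels (width x height).
    // Assumes a current context; field names follow the CUDA_ARRAY_DESCRIPTOR
    // declaration in this header.
    static CUresult CreateHalf4Array(CUarray *outArray, size_t width, size_t height) {
      CUDA_ARRAY_DESCRIPTOR desc = {};
      desc.Format = CU_AD_FORMAT_HALF;
      desc.NumChannels = 4;
      desc.Width = width;
      desc.Height = height;
      return cuArrayCreate(outArray, &desc);
    }
    // Release later with cuArrayDestroy(*outArray).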
- * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS - * \param[in] array - CUDA array to get the memory requirements of - * \param[in] device - Device to get the memory requirements for - * \sa ::cuMipmappedArrayGetMemoryRequirements, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI -cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, - CUarray array, CUdevice device); - -/** - * \brief Returns the memory requirements of a CUDA mipmapped array - * - * Returns the memory requirements of a CUDA mipmapped array in \p - * memoryRequirements If the CUDA mipmapped array is not allocated with flag - * ::CUDA_ARRAY3D_DEFERRED_MAPPING - * ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size - * represents the total size of the CUDA mipmapped array. - * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment - * represents the alignment necessary for mapping the CUDA mipmapped - * array. - * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS - * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of - * \param[in] device - Device to get the memory requirements for - * \sa ::cuArrayGetMemoryRequirements, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI cuMipmappedArrayGetMemoryRequirements( - CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUmipmappedArray mipmap, - CUdevice device); - /** * \brief Gets a CUDA array plane from a CUDA array * @@ -10323,7 +9921,7 @@ CUresult CUDAAPI cuArrayDestroy(CUarray hArray); * 4x16-bit float16's: * \code CUDA_ARRAY3D_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_HALF; + desc.FormatFlags = CU_AD_FORMAT_HALF; desc.NumChannels = 4; desc.Width = width; desc.Height = height; @@ -10876,10 +10474,7 @@ CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, handle. * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using - * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE - - * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. - + * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE. * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. * If ::CUarrayMapInfo::resourceType is set to @@ -10888,10 +10483,7 @@ CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, mipmapped array handle. * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been - * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE - - * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. - + * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE. * * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. @@ -10954,13 +10546,6 @@ CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index. * Otherwise, must be zero. * - - * If ::CUarrayMapInfo::resource::array or ::CUarrayMapInfo::resource::mipmap - was created with ::CUDA_ARRAY3D_DEFERRED_MAPPING - * flag set the ::CUarrayMapInfo::subresourceType and the contents of - ::CUarrayMapInfo::subresource will be ignored. - * - * ::CUarrayMapInfo::memOperationType specifies the type of operation. 
::CUmemOperationType is defined as: \code typedef enum CUmemOperationType_enum { CU_MEM_OPERATION_TYPE_MAP = 1, CU_MEM_OPERATION_TYPE_UNMAP = 2 } @@ -13859,10 +13444,6 @@ CUresult CUDAAPI cuExternalMemoryGetMappedBuffer( * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * - * \note On Tegra devices, this API will always attempt to do a compressed - mapping when the \p extMem is - * imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD - * * \sa ::cuImportExternalMemory, * ::cuDestroyExternalMemory, * ::cuExternalMemoryGetMappedBuffer @@ -14232,7 +13813,7 @@ CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); /** @} */ /* END CUDA_EXTRES_INTEROP */ /** - * \defgroup CUDA_MEMOP Stream Memory Operations + * \defgroup CUDA_MEMOP Stream memory operations * * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ @@ -14240,13 +13821,8 @@ CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); * This section describes the stream memory operations of the low-level CUDA * driver application programming interface. * - * There are two versions of these APIs, a legacy version and a newer V2 - * version. - * - * V1: - * - * The V1 API is disabled by default. Users are required - * to explicitly enable it, e.g. on Linux by passing the kernel module + * The whole set of operations is disabled by default. Users are required + * to explicitly enable them, e.g. on Linux by passing the kernel module * parameter shown below: * modprobe nvidia NVreg_EnableStreamMemOPs=1 * There is currently no way to enable these operations on other operating @@ -14269,25 +13845,6 @@ CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); * hardware features and can be queried with ::cuDeviceGetAttribute() and * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. * - * V2: - * - * The V2 APIs are available by default on all platforms. - * - * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2. - * - * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() - * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and - * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2. - * - * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and - * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform - * hardware features and can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. - * - * V1 & V2: - * * Note that all memory pointers passed as parameters to these operations * are device pointers. Where necessary a device pointer should be * obtained, for example with ::cuMemHostGetDevicePointer(). @@ -14295,16 +13852,6 @@ CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); * None of the operations accepts pointers to managed memory buffers * (::cuMemAllocManaged). * - * \note - * Warning: - * Improper use of these APIs may deadlock the application. Synchronization - * ordering established through these APIs is not visible to CUDA. CUDA tasks - * that are (even indirectly) ordered by these APIs should also have that order - * expressed with CUDA-visible dependencies such as events. This ensures that - * the scheduler does not serialize them in an improper order. 
For more - * information, see the Stream Memory Operations section in the programming - * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). - * * @{ */ @@ -14328,16 +13875,6 @@ CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); * ::cuDeviceGetAttribute() and * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. * - * \note - * Warning: - * Improper use of this API may deadlock the application. Synchronization - * ordering established through this API is not visible to CUDA. CUDA tasks - * that are (even indirectly) ordered by this API should also have that order - * expressed with CUDA-visible dependencies such as events. This ensures that - * the scheduler does not serialize them in an improper order. For more - * information, see the Stream Memory Operations section in the programming - * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). - * * \param stream The stream to synchronize on the memory location. * \param addr The memory location to wait on. * \param value The value to compare with the memory location. @@ -14374,16 +13911,6 @@ CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, * Support for this can be queried with ::cuDeviceGetAttribute() and * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. * - * \note - * Warning: - * Improper use of this API may deadlock the application. Synchronization - * ordering established through this API is not visible to CUDA. CUDA tasks - * that are (even indirectly) ordered by this API should also have that order - * expressed with CUDA-visible dependencies such as events. This ensures that - * the scheduler does not serialize them in an improper order. For more - * information, see the Stream Memory Operations section in the programming - * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). - * * \param stream The stream to synchronize on the memory location. * \param addr The memory location to wait on. * \param value The value to compare with the memory location. @@ -14493,16 +14020,6 @@ CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. See related APIs for details * on querying support for specific operations. * - * \note - * Warning: - * Improper use of this API may deadlock the application. Synchronization - * ordering established through this API is not visible to CUDA. CUDA tasks - * that are (even indirectly) ordered by this API should also have that order - * expressed with CUDA-visible dependencies such as events. This ensures that - * the scheduler does not serialize them in an improper order. For more - * information, see the Stream Memory Operations section in the programming - * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). - * * \param stream The stream to enqueue the operations in. * \param count The number of operations in the array. Must be less than 256. * \param paramArray The types and parameters of the individual operations. @@ -14524,209 +14041,6 @@ CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); -/** - * \brief Wait on a memory location - * - * Enqueues a synchronization of the stream on the given memory location. Work - * ordered after the operation will block until the given condition on the - * memory is satisfied. By default, the condition is to wait for - * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. 
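A short sketch of the write/wait pairing described in this section, assuming stream memory operations are enabled on the platform, that addr was obtained with cuMemHostGetDevicePointer() for memory registered with cuMemHostRegister(), and that the default CUstreamWriteValue_flags and CUstreamWaitValue_flags values apply; the helper name is hypothetical.

    #include <cuda.h>

    // Hypothetical helper: publish a value on one stream, block another until it lands.
    static CUresult SignalAndWait(CUstream producer, CUstream consumer,
                                  CUdeviceptr addr, cuuint32_t value) {
      CUresult rc = cuStreamWriteValue32(producer, addr, value,
                                         CU_STREAM_WRITE_VALUE_DEFAULT);
      if (rc != CUDA_SUCCESS) return rc;
      // Default condition: wait until (int32_t)(*addr - value) >= 0.
      return cuStreamWaitValue32(consumer, addr, value, CU_STREAM_WAIT_VALUE_GEQ);
    }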
- * Other condition types can be specified via \p flags. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot - * be used with managed memory (::cuMemAllocManaged). - * - * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with - * ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2. - * - * \note - * Warning: - * Improper use of this API may deadlock the application. Synchronization - * ordering established through this API is not visible to CUDA. CUDA tasks - * that are (even indirectly) ordered by this API should also have that order - * expressed with CUDA-visible dependencies such as events. This ensures that - * the scheduler does not serialize them in an improper order. For more - * information, see the Stream Memory Operations section in the programming - * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). - * - * \param stream The stream to synchronize on the memory location. - * \param addr The memory location to wait on. - * \param value The value to compare with the memory location. - * \param flags See ::CUstreamWaitValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue64_v2, - * ::cuStreamWriteValue32_v2, - * ::cuStreamWriteValue64_v2, - * ::cuStreamBatchMemOp_v2, - * ::cuMemHostRegister, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, - cuuint32_t value, unsigned int flags); - -/** - * \brief Wait on a memory location - * - * Enqueues a synchronization of the stream on the given memory location. Work - * ordered after the operation will block until the given condition on the - * memory is satisfied. By default, the condition is to wait for - * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. - * Other condition types can be specified via \p flags. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2. - * - * \note - * Warning: - * Improper use of this API may deadlock the application. Synchronization - * ordering established through this API is not visible to CUDA. CUDA tasks - * that are (even indirectly) ordered by this API should also have that order - * expressed with CUDA-visible dependencies such as events. This ensures that - * the scheduler does not serialize them in an improper order. For more - * information, see the Stream Memory Operations section in the programming - * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). - * - * \param stream The stream to synchronize on the memory location. - * \param addr The memory location to wait on. - * \param value The value to compare with the memory location. - * \param flags See ::CUstreamWaitValue_flags. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue32_v2, - * ::cuStreamWriteValue32_v2, - * ::cuStreamWriteValue64_v2, - * ::cuStreamBatchMemOp_v2, - * ::cuMemHostRegister, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, - cuuint64_t value, unsigned int flags); - -/** - * \brief Write a value to memory - * - * Write a value to memory. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot - * be used with managed memory (::cuMemAllocManaged). - * - * \param stream The stream to do the write in. - * \param addr The device address to write to. - * \param value The value to write. - * \param flags See ::CUstreamWriteValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWriteValue64_v2, - * ::cuStreamWaitValue32_v2, - * ::cuStreamWaitValue64_v2, - * ::cuStreamBatchMemOp_v2, - * ::cuMemHostRegister, - * ::cuEventRecord - */ -CUresult CUDAAPI cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, - cuuint32_t value, unsigned int flags); - -/** - * \brief Write a value to memory - * - * Write a value to memory. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2. - * - * \param stream The stream to do the write in. - * \param addr The device address to write to. - * \param value The value to write. - * \param flags See ::CUstreamWriteValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWriteValue32_v2, - * ::cuStreamWaitValue32_v2, - * ::cuStreamWaitValue64_v2, - * ::cuStreamBatchMemOp_v2, - * ::cuMemHostRegister, - * ::cuEventRecord - */ -CUresult CUDAAPI cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, - cuuint64_t value, unsigned int flags); - -/** - * \brief Batch operations to synchronize the stream via memory operations - * - * This is a batch version of ::cuStreamWaitValue32_v2() and - * ::cuStreamWriteValue32_v2(). Batching operations may avoid some performance - * overhead in both the API call and the device execution versus adding them to - * the stream in separate API calls. The operations are enqueued in the order - * they appear in the array. - * - * See ::CUstreamBatchMemOpType for the full set of supported operations, and - * ::cuStreamWaitValue32_v2(), ::cuStreamWaitValue64_v2(), - * ::cuStreamWriteValue32_v2(), and ::cuStreamWriteValue64_v2() for details of - * specific operations. - * - * See related APIs for details on querying support for specific operations. - * - * \note - * Warning: - * Improper use of this API may deadlock the application. Synchronization - * ordering established through this API is not visible to CUDA. CUDA tasks - * that are (even indirectly) ordered by this API should also have that order - * expressed with CUDA-visible dependencies such as events. This ensures that - * the scheduler does not serialize them in an improper order. 
For more - * information, see the Stream Memory Operations section in the programming - * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). - * - * \param stream The stream to enqueue the operations in. - * \param count The number of operations in the array. Must be less than 256. - * \param paramArray The types and parameters of the individual operations. - * \param flags Reserved for future expansion; must be 0. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue32_v2, - * ::cuStreamWaitValue64_v2, - * ::cuStreamWriteValue32_v2, - * ::cuStreamWriteValue64_v2, - * ::cuMemHostRegister - */ -CUresult CUDAAPI cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, - CUstreamBatchMemOpParams *paramArray, - unsigned int flags); - /** @} */ /* END CUDA_MEMOP */ /** @@ -14774,35 +14088,9 @@ CUresult CUDAAPI cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has * been compiled with user specified option "-Xptxas --dlcm=ca" set . * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in - bytes of - * dynamically-allocated shared memory. + * bytes of dynamically-allocated shared memory. * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared - memory-L1 - * cache split ratio in percent of total shared memory. - - - - - - - - - - - - - - - - - - - - - - - - + * memory-L1 cache split ratio in percent of total shared memory. * * \param pi - Returned attribute value * \param attrib - Attribute requested @@ -14831,52 +14119,26 @@ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, * \brief Sets information about a function * * This call sets the value of a specified attribute \p attrib on the kernel - given - * by \p hfunc to an integer value specified by \p val - * This function returns CUDA_SUCCESS if the new value of the attribute could be - * successfully set. If the set fails, this call will return an error. - * Not all attributes can have values set. Attempting to set a value on a - read-only - * attribute will result in an error (CUDA_ERROR_INVALID_VALUE) + * given by \p hfunc to an integer value specified by \p val This function + * returns CUDA_SUCCESS if the new value of the attribute could be successfully + * set. If the set fails, this call will return an error. Not all attributes can + * have values set. Attempting to set a value on a read-only attribute will + * result in an error (CUDA_ERROR_INVALID_VALUE) * * Supported attributes for the cuFuncSetAttribute call are: * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This maximum size in - bytes of - * dynamically-allocated shared memory. The value should contain the requested - * maximum size of dynamically-allocated shared memory. The sum of this value - and - * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed - the - * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. - * The maximal size of requestable dynamic shared memory may differ by GPU - * architecture. + * bytes of dynamically-allocated shared memory. The value should contain the + * requested maximum size of dynamically-allocated shared memory. The sum of + * this value and the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES + * cannot exceed the device attribute + * ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. 
The maximal size of + * requestable dynamic shared memory may differ by GPU architecture. * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the - L1 - * cache and shared memory use the same hardware resources, this sets the - shared memory - * carveout preference, in percent of the total shared memory. - * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR - * This is only a hint, and the driver can choose a different ratio if - required to execute the function. - - - - - - - - - - - - - - - - - - - + * L1 cache and shared memory use the same hardware resources, this sets the + * shared memory carveout preference, in percent of the total shared memory. See + * ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR This is only a + * hint, and the driver can choose a different ratio if required to execute the + * function. * * \param hfunc - Function to query attribute of * \param attrib - Attribute requested @@ -16935,170 +16197,6 @@ CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams( CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams( CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); -/** - * \brief Creates a batch memory operation node and adds it to a graph - * - * Creates a new batch memory operation node and adds it to \p hGraph with \p - * numDependencies dependencies specified via \p dependencies and arguments - * specified in \p nodeParams. It is possible for \p numDependencies to be 0, in - * which case the node will be placed at the root of the graph. \p dependencies - * may not have any duplicate entries. A handle to the new node will be returned - * in \p phGraphNode. - * - * When the node is added, the paramArray inside \p nodeParams is copied and - * therefore it can be freed after the call returns. - * - * These nodes may not be used in loops or conditionals. - * - * \note - * Warning: - * Improper use of this API may deadlock the application. Synchronization - * ordering established through this API is not visible to CUDA. CUDA tasks - * that are (even indirectly) ordered by this API should also have that order - * expressed with CUDA-visible dependencies such as events. This ensures that - * the scheduler does not serialize them in an improper order. For more - * information, see the Stream Memory Operations section in the programming - * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). 
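The opt-in flow for CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES described above can be summarized with this sketch; the helper name is hypothetical and the kernel handle is assumed to come from cuModuleGetFunction().

    #include <cuda.h>

    // Hypothetical helper: opt a kernel into a larger dynamic shared memory size,
    // clamped to the device's opt-in limit.
    static CUresult RequestDynamicSmem(CUfunction hfunc, CUdevice dev, int wantedBytes) {
      int maxOptIn = 0;
      CUresult rc = cuDeviceGetAttribute(
          &maxOptIn, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
      if (rc != CUDA_SUCCESS) return rc;
      if (wantedBytes > maxOptIn) wantedBytes = maxOptIn;
      return cuFuncSetAttribute(
          hfunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, wantedBytes);
    }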
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuStreamBatchMemOp_v2, - * ::cuStreamWaitValue32_v2, - * ::cuStreamWriteValue32_v2, - * ::cuStreamWaitValue64_v2, - * ::cuStreamWriteValue64_v2, - * ::cuGraphBatchMemOpNodeGetParams, - * ::cuGraphBatchMemOpNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddBatchMemOpNode( - CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, - size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); - -/** - * \brief Returns a batch mem op node's parameters - * - * Returns the parameters of batch mem op node \p hNode in \p nodeParams_out. - * The \p paramArray returned in \p nodeParams_out is owned by the node. - * This memory remains valid until the node is destroyed or its - * parameters are modified, and should not be modified - * directly. Use ::cuGraphBatchMemOpNodeSetParams to update the - * parameters of this node. - * - * \param hNode - Node to get the parameters for - * \param nodeParams_out - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuStreamBatchMemOp_v2, - * ::cuGraphAddBatchMemOpNode, - * ::cuGraphBatchMemOpNodeSetParams - */ -CUresult CUDAAPI cuGraphBatchMemOpNodeGetParams( - CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out); - -/** - * \brief Sets a batch mem op node's parameters - * - * Sets the parameters of batch mem op node \p hNode to \p nodeParams. - * - * The paramArray inside \p nodeParams is copied and therefore it can be - * freed after the call returns. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuStreamBatchMemOp_v2, - * ::cuGraphAddBatchMemOpNode, - * ::cuGraphBatchMemOpNodeGetParams - */ -CUresult CUDAAPI cuGraphBatchMemOpNodeSetParams( - CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); - -/** - * \brief Sets the parameters for a batch mem op node in the given graphExec - * - * Sets the parameters of a batch mem op node in an executable graph \p - * hGraphExec. The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * The following fields on operations may be modified on an executable graph: - * - * op.waitValue.address - * op.waitValue.value[64] - * op.waitValue.flags bits corresponding to wait type (i.e. 
- * CU_STREAM_WAIT_VALUE_FLUSH bit cannot be modified) op.writeValue.address - * op.writeValue.value[64] - * - * Other fields, such as the context, count or type of operations, and other - * types of operations such as membars, may not be modified. - * - * \p hNode must not have been removed from the original graph. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * The paramArray inside \p nodeParams is copied and therefore it can be - * freed after the call returns. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Batch mem op node from the graph from which graphExec was - * instantiated \param nodeParams - Updated Parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuStreamBatchMemOp_v2, - * ::cuGraphAddBatchMemOpNode, - * ::cuGraphBatchMemOpNodeGetParams, - * ::cuGraphBatchMemOpNodeSetParams, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecBatchMemOpNodeSetParams( - CUgraphExec hGraphExec, CUgraphNode hNode, - const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); - /** * \brief Creates an allocation node and adds it to a graph * @@ -17116,8 +16214,8 @@ CUresult CUDAAPI cuGraphExecBatchMemOpNodeSetParams( * \param nodeParams - Parameters for the node * * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the - * address of the allocation in \p nodeParams.dptr. The allocation's address - * remains fixed across instantiations and launches. + * address of the allocation in \param nodeParams.dptr. The allocation's + * address remains fixed across instantiations and launches. * * If the allocation is freed in the same graph, by creating a free node using * ::cuGraphAddMemFreeNode, the allocation can be accessed by nodes ordered @@ -17311,9 +16409,7 @@ CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, * * \sa * ::cuGraphAddMemAllocNode, - * ::cuGraphAddMemFreeNode, - * ::cuDeviceSetGraphMemAttribute, - * ::cuDeviceGetGraphMemAttribute + * ::cuGraphAddMemFreeNode */ CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device); @@ -17341,7 +16437,6 @@ CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device); * ::CUDA_ERROR_INVALID_DEVICE * * \sa - * ::cuDeviceSetGraphMemAttribute, * ::cuGraphAddMemAllocNode, * ::cuGraphAddMemFreeNode */ @@ -17369,7 +16464,6 @@ CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, * ::CUDA_ERROR_INVALID_DEVICE * * \sa - * ::cuDeviceGetGraphMemAttribute, * ::cuGraphAddMemAllocNode, * ::cuGraphAddMemFreeNode */ @@ -17780,26 +16874,17 @@ CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a * graph containing memory allocation nodes to automatically free any * unfreed memory allocations before the graph is relaunched. - - * - * - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph - * to use the priorities from the per-node attributes rather than the priority - * of the launch stream during execution. Note that priorities are only - available - * on kernel nodes, and are copied from stream priority during stream capture. - * * If \p hGraph contains any allocation or free nodes, there can be at most one * executable graph in existence for that graph at a time. 
* * An attempt to instantiate a second executable graph before destroying the - first - * with ::cuGraphExecDestroy will result in an error. + * first with ::cuGraphExecDestroy will result in an error. * * \param phGraphExec - Returns instantiated graph * \param hGraph - Graph to instantiate * \param flags - Flags to control instantiation. See - ::CUgraphInstantiate_flags. + * ::CUgraphInstantiate_flags. * * \return * ::CUDA_SUCCESS, @@ -17827,13 +16912,9 @@ CUresult CUDAAPI cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, * The node is identified by the corresponding node \p hNode in the * non-executable graph, from which the executable graph was instantiated. * - * \p hNode must not have been removed from the original graph. All \p - * nodeParams fields may change, but the following restrictions apply to \p func - * updates: - * - * - The owning context of the function cannot change. - * - A node whose function originally did not use CUDA dynamic parallelism - * cannot be updated to a function which uses CDP + * \p hNode must not have been removed from the original graph. The \p func + * field of \p nodeParams cannot be modified and must match the original value. + * All other values can be modified. * * The modifications only affect future launches of \p hGraphExec. Already * enqueued or running launches of \p hGraphExec are not affected by this call. @@ -18230,78 +17311,6 @@ CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams( CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); -/** - * \brief Enables or disables the specified node in the given graphExec - * - * Sets \p hNode to be either enabled or disabled. Disabled nodes are - * functionally equivalent to empty nodes until they are reenabled. Existing - * node parameters are not affected by disabling/enabling the node. - * - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * \note Currently only kernel, memset and memcpy nodes are supported. - * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Node from the graph from which graphExec was instantiated - * \param isEnabled - Node is enabled if != 0, otherwise the node is disabled - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphNodeGetEnabled, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - * ::cuGraphLaunch - */ - -CUresult CUDAAPI cuGraphNodeSetEnabled(CUgraphExec hGraphExec, - CUgraphNode hNode, - unsigned int isEnabled); - -/** - * \brief Query whether a node in the given graphExec is enabled - * - * Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled. - * - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. - * - * \note Currently only kernel, memset and memcpy nodes are supported. 
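For the auto-free instantiation flag mentioned above, a minimal sketch might look like the following; the helper name is hypothetical, the flag and prototype are taken from the surrounding documentation, and error handling is reduced to returning the status.

    #include <cuda.h>

    // Hypothetical helper: instantiate a graph containing allocation nodes so that
    // unfreed allocations are released automatically before each relaunch.
    static CUresult InstantiateAutoFree(CUgraphExec *phExec, CUgraph hGraph) {
      return cuGraphInstantiateWithFlags(
          phExec, hGraph, CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH);
    }
    // Launch with cuGraphLaunch(*phExec, stream); destroy with cuGraphExecDestroy(*phExec).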
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Node from the graph from which graphExec was instantiated - * \param isEnabled - Location to return the enabled status of the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphNodeSetEnabled, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - * ::cuGraphLaunch - */ - -CUresult CUDAAPI cuGraphNodeGetEnabled(CUgraphExec hGraphExec, - CUgraphNode hNode, - unsigned int *isEnabled); - /** * \brief Uploads an executable graph in a stream * @@ -18419,13 +17428,7 @@ CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); * - Kernel nodes: * - The owning context of the function cannot change. * - A node whose function originally did not use CUDA dynamic parallelism - * cannot be updated to a function which uses CDP. - * - A cooperative node cannot be updated to a non-cooperative node, and - * vice-versa. - * - If the graph was instantiated with - * CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, the priority attribute cannot - * change. Equality is checked on the originally requested priority values, - * before they are clamped to the device's supported range. + * cannot be updated to a function which uses CDP * - Memset and memcpy nodes: * - The CUDA device(s) to which the operand(s) was allocated/mapped cannot * change. @@ -18464,9 +17467,6 @@ CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node * changed in a way that is not supported, in which case \p hErrorNode_out is * set to the node from \p hGraph. - * - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node - * changed in a way that is not supported, in which case \p hErrorNode_out is - * set to the node from \p hGraph. * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is * unsupported, like the node's type or configuration, in which case \p * hErrorNode_out is set to the node from \p hGraph @@ -18497,7 +17497,6 @@ CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); * \sa * ::cuGraphInstantiate, */ - CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out); @@ -20117,21 +19116,6 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, * filtering optimizations. Trilinear optimizations improve texture filtering * performance by allowing bilinear filtering on textures in scenarios where * it can closely approximate the expected results. - * - ::CU_TRSF_SEAMLESS_CUBEMAP, which enables seamless cube map filtering. - * This flag can only be specified if the underlying resource is a CUDA array - * or a CUDA mipmapped array that was created with the flag - ::CUDA_ARRAY3D_CUBEMAP. - * When seamless cube map filtering is enabled, texture address modes - specified - * by ::CUDA_TEXTURE_DESC::addressMode are ignored. Instead, if the - ::CUDA_TEXTURE_DESC::filterMode - * is set to ::CU_TR_FILTER_MODE_POINT the address mode - ::CU_TR_ADDRESS_MODE_CLAMP - * will be applied for all dimensions. If the ::CUDA_TEXTURE_DESC::filterMode - is - * set to ::CU_TR_FILTER_MODE_LINEAR seamless cube map filtering will be - performed - * when sampling along the cube face borders. * * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. 
This value will be @@ -20945,74 +19929,6 @@ CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, /** @} */ /* END CUDA_DRIVER_ENTRY_POINT */ -/** - * CUDA Lazy Loading status - */ -typedef enum CUmoduleLoadingMode_enum { - CU_MODULE_EAGER_LOADING = 0x1, /**< Lazy Kernel Loading is not enabled */ - CU_MODULE_LAZY_LOADING = 0x2, /**< Lazy Kernel Loading is enabled */ -} CUmoduleLoadingMode; - -/** - * \brief Query lazy loading mode - * - * Returns lazy loading mode - * Module loading mode is controlled by CUDA_MODULE_LOADING env variable - * - * \param mode - Returns the lazy loading mode - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa - * ::cuModuleLoad, - */ -CUresult CUDAAPI cuModuleGetLoadingMode(CUmoduleLoadingMode *mode); - -/* - * @brief Retrieve handle for an address range - * - * Get a handle of the specified type to an address range. The address range - * must have been obtained by a prior call to either ::cuMemAlloc or - * ::cuMemAddressReserve. If the address range was obtained via - * ::cuMemAddressReserve, it must also be fully mapped via ::cuMemMap. - * - * Users must ensure the \p dptr and \p size are aligned to the host page size. - * - * When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, - * users are expected to query for dma_buf support for the platform - * by using ::CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED device attribute before - * calling this API. The \p handle will be interpreted as a pointer to an - * integer to store the dma_buf file descriptor. Users must ensure the entire - * address range is backed and mapped when the address range is allocated by - * ::cuMemAddressReserve. All the physical allocations backing the address range - * must be resident on the same device and have identical allocation properties. - * Users are also expected to retrieve a new handle every time the underlying - * physical allocation(s) corresponding to a previously queried VA range are - * changed. - * - * @param[out] handle - Pointer to the location where the returned handle - * will be stored. - * @param[in] dptr - Pointer to a valid CUDA device allocation. Must be - * aligned to host page size. - * @param[in] size - Length of the address range. Must be aligned to host - * page size. 
- * @param[in] handleType - Type of handle requested (defines type and size of - * the \p handle output parameter) - * @param[in] flags - Reserved, must be zero - * - * @returns - * CUDA_SUCCESS - * CUDA_ERROR_INVALID_VALUE - * CUDA_ERROR_NOT_SUPPORTED - */ -CUresult CUDAAPI cuMemGetHandleForAddressRange(void *handle, CUdeviceptr dptr, - size_t size, - CUmemRangeHandleType handleType, - unsigned long long flags); - CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); @@ -21095,7 +20011,6 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, #undef cuEventRecord #undef cuEventRecordWithFlags #undef cuLaunchKernel - #undef cuLaunchHostFunc #undef cuGraphicsMapResources #undef cuGraphicsUnmapResources @@ -21104,11 +20019,6 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, #undef cuStreamWriteValue64 #undef cuStreamWaitValue64 #undef cuStreamBatchMemOp -#undef cuStreamWriteValue32_v2 -#undef cuStreamWaitValue32_v2 -#undef cuStreamWriteValue64_v2 -#undef cuStreamWaitValue64_v2 -#undef cuStreamBatchMemOp_v2 #undef cuMemPrefetchAsync #undef cuLaunchCooperativeKernel #undef cuSignalExternalSemaphoresAsync @@ -21118,7 +20028,6 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, #undef cuStreamIsCapturing #undef cuStreamGetCaptureInfo #undef cuStreamGetCaptureInfo_v2 - #undef cuGraphUpload #undef cuGraphLaunch #undef cuDevicePrimaryCtxRelease @@ -21422,7 +20331,6 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); - CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, @@ -21442,19 +20350,6 @@ CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); - -CUresult CUDAAPI cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, - cuuint32_t value, unsigned int flags); -CUresult CUDAAPI cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, - cuuint32_t value, unsigned int flags); -CUresult CUDAAPI cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, - cuuint64_t value, unsigned int flags); -CUresult CUDAAPI cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, - cuuint64_t value, unsigned int flags); -CUresult CUDAAPI cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, - CUstreamBatchMemOpParams *paramArray, - unsigned int flags); - CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); CUresult CUDAAPI cuLaunchCooperativeKernel( @@ -21484,7 +20379,6 @@ CUresult CUDAAPI cuStreamGetCaptureInfo_v2( CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); - CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream); CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream); @@ -21512,7 +20406,6 @@ CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, size_t numDependencies, unsigned int flags); #elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) - static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags) { @@ 
-21524,7 +20417,6 @@ static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr,
     return cuGetProcAddress(symbol, funcPtr, driverVersion, flags);
 }
 #define cuGetProcAddress cuGetProcAddress_ptsz
-
 #endif
 
 #ifdef __cplusplus
diff --git a/thirdparty/nvcodec/Interface/nvcodec_api.h b/thirdparty/nvcodec/Interface/nvcodec_api.h
deleted file mode 100644
index 5d3e47c..0000000
--- a/thirdparty/nvcodec/Interface/nvcodec_api.h
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * @Author: DI JUNKUN
- * @Date: 2024-08-12
- * Copyright (c) 2024 by DI JUNKUN, All Rights Reserved.
- */
-
-#ifndef _NVCODEC_API_H_
-#define _NVCODEC_API_H_
-
-#include <windows.h>
-
-#include <iostream>
-
-#include "cuda.h"
-#include "cuviddec.h"
-#include "nvEncodeAPI.h"
-#include "nvcuvid.h"
-
-typedef CUresult (*TcuInit)(unsigned int Flags);
-
-typedef CUresult (*TcuDeviceGet)(CUdevice *device, int ordinal);
-
-typedef CUresult (*TcuDeviceGetCount)(int *count);
-
-typedef CUresult (*TcuCtxCreate)(CUcontext *pctx, unsigned int flags,
-                                 CUdevice dev);
-
-typedef CUresult (*TcuGetErrorName)(CUresult error, const char **pStr);
-
-typedef CUresult (*TcuCtxPushCurrent)(CUcontext ctx);
-
-typedef CUresult (*TcuCtxPopCurrent)(CUcontext *pctx);
-
-typedef CUresult (*TcuMemAlloc)(CUdeviceptr *dptr, size_t bytesize);
-
-typedef CUresult (*TcuMemAllocPitch)(CUdeviceptr *dptr, size_t *pPitch,
-                                     size_t WidthInBytes, size_t Height,
-                                     unsigned int ElementSizeBytes);
-
-typedef CUresult (*TcuMemFree)(CUdeviceptr dptr);
-
-typedef CUresult (*TcuMemcpy2DAsync)(const CUDA_MEMCPY2D *pCopy,
-                                     CUstream hStream);
-
-typedef CUresult (*TcuStreamSynchronize)(CUstream hStream);
-
-typedef CUresult (*TcuMemcpy2D)(const CUDA_MEMCPY2D *pCopy);
-
-typedef CUresult (*TcuMemcpy2DUnaligned)(const CUDA_MEMCPY2D *pCopy);
-
-// API
-static TcuInit cuInit_ld;
-static TcuDeviceGet cuDeviceGet_ld;
-static TcuDeviceGetCount cuDeviceGetCount_ld;
-static TcuCtxCreate cuCtxCreate_ld;
-static TcuGetErrorName cuGetErrorName_ld;
-static TcuCtxPushCurrent cuCtxPushCurrent_ld;
-static TcuCtxPopCurrent cuCtxPopCurrent_ld;
-static TcuMemAlloc cuMemAlloc_ld;
-static TcuMemAllocPitch cuMemAllocPitch_ld;
-static TcuMemFree cuMemFree_ld;
-static TcuMemcpy2DAsync cuMemcpy2DAsync_ld;
-static TcuStreamSynchronize cuStreamSynchronize_ld;
-static TcuMemcpy2D cuMemcpy2D_ld;
-static TcuMemcpy2DUnaligned cuMemcpy2DUnaligned_ld;
-
-//
-typedef CUresult (*TcuvidCtxLockCreate)(CUvideoctxlock *pLock, CUcontext ctx);
-typedef CUresult (*TcuvidGetDecoderCaps)(CUVIDDECODECAPS *pdc);
-typedef CUresult (*TcuvidCreateDecoder)(CUvideodecoder *phDecoder,
-                                        CUVIDDECODECREATEINFO *pdci);
-typedef CUresult (*TcuvidDestroyDecoder)(CUvideodecoder hDecoder);
-typedef CUresult (*TcuvidDecodePicture)(CUvideodecoder hDecoder,
-                                        CUVIDPICPARAMS *pPicParams);
-typedef CUresult (*TcuvidGetDecodeStatus)(CUvideodecoder hDecoder, int nPicIdx,
-                                          CUVIDGETDECODESTATUS *pDecodeStatus);
-typedef CUresult (*TcuvidReconfigureDecoder)(
-    CUvideodecoder hDecoder, CUVIDRECONFIGUREDECODERINFO *pDecReconfigParams);
-typedef CUresult (*TcuvidMapVideoFrame64)(CUvideodecoder hDecoder, int nPicIdx,
-                                          unsigned long long *pDevPtr,
-                                          unsigned int *pPitch,
-                                          CUVIDPROCPARAMS *pVPP);
-typedef CUresult (*TcuvidUnmapVideoFrame64)(CUvideodecoder hDecoder,
-                                            unsigned long long DevPtr);
-typedef CUresult (*TcuvidCtxLockDestroy)(CUvideoctxlock lck);
-typedef CUresult (*TcuvidCreateVideoParser)(CUvideoparser *pObj,
-                                            CUVIDPARSERPARAMS *pParams);
-typedef CUresult (*TcuvidParseVideoData)(CUvideoparser obj,
-                                         CUVIDSOURCEDATAPACKET *pPacket);
-typedef CUresult (*TcuvidDestroyVideoParser)(CUvideoparser obj);
-
-//
-static TcuvidCtxLockCreate cuvidCtxLockCreate_ld;
-static TcuvidGetDecoderCaps cuvidGetDecoderCaps_ld;
-static TcuvidCreateDecoder cuvidCreateDecoder_ld;
-static TcuvidDestroyDecoder cuvidDestroyDecoder_ld;
-static TcuvidDecodePicture cuvidDecodePicture_ld;
-static TcuvidGetDecodeStatus cuvidGetDecodeStatus_ld;
-static TcuvidReconfigureDecoder cuvidReconfigureDecoder_ld;
-static TcuvidMapVideoFrame64 cuvidMapVideoFrame64_ld;
-static TcuvidUnmapVideoFrame64 cuvidUnmapVideoFrame64_ld;
-static TcuvidCtxLockDestroy cuvidCtxLockDestroy_ld;
-static TcuvidCreateVideoParser cuvidCreateVideoParser_ld;
-static TcuvidParseVideoData cuvidParseVideoData_ld;
-static TcuvidDestroyVideoParser cuvidDestroyVideoParser_ld;
-
-//
-typedef NVENCSTATUS (*TNvEncodeAPICreateInstance)(
-    NV_ENCODE_API_FUNCTION_LIST *functionList);
-typedef NVENCSTATUS (*TNvEncodeAPIGetMaxSupportedVersion)(uint32_t *version);
-
-//
-static TNvEncodeAPICreateInstance NvEncodeAPICreateInstance_ld;
-static TNvEncodeAPIGetMaxSupportedVersion NvEncodeAPIGetMaxSupportedVersion_ld;
-
-static int InitNvCodecApi() {
-  // Load library
-  HMODULE nvcuda_dll = LoadLibrary(TEXT("nvcuda.dll"));
-  if (nvcuda_dll == NULL) {
-    std::cerr << "Unable to load nvcuda.dll!" << std::endl;
-    return -1;
-  }
-
-  cuInit_ld = (TcuInit)GetProcAddress(nvcuda_dll, "cuInit");
-  if (cuInit_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuDeviceGet_ld = (TcuDeviceGet)GetProcAddress(nvcuda_dll, "cuDeviceGet");
-  if (cuDeviceGet_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuDeviceGetCount_ld =
-      (TcuDeviceGetCount)GetProcAddress(nvcuda_dll, "cuDeviceGetCount");
-  if (cuDeviceGetCount_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuCtxCreate_ld = (TcuCtxCreate)GetProcAddress(nvcuda_dll, "cuCtxCreate");
-  if (cuCtxCreate_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuGetErrorName_ld =
-      (TcuGetErrorName)GetProcAddress(nvcuda_dll, "cuGetErrorName");
-  if (cuGetErrorName_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuCtxPushCurrent_ld =
-      (TcuCtxPushCurrent)GetProcAddress(nvcuda_dll, "cuCtxPushCurrent");
-  if (cuCtxPushCurrent_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuCtxPopCurrent_ld =
-      (TcuCtxPopCurrent)GetProcAddress(nvcuda_dll, "cuCtxPopCurrent");
-  if (cuCtxPopCurrent_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-  cuMemAlloc_ld = (TcuMemAlloc)GetProcAddress(nvcuda_dll, "cuMemAlloc");
-  if (cuMemAlloc_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuMemAllocPitch_ld =
-      (TcuMemAllocPitch)GetProcAddress(nvcuda_dll, "cuMemAllocPitch");
-  if (cuMemAllocPitch_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuMemFree_ld = (TcuMemFree)GetProcAddress(nvcuda_dll, "cuMemFree");
-  if (cuMemFree_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuMemcpy2DAsync_ld =
-      (TcuMemcpy2DAsync)GetProcAddress(nvcuda_dll, "cuMemcpy2DAsync");
-  if (cuMemcpy2DAsync_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuStreamSynchronize_ld =
-      (TcuStreamSynchronize)GetProcAddress(nvcuda_dll, "cuStreamSynchronize");
-  if (cuStreamSynchronize_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuMemcpy2D_ld = (TcuMemcpy2D)GetProcAddress(nvcuda_dll, "cuMemcpy2D");
-  if (cuMemcpy2D_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuMemcpy2DUnaligned_ld =
-      (TcuMemcpy2DUnaligned)GetProcAddress(nvcuda_dll, "cuMemcpy2DUnaligned");
-  if (cuMemcpy2DUnaligned_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  //
-  HMODULE nvcuvid_dll = LoadLibrary(TEXT("nvcuvid.dll"));
-  if (nvcuvid_dll == NULL) {
-    std::cerr << "Unable to load nvcuvid.dll!" << std::endl;
-    return -1;
-  }
-
-  cuvidCtxLockCreate_ld =
-      (TcuvidCtxLockCreate)GetProcAddress(nvcuda_dll, "cuvidCtxLockCreate");
-  if (cuvidCtxLockCreate_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidGetDecoderCaps_ld =
-      (TcuvidGetDecoderCaps)GetProcAddress(nvcuda_dll, "cuvidGetDecoderCaps");
-  if (cuvidGetDecoderCaps_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidCreateDecoder_ld =
-      (TcuvidCreateDecoder)GetProcAddress(nvcuda_dll, "cuvidCreateDecoder");
-  if (cuvidCreateDecoder_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidDestroyDecoder_ld =
-      (TcuvidDestroyDecoder)GetProcAddress(nvcuda_dll, "cuvidDestroyDecoder");
-  if (cuvidDestroyDecoder_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidDecodePicture_ld =
-      (TcuvidDecodePicture)GetProcAddress(nvcuda_dll, "cuvidDecodePicture");
-  if (cuvidDecodePicture_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidGetDecodeStatus_ld =
-      (TcuvidGetDecodeStatus)GetProcAddress(nvcuda_dll, "cuvidGetDecodeStatus");
-  if (cuvidGetDecodeStatus_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidReconfigureDecoder_ld = (TcuvidReconfigureDecoder)GetProcAddress(
-      nvcuda_dll, "cuvidReconfigureDecoder");
-  if (cuvidReconfigureDecoder_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidMapVideoFrame64_ld =
-      (TcuvidMapVideoFrame64)GetProcAddress(nvcuda_dll, "cuvidMapVideoFrame64");
-  if (cuvidMapVideoFrame64_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidUnmapVideoFrame64_ld = (TcuvidUnmapVideoFrame64)GetProcAddress(
-      nvcuda_dll, "cuvidUnmapVideoFrame64");
-  if (cuvidUnmapVideoFrame64_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidCtxLockDestroy_ld =
-      (TcuvidCtxLockDestroy)GetProcAddress(nvcuda_dll, "cuvidCtxLockDestroy");
-  if (cuvidCtxLockDestroy_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidCreateVideoParser_ld = (TcuvidCreateVideoParser)GetProcAddress(
-      nvcuda_dll, "cuvidCreateVideoParser");
-  if (cuvidCreateVideoParser_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidParseVideoData_ld =
-      (TcuvidParseVideoData)GetProcAddress(nvcuda_dll, "cuvidParseVideoData");
-  if (cuvidParseVideoData_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  cuvidDestroyVideoParser_ld = (TcuvidDestroyVideoParser)GetProcAddress(
-      nvcuda_dll, "cuvidDestroyVideoParser");
-  if (cuvidDestroyVideoParser_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  //
-  HMODULE nvEncodeAPI64_dll = LoadLibrary(TEXT("nvEncodeAPI64.dll"));
-  if (nvEncodeAPI64_dll == NULL) {
-    std::cerr << "Unable to load nvEncodeAPI64.dll!" << std::endl;
-    return -1;
-  }
-
-  NvEncodeAPICreateInstance_ld = (TNvEncodeAPICreateInstance)GetProcAddress(
-      nvcuda_dll, "NvEncodeAPICreateInstance");
-  if (NvEncodeAPICreateInstance_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  NvEncodeAPIGetMaxSupportedVersion_ld =
-      (TNvEncodeAPIGetMaxSupportedVersion)GetProcAddress(
-          nvcuda_dll, "NvEncodeAPIGetMaxSupportedVersion");
-  if (NvEncodeAPIGetMaxSupportedVersion_ld == NULL) {
-    std::cerr << "Unable to find function!" << std::endl;
-    FreeLibrary(nvcuda_dll);
-    return -1;
-  }
-
-  return 0;
-}
-
-#endif
\ No newline at end of file
diff --git a/xmake.lua b/xmake.lua
index db86dca..bd0cb75 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -21,9 +21,7 @@ add_packages("vcpkg::libnice")
 includes("thirdparty")
 
 if is_os("windows") then
-    add_packages("cuda")
     add_defines("_WEBSOCKETPP_CPP11_INTERNAL_")
-    add_requires("cuda")
 elseif is_os("linux") then
     add_requires("glib", {system = true})
     add_packages("glib", "cuda")
@@ -113,7 +111,8 @@ target("media")
              "src/media/video/encode/openh264/*.cpp",
              "src/media/video/decode/openh264/*.cpp",
              "src/media/video/encode/aom/*.cpp",
-             "src/media/video/decode/dav1d/*.cpp")
+             "src/media/video/decode/dav1d/*.cpp",
+             "src/media/nvcodec/*.cpp")
     add_includedirs("src/media/video/encode", "src/media/video/decode",
                     "src/media/video/encode/nvcodec",
                     "src/media/video/decode/nvcodec",
@@ -122,6 +121,7 @@
                     "src/media/video/decode/openh264",
                     "src/media/video/encode/aom",
                     "src/media/video/decode/dav1d",
+                    "src/media/nvcodec",
                     "thirdparty/nvcodec/Interface",
                     "thirdparty/nvcodec/Samples", {public = true})
 elseif is_os(("linux")) then
@@ -132,7 +132,8 @@ target("media")
              "src/media/video/encode/openh264/*.cpp",
              "src/media/video/decode/openh264/*.cpp",
              "src/media/video/encode/aom/*.cpp",
-             "src/media/video/decode/dav1d/*.cpp")
+             "src/media/video/decode/dav1d/*.cpp",
+             "src/media/nvcodec/*.cpp")
     add_includedirs("src/media/video/encode", "src/media/video/decode",
                     "src/media/video/encode/nvcodec",
                     "src/media/video/decode/nvcodec",
@@ -141,6 +142,7 @@
                     "src/media/video/decode/openh264",
                     "src/media/video/encode/aom",
                     "src/media/video/decode/dav1d",
+                    "src/media/nvcodec",
                     "thirdparty/nvcodec/Interface",
                     "thirdparty/nvcodec/Samples", {public = true})
 elseif is_os("macosx") then