/* * Copyright 2017-2020 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ #include #include "NvCodecUtils.h" template static __global__ void Resize(cudaTextureObject_t texY, cudaTextureObject_t texUv, uint8_t *pDst, uint8_t *pDstUV, int nPitch, int nWidth, int nHeight, float fxScale, float fyScale) { int ix = blockIdx.x * blockDim.x + threadIdx.x, iy = blockIdx.y * blockDim.y + threadIdx.y; if (ix >= nWidth / 2 || iy >= nHeight / 2) { return; } int x = ix * 2, y = iy * 2; typedef decltype(YuvUnitx2::x) YuvUnit; const int MAX = (1 << (sizeof(YuvUnit) * 8)) - 1; *(YuvUnitx2 *)(pDst + y * nPitch + x * sizeof(YuvUnit)) = YuvUnitx2 { (YuvUnit)(tex2D(texY, x / fxScale, y / fyScale) * MAX), (YuvUnit)(tex2D(texY, (x + 1) / fxScale, y / fyScale) * MAX) }; y++; *(YuvUnitx2 *)(pDst + y * nPitch + x * sizeof(YuvUnit)) = YuvUnitx2 { (YuvUnit)(tex2D(texY, x / fxScale, y / fyScale) * MAX), (YuvUnit)(tex2D(texY, (x + 1) / fxScale, y / fyScale) * MAX) }; float2 uv = tex2D(texUv, ix / fxScale, (nHeight + iy) / fyScale + 0.5f); *(YuvUnitx2 *)(pDstUV + iy * nPitch + ix * 2 * sizeof(YuvUnit)) = YuvUnitx2{ (YuvUnit)(uv.x * MAX), (YuvUnit)(uv.y * MAX) }; } template static void Resize(unsigned char *dpDst, unsigned char* dpDstUV, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight) { cudaResourceDesc resDesc = {}; resDesc.resType = cudaResourceTypePitch2D; resDesc.res.pitch2D.devPtr = dpSrc; resDesc.res.pitch2D.desc = cudaCreateChannelDesc(); resDesc.res.pitch2D.width = nSrcWidth; resDesc.res.pitch2D.height = nSrcHeight; resDesc.res.pitch2D.pitchInBytes = nSrcPitch; cudaTextureDesc texDesc = {}; texDesc.filterMode = cudaFilterModeLinear; texDesc.readMode = cudaReadModeNormalizedFloat; cudaTextureObject_t texY=0; ck(cudaCreateTextureObject(&texY, &resDesc, &texDesc, NULL)); resDesc.res.pitch2D.desc = cudaCreateChannelDesc(); resDesc.res.pitch2D.width = nSrcWidth / 2; resDesc.res.pitch2D.height = nSrcHeight * 3 / 2; cudaTextureObject_t texUv=0; ck(cudaCreateTextureObject(&texUv, &resDesc, &texDesc, NULL)); Resize << > >(texY, texUv, dpDst, dpDstUV, nDstPitch, nDstWidth, nDstHeight, 1.0f * nDstWidth / nSrcWidth, 1.0f * nDstHeight / nSrcHeight); ck(cudaDestroyTextureObject(texY)); ck(cudaDestroyTextureObject(texUv)); } void ResizeNv12(unsigned char *dpDstNv12, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcNv12, int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char* dpDstNv12UV) { unsigned char* dpDstUV = dpDstNv12UV ? dpDstNv12UV : dpDstNv12 + (nDstPitch*nDstHeight); return Resize(dpDstNv12, dpDstUV, nDstPitch, nDstWidth, nDstHeight, dpSrcNv12, nSrcPitch, nSrcWidth, nSrcHeight); } void ResizeP016(unsigned char *dpDstP016, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcP016, int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char* dpDstP016UV) { unsigned char* dpDstUV = dpDstP016UV ? dpDstP016UV : dpDstP016 + (nDstPitch*nDstHeight); return Resize(dpDstP016, dpDstUV, nDstPitch, nDstWidth, nDstHeight, dpSrcP016, nSrcPitch, nSrcWidth, nSrcHeight); } static __global__ void Scale(cudaTextureObject_t texSrc, uint8_t *pDst, int nPitch, int nWidth, int nHeight, float fxScale, float fyScale) { int x = blockIdx.x * blockDim.x + threadIdx.x, y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= nWidth || y >= nHeight) { return; } *(unsigned char*)(pDst + (y * nPitch) + x) = (unsigned char)(fminf((tex2D(texSrc, x * fxScale, y * fyScale)) * 255.0f, 255.0f)); } static __global__ void Scale_uv(cudaTextureObject_t texSrc, uint8_t *pDst, int nPitch, int nWidth, int nHeight, float fxScale, float fyScale) { int x = blockIdx.x * blockDim.x + threadIdx.x, y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= nWidth || y >= nHeight) { return; } float2 uv = tex2D(texSrc, x * fxScale, y * fyScale); uchar2 uvOut = uchar2{ (unsigned char)(fminf(uv.x * 255.0f, 255.0f)), (unsigned char)(fminf(uv.y * 255.0f, 255.0f)) }; *(uchar2*)(pDst + (y * nPitch) + 2 * x) = uvOut; } void ScaleKernelLaunch(unsigned char *dpDst, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight, bool bUVPlane = false) { cudaResourceDesc resDesc = {}; resDesc.resType = cudaResourceTypePitch2D; resDesc.res.pitch2D.devPtr = dpSrc; resDesc.res.pitch2D.desc = bUVPlane ? cudaCreateChannelDesc() : cudaCreateChannelDesc(); resDesc.res.pitch2D.width = nSrcWidth; resDesc.res.pitch2D.height = nSrcHeight; resDesc.res.pitch2D.pitchInBytes = nSrcPitch; cudaTextureDesc texDesc = {}; texDesc.filterMode = cudaFilterModeLinear; texDesc.readMode = cudaReadModeNormalizedFloat; texDesc.addressMode[0] = cudaAddressModeClamp; texDesc.addressMode[1] = cudaAddressModeClamp; texDesc.addressMode[2] = cudaAddressModeClamp; cudaTextureObject_t texSrc = 0; ck(cudaCreateTextureObject(&texSrc, &resDesc, &texDesc, NULL)); dim3 blockSize(16, 16, 1); dim3 gridSize(((uint32_t)nDstWidth + blockSize.x - 1) / blockSize.x, ((uint32_t)nDstHeight + blockSize.y - 1) / blockSize.y, 1); if (bUVPlane) { Scale_uv << > >(texSrc, dpDst, nDstPitch, nDstWidth, nDstHeight, 1.0f * nSrcWidth / nDstWidth, 1.0f * nSrcHeight / nDstHeight); } else { Scale << > >(texSrc, dpDst, nDstPitch, nDstWidth, nDstHeight, 1.0f * nSrcWidth / nDstWidth, 1.0f * nSrcHeight / nDstHeight); } ck(cudaGetLastError()); ck(cudaDestroyTextureObject(texSrc)); } void ScaleYUV420(unsigned char *dpDstY, unsigned char* dpDstU, unsigned char* dpDstV, int nDstPitch, int nDstChromaPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcY, unsigned char* dpSrcU, unsigned char* dpSrcV, int nSrcPitch, int nSrcChromaPitch, int nSrcWidth, int nSrcHeight, bool bSemiplanar) { int chromaWidthDst = (nDstWidth + 1) / 2; int chromaHeightDst = (nDstHeight + 1) / 2; int chromaWidthSrc = (nSrcWidth + 1) / 2; int chromaHeightSrc = (nSrcHeight + 1) / 2; ScaleKernelLaunch(dpDstY, nDstPitch, nDstWidth, nDstHeight, dpSrcY, nSrcPitch, nSrcWidth, nSrcHeight); if (bSemiplanar) { ScaleKernelLaunch(dpDstU, nDstChromaPitch, chromaWidthDst, chromaHeightDst, dpSrcU, nSrcChromaPitch, chromaWidthSrc, chromaHeightSrc, true); } else { ScaleKernelLaunch(dpDstU, nDstChromaPitch, chromaWidthDst, chromaHeightDst, dpSrcU, nSrcChromaPitch, chromaWidthSrc, chromaHeightSrc); ScaleKernelLaunch(dpDstV, nDstChromaPitch, chromaWidthDst, chromaHeightDst, dpSrcV, nSrcChromaPitch, chromaWidthSrc, chromaHeightSrc); } }