Hi, I render different poses of a 3D model using OpenGL. Until now I have read the rendered images back into a cv::Mat using glReadPixels(), which is fast enough for small images.
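For reference, the readback path I have used so far boils down to the following (a simplified sketch; the GL_BGR format, the pack alignment and the vertical flip are the usual cv::Mat conventions rather than my exact code):

cv::Mat frame(winHeight, winWidth, CV_8UC3);
glPixelStorei(GL_PACK_ALIGNMENT, 1);                // rows are tightly packed in the Mat
glReadPixels(0, 0, winWidth, winHeight, GL_BGR, GL_UNSIGNED_BYTE, frame.data);
cv::flip(frame, frame, 0);                          // OpenGL origin is bottom-left, OpenCV's is top-left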
Now I want to render, read back and process (Sobel, pyramids, ...) bigger images. Since the rendering already takes place on the GPU, it seems obvious to use gpu::GpuMat and simply tell OpenCV where the rendered textures live in GPU memory. So I started reading up on CUDA kernels and the CUDA/OpenGL interop.
Here is my CUDA gateway function:
void PostprocessCUDA(GpuMat& nB, GpuMat& nG, GpuMat& nR, cudaGraphicsResource_t& srcNormal, unsigned int width, unsigned int height)
{
    cutilSafeCall(cudaGraphicsMapResources(1, &srcNormal));

    // Get the CUDA array backing the mapped OpenGL texture
    cudaArray* srcArrayNormal;
    cutilSafeCall(cudaGraphicsSubResourceGetMappedArray(&srcArrayNormal, srcNormal, 0, 0));

    // Bind the source array to the texture reference
    cutilSafeCall(cudaBindTextureToArray(Tex, srcArrayNormal));

    dim3 blDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grDim(divUp(nB.cols, blDim.x), divUp(nB.rows, blDim.y));
    PostprocessKernel<<<grDim, blDim>>>(nB.ptr(), nG.ptr(), nR.ptr(), nB.step, width, height);
    // PostprocessKernel<<<grDim, blDim>>>(nB.data, nG.data, nR.data, nB.step, width, height);
    //cudaDeviceSynchronize();

    cutilSafeCall(cudaUnbindTexture(Tex));
    cutilSafeCall(cudaGraphicsUnmapResources(1, &srcNormal));
}
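Tex, BLOCK_SIZE and divUp are defined at file scope and not shown above; they look roughly like this (the concrete BLOCK_SIZE value is just an example, and the uchar4 element type matches the tex2D read in the kernel below):

#define BLOCK_SIZE 16   // example value

// Texture reference the mapped OpenGL texture gets bound to (uchar4 texels)
texture<uchar4, cudaTextureType2D, cudaReadModeElementType> Tex;

// Integer ceiling division used for the grid dimensions
static int divUp(int total, int grain) { return (total + grain - 1) / grain; }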
and this is the CUDA kernel:
__global__ void PostprocessKernel(uchar* destNblue, uchar* destNgreen, uchar* destNred,
                                  int step, int width, int height)
{
    unsigned int tx = threadIdx.x;
    unsigned int ty = threadIdx.y;
    unsigned int bw = blockDim.x;
    unsigned int bh = blockDim.y;

    // Non-normalized U, V coordinates of the input texture for the current thread.
    unsigned int u = (bw * blockIdx.x) + tx;
    unsigned int v = (bh * blockIdx.y) + ty;

    if (u >= width || v >= height) return;

    uchar* rowsrcNPtr_blue  = destNblue  + v * step;
    uchar* rowsrcNPtr_green = destNgreen + v * step;
    uchar* rowsrcNPtr_red   = destNred   + v * step;

    uchar4 normalTexValues = tex2D(Tex, u, v);

    rowsrcNPtr_blue[u]  = Clamp<uchar, uchar>(normalTexValues.x, 0, 255);
    rowsrcNPtr_green[u] = Clamp<uchar, uchar>(normalTexValues.y, 0, 255);
    rowsrcNPtr_red[u]   = Clamp<uchar, uchar>(normalTexValues.z, 0, 255);
}
And then I call the function like this:
cudaGraphicsGLRegisterImage( &g_CUDAGraphicsResource, renderedTexture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
PostprocessCUDA(nblue, ngreen, nred, g_CUDAGraphicsResource, winWidth, winHeight);
MergeGpuMatrices(nblue, ngreen, nred, mergedN);
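MergeGpuMatrices just interleaves the three single-channel matrices into one 3-channel GpuMat; functionally it boils down to something like this (a sketch, not my exact code):

void MergeGpuMatrices(const GpuMat& b, const GpuMat& g, const GpuMat& r, GpuMat& dst)
{
    // Interleave the three CV_8U planes into a single CV_8UC3 GpuMat (assumes using namespace cv)
    GpuMat channels[] = { b, g, r };
    gpu::merge(channels, 3, dst);
}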
nblue, ngreen and nred are allocated once, in the constructor of a class, with gpu::createContinuous(Size(winWidth, winHeight), CV_8U).
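In other words, roughly this (the class name is only a placeholder):

// constructor of the class that owns the three matrices
ModelRenderer::ModelRenderer(int winWidth, int winHeight)
{
    nblue  = gpu::createContinuous(Size(winWidth, winHeight), CV_8U);
    ngreen = gpu::createContinuous(Size(winWidth, winHeight), CV_8U);
    nred   = gpu::createContinuous(Size(winWidth, winHeight), CV_8U);
}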
Now all this works fine, BUT when I call this function within a loop, the program becomes slower and slower with each cycle.
After about 500 render cycles the program eventually crashes with "OpenCV error: Gpu Api Call out of memory". I already tried allocating the three GPU matrices locally in each function call, which led to the same result. The weird thing is that the GPU memory usage isn't even increasing, according to the NVIDIA system monitor...
Maybe someone has encountered similar effects and has a solution for this?
Btw: I'm using an NVIDIA GTX 680M in a laptop with 16 GB of RAM, Win7 64-bit. Software: VS2010, OpenCV 2.4.3 (precompiled binaries), OpenGL 4.2.
Thanks... urbste