
Deploying a TensorFlow Model with TensorRT (by 谢欣燕)


1. Model persistence

The first step in deploying a TensorFlow model is to persist it, saving the graph structure and weights into a single .pb file:

pb_graph = tf.graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), [v.op.name for v in outputs])
with tf.gfile.FastGFile('./pbmodel_name.pb', mode='wb') as f:
    f.write(pb_graph.SerializeToString())

Simply run the code above after the model has been defined and its weights loaded. tf.graph_util.convert_variables_to_constants converts the variables (weights) into constants, where outputs is the list of tensors to be used as outputs. Finally, pb_graph.SerializeToString() serializes the graph and writes it into the .pb file, producing the pb model.

2. Generating the UFF model

With the pb model in hand, it must be converted into a UFF model that TensorRT can consume. This only requires the convert script that ships with the uff package:

python /usr/lib/python2.7/site-packages/uff/bin/convert_to_uff.py pbmodel_name.pb
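The converter can also be told where to write the UFF file and which node to treat as the output; the -O option referred to by the MarkOutput_0 comment in the parsing code further below is this output-node flag. Exact options vary across uff versions, so the following invocation is illustrative only (pbmodel_name.uff and output_node_name are placeholders):

python /usr/lib/python2.7/site-packages/uff/bin/convert_to_uff.py pbmodel_name.pb -o pbmodel_name.uff -O output_node_name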

On success, the converter prints the total number of nodes along with the input and output nodes it inferred.

3. Deploying the model with TensorRT

To deploy the generated UFF model with TensorRT, the weights and network structure stored in the UFF file must first be imported; TensorRT then runs its optimization passes to build the corresponding inference engine.

The flow is: create an IBuilder* builder; create a parser for the UFF file; have the builder create a network; register the input and output nodes with the parser; let the parser parse the model weights and network structure out of the UFF file into the network; once parsing is done, the builder can build the engine from the structure defined in the network. Before creating the engine, the maximum batch size must be specified. The batch size used later with the engine must not exceed this value, or an error occurs, and inference is most efficient when the batch size equals the configured maximum. For example, if the maximum batch size is set to 10 and a full batch of 10 images averages 4 ms of inference time per image, then a batch with fewer than 10 images will average more than 4 ms per image.

// Initialize the NvInfer plugins.
initLibNvInferPlugins(&gLogger.getTRTLogger(), "");
// 1. Create the IBuilder.
IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
assert(builder != nullptr);
// Create the UFF parser.
auto parser = createUffParser();
// Register the input node name, dimensions, and channel order.
parser->registerInput(inputtensor_name, DimsCHW(INPUT_C, INPUT_H, INPUT_W), UffInputOrder::kNCHW);
// MarkOutput_0 is a node created by the UFF converter when we specify an output with -O.
parser->registerOutput(outputtensor_name);
// Parse the UFF model to populate the network, then set the outputs.
INetworkDefinition* network = builder->createNetwork();
gLogInfo << "Begin parsing model..." << std::endl;
if (!parser->parse(uffFile, *network, nvinfer1::DataType::kFLOAT))
{
    gLogError << "Failure while parsing UFF file" << std::endl;
    return nullptr;
}
gLogInfo << "End parsing model..." << std::endl;
// Build the engine.
builder->setMaxBatchSize(maxBatchSize);
// We need about 1GB of scratch space for the plugin layer for batch size 5.
builder->setMaxWorkspaceSize(MAX_WORKSPACE);
if (gArgs.runInInt8)
{
    builder->setInt8Mode(gArgs.runInInt8);
    builder->setInt8Calibrator(calibrator);
}
builder->setFp16Mode(gArgs.runInFp16);
samplesCommon::enableDLA(builder, gArgs.useDLACore);
gLogInfo << "Begin building engine..." << std::endl;
ICudaEngine* engine = builder->buildCudaEngine(*network);
if (!engine)
{
    gLogError << "Unable to create engine" << std::endl;
    return nullptr;
}
gLogInfo << "End building engine..." << std::endl;
// We don't need the network any more, and we can destroy the parser.
network->destroy();
parser->destroy();
builder->destroy();
shutdownProtobufLibrary();
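Building the engine is the expensive step, so in practice the built engine is often serialized to disk once and simply deserialized on later runs. The following is a minimal sketch, not part of the original article: the file name model.engine is illustrative, error handling is omitted, and it assumes the same gLogger and plugin initialization as above (plus the standard <fstream>, <vector>, and <iterator> headers).

// Sketch: serialize the built engine to a file so it does not have to be rebuilt every time.
IHostMemory* serialized = engine->serialize();
std::ofstream outFile("model.engine", std::ios::binary);
outFile.write(static_cast<const char*>(serialized->data()), serialized->size());
serialized->destroy();

// Later (possibly in another process): read the file back and deserialize the engine.
std::ifstream inFile("model.engine", std::ios::binary);
std::vector<char> blob((std::istreambuf_iterator<char>(inFile)), std::istreambuf_iterator<char>());
initLibNvInferPlugins(&gLogger.getTRTLogger(), "");  // plugins must be registered before deserializing
IRuntime* runtime = createInferRuntime(gLogger.getTRTLogger());
ICudaEngine* deserializedEngine = runtime->deserializeCudaEngine(blob.data(), blob.size(), nullptr);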

Once the engine has been generated, inference can be performed. Inference requires an execution context, IExecutionContext* context, obtained via engine->createExecutionContext().

context = engine->createExecutionContext();
assert(context != nullptr);

The core inference call is:

context->execute(batchSize, &buffers[0]);

Here buffers is an array of void* holding the device addresses of the model's input and output tensors. Device memory (GPU memory) for the inputs and outputs is allocated with cudaMalloc and the resulting pointers are stored in the buffers array. Before calling execute, the input data (the input image) is copied into the corresponding input device memory with cudaMemcpy; after execute, the output results are copied back from the device, again with cudaMemcpy.
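As a minimal sketch of that pattern, assuming a single input at binding index 0, a single output at binding index 1, and host pointers hostInput/hostOutput plus an output byte count outputSize defined elsewhere (these names are illustrative and not from the article):

// Minimal synchronous version of the buffer workflow described above.
size_t inputSize = batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float);
void* buffers[2];
cudaMalloc(&buffers[0], inputSize);   // device memory for the input tensor
cudaMalloc(&buffers[1], outputSize);  // device memory for the output tensor

cudaMemcpy(buffers[0], hostInput, inputSize, cudaMemcpyHostToDevice);    // copy the input image to the GPU
context->execute(batchSize, buffers);                                    // run inference
cudaMemcpy(hostOutput, buffers[1], outputSize, cudaMemcpyDeviceToHost);  // copy the result back to the host

cudaFree(buffers[0]);
cudaFree(buffers[1]);

The article's full doInference below does the same thing, but with asynchronous copies on a CUDA stream, timing, and buffer sizes derived from the engine's bindings.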

// Run inference.
doInference(*context, &data[0], &detectionOut[0], &keepCount[0], N);

void doInference(IExecutionContext& context, float* inputData, float* detectionOut, int* keepCount, int batchSize)
{
    // Set up.
    //auto t_start = std::chrono::high_resolution_clock::now();
    const ICudaEngine& engine = context.getEngine();
    // Input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings()
    // of these, but in this case we know that there is exactly 1 input and 2 outputs.
    int nbBindings = engine.getNbBindings();
    std::vector<void*> buffers(nbBindings);
    std::vector<std::pair<int64_t, DataType>> buffersSizes = calculateBindingBufferSizes(engine, nbBindings, batchSize);
    for (int i = 0; i < nbBindings; ++i)
    {
        auto bufferSizesOutput = buffersSizes[i];
        buffers[i] = samplesCommon::safeCudaMalloc(bufferSizesOutput.first * samplesCommon::getElementSize(bufferSizesOutput.second));
    }

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings().
    int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
        outputIndex0 = engine.getBindingIndex(OUTPUT_BLOB_NAME0),
        outputIndex1 = outputIndex0 + 1; //engine.getBindingIndex(OUTPUT_BLOB_NAME1);

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], inputData, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));

    auto t_start = std::chrono::high_resolution_clock::now();
    context.execute(batchSize, &buffers[0]);
    auto t_end = std::chrono::high_resolution_clock::now();
    float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
    gLogInfo << "Time taken for inference is " << total << " ms." << std::endl;

    for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx)
    {
        if (engine.bindingIsInput(bindingIdx))
            continue;
        auto bufferSizesOutput = buffersSizes[bindingIdx];
        printOutput(bufferSizesOutput.first, bufferSizesOutput.second, buffers[bindingIdx]);
    }

    CHECK(cudaMemcpyAsync(detectionOut, buffers[outputIndex0], batchSize * detectionOutputParam.keepTopK * 7 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(keepCount, buffers[outputIndex1], batchSize * sizeof(int), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release the stream and the buffers.
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex0]));
    CHECK(cudaFree(buffers[outputIndex1]));
}
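The helpers calculateBindingBufferSizes, safeCudaMalloc, and getElementSize come from the TensorRT sample common code and are not shown in the article. The sketch below is a rough, self-contained approximation of what they do, written as free functions (the article uses them under the samplesCommon namespace) and based on the standard TensorRT binding API rather than the exact sample implementation:

#include <cstdint>
#include <cstdlib>
#include <utility>
#include <vector>
#include <cuda_runtime_api.h>
#include "NvInfer.h"

using nvinfer1::DataType;
using nvinfer1::Dims;
using nvinfer1::ICudaEngine;

// Bytes per element for a given TensorRT data type.
unsigned int getElementSize(DataType t)
{
    switch (t)
    {
    case DataType::kFLOAT: return 4;
    case DataType::kHALF:  return 2;
    case DataType::kINT8:  return 1;
    case DataType::kINT32: return 4;
    }
    return 0;
}

// Total number of elements described by a Dims object.
int64_t volume(const Dims& d)
{
    int64_t v = 1;
    for (int i = 0; i < d.nbDims; ++i)
        v *= d.d[i];
    return v;
}

// For every binding, return (element count for the whole batch, data type).
std::vector<std::pair<int64_t, DataType>> calculateBindingBufferSizes(const ICudaEngine& engine, int nbBindings, int batchSize)
{
    std::vector<std::pair<int64_t, DataType>> sizes;
    for (int i = 0; i < nbBindings; ++i)
    {
        Dims dims = engine.getBindingDimensions(i);
        DataType dtype = engine.getBindingDataType(i);
        sizes.emplace_back(volume(dims) * batchSize, dtype);
    }
    return sizes;
}

// cudaMalloc wrapper that aborts instead of returning a null pointer.
void* safeCudaMalloc(size_t memSize)
{
    void* deviceMem = nullptr;
    if (cudaMalloc(&deviceMem, memSize) != cudaSuccess || deviceMem == nullptr)
        std::abort();
    return deviceMem;
}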



Tags: #tensorflow #tensorrt #pb_graph