[232] TensorRT를 활용한 딥러닝 Inference 최적화

Step 1: TF모델을 TRT 포맷으로 변환
Step 2: 모델 Parser 생성
Step 3: 입/출력 레이어 정보 입력
Step 4: 모델의 최적화 및 런타임 Engine 생성
Step 5: 엔진을 파일로 저장
Step 6: 엔진을 파일에서 읽음
Step 7: Inference 수행

•
•
•
/**
 * Builds the plugin from weights supplied at network-parse time.
 *
 * The descriptor of weights[0] is copied into mWeights, and its `values` pointer is then
 * replaced by a freshly malloc'ed host copy of the data, so mWeights does not alias the
 * caller's buffer.
 *
 * @param weights   Array of weight blobs; only weights[0] is read.
 * @param nbWeights Number of entries in `weights`; must be at least 1.
 */
PReLUPlugin::PReLUPlugin(const Weights *weights, int nbWeights) {
    // weights[0] is dereferenced unconditionally below, so reject an empty array up front.
    assert(weights != nullptr && nbWeights >= 1);

    mWeights = weights[0];
    assert(mWeights.count >= 0);

    // Size of the weight payload in bytes; computed once and reused for malloc and memcpy.
    const size_t weightBytes = mWeights.count * type2size(mWeights.type);

    void *hostCopy = malloc(weightBytes);
    // malloc(0) may legitimately return nullptr; any other nullptr is an allocation failure.
    assert(hostCopy != nullptr || weightBytes == 0);
    if (weightBytes > 0) {
        memcpy(hostCopy, weights[0].values, weightBytes);
    }
    mWeights.values = hostCopy;
}

int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace,
cudaStream_t stream) {
const float zerof{0.0f}; const __half zeroh = fp16::__float2half(0.0f);
if (mWeights.type == DataType::__float) {
CHECK(Forward_gpu<__float>(batchSize * mNbInputCount, mNbInputChannels,
mNbInputHeight * mNbInputHeight, reinterpret_cast<const __float *>(mDeviceKernel),
reinterpret_cast<const __float *>(inputs[0]), reinterpret_cast<__float *>(outputs[0]),
zerof, mChannelShared ? mNbInputChannels : 1, stream));
} else { // DataType::kFLOAT }
return 0;
}

/**
 * PReLU forward kernel: out[i] = in[i] if in[i] > zero, otherwise in[i] * slope.
 *
 * The slope for element `index` is slope_data[(index / dim) % channels / div_factor].
 * Iteration over [0, n) is driven by CUDA_KERNEL_LOOP, which is defined elsewhere
 * (presumably the Caffe grid-stride loop macro - confirm).
 *
 * @param n          Number of elements to process.
 * @param channels   Number of channels.
 * @param dim        Number of elements per channel.
 * @param slope_data Slope values, indexed as described above.
 * @param in         Input elements.
 * @param out        Output elements.
 * @param zero       The value zero in type Ftype.
 * @param div_factor Divisor applied to the channel index when selecting the slope.
 */
template <typename Ftype>
__global__ void PReLUForward(const int n, const int channels, const int dim, const Ftype* slope_data,
                             const Ftype* in, Ftype* out, const Ftype zero, const int div_factor) {
    CUDA_KERNEL_LOOP(index, n) {
        const int slopeIdx = (index / dim) % channels / div_factor;
        const Ftype value = in[index];
        if (value > (Ftype(zero))) {
            // Positive inputs pass through unchanged.
            out[index] = value;
        } else {
            // Everything else is scaled by the selected slope.
            out[index] = value * slope_data[slopeIdx];
        }
    }
}

/**
 * Host-side launcher for PReLUForward.
 *
 * @param count         Number of elements to process.
 * @param channels      Number of channels.
 * @param dim           Number of elements per channel.
 * @param mDeviceKernel Slope values passed to the kernel as `slope_data`.
 * @param bottom_data   Kernel input.
 * @param top_data      Kernel output.
 * @param zero          The value zero in type Ftype.
 * @param div_factor    Divisor applied to the channel index when selecting the slope.
 * @param stream        CUDA stream to launch on.
 * @return cudaSuccess when there is nothing to do, otherwise the result of
 *         cudaGetLastError() taken right after the launch.
 */
template <typename Ftype>
cudaError_t Forward_gpu(const int count, const int channels, const int dim, const Ftype* mDeviceKernel,
                        const Ftype* bottom_data, Ftype* top_data, const Ftype zero, const int div_factor,
                        const cudaStream_t stream) {
    // Nothing to compute. Launching anyway would presumably request a zero-block grid
    // (CAFFE_GET_BLOCKS is defined elsewhere), which CUDA rejects as an invalid configuration.
    if (count <= 0) {
        return cudaSuccess;
    }

    PReLUForward<Ftype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS, 0, stream>>>(
        count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor);

    // Kernel launches do not return a status; launch errors are picked up here.
    return cudaGetLastError();
}

/**
 * Creates a new PReLUPlugin from this plugin's weights.
 *
 * `override` belongs on the in-class declaration only; it is not allowed on an
 * out-of-class definition, so it is omitted here.
 *
 * NOTE(review): only mWeights is passed to the new instance; whether the other members
 * (input dimensions, device buffer) need to be carried over should be confirmed.
 *
 * @return Pointer to a heap-allocated plugin; the caller receives ownership of the raw pointer.
 */
IPluginExt *PReLUPlugin::clone() const {
    return new PReLUPlugin(&mWeights, 1);
}
/**
 * Parser-side factory hook: creates the PReLU plugin from weights handed over by the parser.
 *
 * The class qualifier is spelled `PluginFactory`, matching the other uses of that type in
 * this file, and `override` is omitted because it is not allowed on an out-of-class definition.
 *
 * @param layerName  Name of the layer being created (not used here).
 * @param serialData Weight blobs for the layer.
 * @param nbWeights  Number of entries in `serialData`.
 * @return Pointer to a heap-allocated PReLUPlugin.
 */
IPlugin* PluginFactory::createPlugin(const char* layerName, const Weights* serialData, int nbWeights) {
    // Forward the weight count; the original passed `serialLength`, which is not declared in this overload.
    return new PReLUPlugin(serialData, nbWeights);
}

PluginFactory parserPluginFactory;
parser->setPluginFactoryExt(&parserPluginFactory);
const IBlobNameToTensor *blobNameToTensor =
parser->parse(gParams.deployFile.c_str(), // caffe deploy file
gParams.modelFile.c_str(), // caffe model file
*network, // network definition that the parser will populate
gParams.fp16 ? DataType::kHALF : DataType::kFLOAT);

builder->setMaxBatchSize(gParams.batchSize);
builder->setMaxWorkspaceSize(size_t(gParams.workspaceSize) << 20);
builder->setFp16Mode(gParams.fp16);
ICudaEngine* engine = builder->buildCudaEngine(*network);

/**
 * Writes the plugin state into `buffer`.
 *
 * Field order: input channels, height, width, input count, channel-shared flag,
 * weight count, weight type, then the weight data via convertAndCopyToBuffer.
 * The final assert checks that exactly getSerializationSize() bytes were produced.
 *
 * @param buffer Destination buffer supplied by the caller.
 */
void PReLUPlugin::serialize(void *buffer) {
    char *const begin = static_cast<char *>(buffer);
    char *cursor = begin;

    // Input shape information.
    write(cursor, mNbInputChannels);
    write(cursor, mNbInputHeight);
    write(cursor, mNbInputWidth);
    write(cursor, mNbInputCount);

    // Slope configuration and weight descriptor.
    write(cursor, mChannelShared);
    write(cursor, mWeights.count);
    write(cursor, mWeights.type);

    // Weight payload.
    convertAndCopyToBuffer(cursor, mWeights);

    assert(cursor == begin + getSerializationSize());
}

/**
 * Rebuilds the plugin from a serialized buffer.
 *
 * Reads, in order: input channels, height, width, input count, channel-shared flag,
 * weight count and weight type, followed by the weight payload. The payload is copied
 * into a malloc'ed host buffer (mWeights.values) and handed to deserializeToDevice
 * together with mDeviceKernel.
 *
 * @param data   Start of the serialized plugin data.
 * @param length Size of the serialized data in bytes; the whole buffer must be consumed.
 */
PReLUPlugin::PReLUPlugin(const void *data, size_t length) {
    assert(data != nullptr);
    const char *d = static_cast<const char *>(data), *a = d;

    read<int>(d, mNbInputChannels);
    read<int>(d, mNbInputHeight);
    read<int>(d, mNbInputWidth);
    read<int>(d, mNbInputCount);
    read<bool>(d, mChannelShared);
    read<int64_t>(d, mWeights.count);
    read<DataType>(d, mWeights.type);

    assert(mWeights.count >= 0);
    // Size of the weight payload in bytes; computed once and reused below.
    const size_t weightBytes = mWeights.count * type2size(mWeights.type);

    // Make sure the payload fits in the remaining buffer before reading from it.
    const size_t consumed = static_cast<size_t>(d - a);
    assert(consumed <= length && weightBytes <= length - consumed);

    void *hostCopy = malloc(weightBytes);
    // malloc(0) may legitimately return nullptr; any other nullptr is an allocation failure.
    assert(hostCopy != nullptr || weightBytes == 0);
    if (weightBytes > 0) {
        memcpy(hostCopy, d, weightBytes);
    }
    mWeights.values = hostCopy;

    // The final assert relies on this call advancing `d` past the payload.
    deserializeToDevice(d, mDeviceKernel, weightBytes);
    assert(d == a + length);
}

Iplugin *PluginFactory::createPlugin(const char *layerName, const void *serialData, size_t serialLength) override
{
return new PReLUPlugin(serialData, serialLength);
}

// Factory passed to the runtime together with the serialized engine
// (presumably used to re-create plugin layers during deserialization - confirm).
PluginFactory pluginFactory;
// Rebuild the engine from the serialized plan; `size` is passed alongside `trt_plan_file`
// (presumably the plan's byte length - confirm against the code that loads the file).
engine = infer->deserializeCudaEngine(trt_plan_file, size, &pluginFactory);

// The execution context and the stream must exist before any work is enqueued on them,
// so they are created first.
IExecutionContext* context = engine->createExecutionContext();
CHECK(cudaStreamCreate(&stream));

// Host -> device copy of the input, inference, and device -> host copy of the output are
// all enqueued on the same stream, so they execute in this order.
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_SIZE * sizeof(float),
                      cudaMemcpyHostToDevice, stream));
// Use the same batch size that sizes the copies above and below.
// NOTE(review): the original passed gParams.batchSize here - confirm the two are meant to be equal.
context->enqueue(batchSize, &buffers[0], stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float),
                      cudaMemcpyDeviceToHost, stream));

// `output` is only valid on the host once the stream has drained.
CHECK(cudaStreamSynchronize(stream));

[232] TensorRT를 활용한 딥러닝 Inference 최적화

Related slideshows

Recommended for you

Recommended for you

Recommended for you

Recommended for you

Recommended for you

Recommended for you

Recommended for you

Recommended for you

Recommended for you

Recommended for you

More Related Content

What's hot

What's hot (20)

Similar to [232] TensorRT를 활용한 딥러닝 Inference 최적화

Similar to [232] TensorRT를 활용한 딥러닝 Inference 최적화 (20)

More from NAVER D2

More from NAVER D2 (20)

Recently uploaded

Recently uploaded (20)

[232] TensorRT를 활용한 딥러닝 Inference 최적화