CPU Kernel Custom Operations¶
To enable operations not supported by OpenVINO™ out of the box, you need a custom extension for Model Optimizer, a custom nGraph operation set, and a custom kernel for the device you will target. This page describes custom kernel support for the CPU device.
The primary vehicle for the performance of the CPU codepath in the Inference Engine is the Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN), and new CPU kernels extend the Inference Engine plugin for Intel MKL-DNN. Implementing the InferenceEngine::ILayerExecImpl interface defines a general CPU-side extension. There are no Intel MKL-DNN specifics in the way you need to implement a kernel.
Implementation Class¶
All custom kernels for the CPU plugin should be inherited from the InferenceEngine::ILayerExecImpl interface. Based on that, the declaration of a kernel implementation class can look as follows:
/// CPU kernel implementation for the custom operation.
///
/// The constructor never reports failures directly: it records them in `error`,
/// and getSupportedConfigurations() returns them to the caller.
class OpImplementation : public InferenceEngine::ILayerExecImpl {
public:
    /// Validates the nGraph node and caches its attribute and shapes.
    explicit OpImplementation(const std::shared_ptr<ngraph::Node>& node);
    /// Fills `conf` with the tensor layouts this kernel can work with.
    InferenceEngine::StatusCode getSupportedConfigurations(std::vector<InferenceEngine::LayerConfig>& conf,
                                                           InferenceEngine::ResponseDesc* resp) noexcept override;
    /// Checks the configuration that was selected for this kernel.
    InferenceEngine::StatusCode init(InferenceEngine::LayerConfig& config, InferenceEngine::ResponseDesc* resp) noexcept override;
    /// Runs the kernel on the given input/output blobs.
    InferenceEngine::StatusCode execute(std::vector<InferenceEngine::Blob::Ptr>& inputs, std::vector<InferenceEngine::Blob::Ptr>& outputs,
                                        InferenceEngine::ResponseDesc* resp) noexcept override;

private:
    // Attribute of the custom operation; value-initialized so that it is never
    // indeterminate when the constructor fails before assigning it.
    int64_t add = 0;
    // Input shape taken from the node.
    ngraph::Shape inShape;
    // Output shape taken from the node.
    ngraph::Shape outShape;
    // Error message captured by the constructor; empty when construction succeeded.
    std::string error;
};
Class Fields¶
The provided implementation has several fields:
add of the type int64_t is an attribute of a custom operation.
inShape of the type ngraph::Shape is an input shape.
outShape of the type ngraph::Shape is an output shape.
error of the type std::string is a field to handle errors from a constructor.
Constructor of Implementation¶
An implementation constructor checks parameters of an nGraph operation, stores required attributes, and stores an error message in case of an error.
// Validates the node and caches its attribute and shapes. Any validation
// failure is stored in `error` instead of leaving the constructor.
OpImplementation::OpImplementation(const std::shared_ptr<ngraph::Node>& node) {
    try {
        const auto op = std::dynamic_pointer_cast<Operation>(node);
        if (op == nullptr) {
            IE_THROW() << "Cannot create implementation for unknown operation!";
        }

        const bool singleInputAndOutput = op->inputs().size() == 1 && op->outputs().size() == 1;
        if (!singleInputAndOutput) {
            IE_THROW() << "Cannot create implementation for operation with incorrect number of inputs or outputs!";
        }

        const bool shapesAreStatic = !op->get_input_partial_shape(0).is_dynamic() && !op->get_output_partial_shape(0).is_dynamic();
        if (!shapesAreStatic) {
            IE_THROW() << "Cannot create implementation for op with dynamic shapes!";
        }

        const auto& inputDims = op->get_input_shape(0);
        const auto& outputDims = op->get_output_shape(0);
        if (!(inputDims.size() == 4 && outputDims.size() == 4)) {
            IE_THROW() << "Operation supports only 4d tensors for input and output.";
        }

        const bool allFp32 = op->get_input_element_type(0) == ngraph::element::f32 && op->get_output_element_type(0) == ngraph::element::f32;
        if (!allFp32) {
            IE_THROW() << "Operation supports only FP32 tensors.";
        }

        // Members are written only after every check has passed.
        add = op->getAddAttr();
        inShape = inputDims;
        outShape = outputDims;
    } catch (const InferenceEngine::Exception& ex) {
        error = ex.what();
    }
}
The InferenceEngine::ILayerExecImpl::getSupportedConfigurations method returns all supported configuration formats (input/output tensor layouts) for your implementation. To specify formats of data, use InferenceEngine::TensorDesc. Refer to the Memory Primitives section for instructions.
// Reports the tensor layouts supported by this kernel: a planar 4D layout and a
// blocked (nChw8c) layout, both FP32.
//
// conf - receives one LayerConfig per supported layout (appended).
// resp - optional; receives the constructor error message on failure.
// Returns GENERAL_ERROR when the constructor recorded an error, OK otherwise.
InferenceEngine::StatusCode OpImplementation::getSupportedConfigurations(std::vector<InferenceEngine::LayerConfig>& conf,
                                                                         InferenceEngine::ResponseDesc* resp) noexcept {
    // A failed constructor leaves the shapes unset, so report its error first.
    if (!error.empty()) {
        if (resp) {
            strncpy(resp->msg, error.c_str(), sizeof(resp->msg) - 1);
            resp->msg[sizeof(resp->msg) - 1] = 0;
        }
        return InferenceEngine::GENERAL_ERROR;
    }

    // Both shapes are taken by const reference (no copies), and the parameter
    // names no longer shadow the inShape/outShape members.
    auto createConfig = [](const InferenceEngine::SizeVector& inDims, const InferenceEngine::SizeVector& outDims, bool planar) {
        InferenceEngine::LayerConfig config;
        config.dynBatchSupport = false;
        InferenceEngine::DataConfig inData;
        InferenceEngine::DataConfig outData;
        InferenceEngine::SizeVector order = {0, 1, 2, 3};
        // Allow any offset before data
        const size_t offset((std::numeric_limits<size_t>::max)());
        if (planar) {
            inData.desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, inDims, {inDims, order, offset});
            config.inConfs.push_back(inData);
            outData.desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, outDims, {outDims, order, offset});
            config.outConfs.push_back(outData);
        } else {
            // Add blocked (nChw8c) format
            const size_t blockSize = 8;
            // Ceiling division on size_t, so dimensions are not narrowed through int.
            auto div_up = [](const size_t a, const size_t b) -> size_t {
                if (!b)
                    return 0;
                return (a + b - 1) / b;
            };
            // The channel axis appears a second time as the innermost blocked axis.
            order.push_back(1);
            InferenceEngine::SizeVector inBlkDims = inDims;
            inBlkDims[1] = div_up(inBlkDims[1], blockSize);
            inBlkDims.push_back(blockSize);
            InferenceEngine::SizeVector outBlkDims = outDims;
            outBlkDims[1] = div_up(outBlkDims[1], blockSize);
            outBlkDims.push_back(blockSize);
            inData.desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, inDims, {inBlkDims, order, offset});
            config.inConfs.push_back(inData);
            outData.desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, outDims, {outBlkDims, order, offset});
            config.outConfs.push_back(outData);
        }
        return config;
    };

    // Add planar format
    conf.emplace_back(createConfig(inShape, outShape, true));
    // Add blocked format nChw8c
    conf.emplace_back(createConfig(inShape, outShape, false));
    return InferenceEngine::OK;
}
The InferenceEngine::ILayerExecImpl::init method gets a runtime-selected configuration from a vector that is populated from the getSupportedConfigurations
method and checks the parameters:
// Checks the configuration selected for this kernel: exactly one input and one
// output, both 4D and FP32.
//
// config - the selected layer configuration.
// resp   - optional; receives the validation message on failure.
// Returns OK when the configuration is accepted, GENERAL_ERROR otherwise.
InferenceEngine::StatusCode OpImplementation::init(InferenceEngine::LayerConfig& config, InferenceEngine::ResponseDesc* resp) noexcept {
    try {
        if (config.inConfs.size() != 1 || config.outConfs.size() != 1) {
            IE_THROW() << "Operation cannot be initialized with incorrect number of inputs/outputs!";
        }
        if (config.inConfs[0].desc.getDims().size() != 4 || config.outConfs[0].desc.getDims().size() != 4) {
            IE_THROW() << "Operation can be initialized only with 4d input/output tensors!";
        }
        if (config.outConfs[0].desc.getPrecision() != InferenceEngine::Precision::FP32 ||
            config.inConfs[0].desc.getPrecision() != InferenceEngine::Precision::FP32) {
            IE_THROW() << "Operation supports only FP32 precisions!";
        }
    } catch (const InferenceEngine::Exception& ex) {
        if (resp) {
            // Report the message of the exception thrown above; the `error`
            // member only holds constructor failures and is unrelated here.
            strncpy(resp->msg, ex.what(), sizeof(resp->msg) - 1);
            resp->msg[sizeof(resp->msg) - 1] = 0;
        }
        return InferenceEngine::GENERAL_ERROR;
    }
    return InferenceEngine::OK;
}
The InferenceEngine::ILayerExecImpl::execute method accepts and processes the actual tensors as input/output blobs:
// Adds the `add` attribute to every element of the first input blob and writes
// the result to the first output blob.
//
// inputs  - input blobs; only inputs[0] is read.
// outputs - output blobs; only outputs[0] is written.
// resp    - optional; receives an error message on failure.
// Returns OK on success, GENERAL_ERROR when the blobs are missing or the output
// blob is smaller than the input blob.
InferenceEngine::StatusCode OpImplementation::execute(std::vector<InferenceEngine::Blob::Ptr>& inputs, std::vector<InferenceEngine::Blob::Ptr>& outputs,
                                                      InferenceEngine::ResponseDesc* resp) noexcept {
    // The function is noexcept, so invalid arguments are rejected up front
    // rather than dereferenced.
    if (inputs.empty() || outputs.empty() || !inputs[0] || !outputs[0]) {
        if (resp) {
            strncpy(resp->msg, "Operation cannot be executed without input and output blobs!", sizeof(resp->msg) - 1);
            resp->msg[sizeof(resp->msg) - 1] = 0;
        }
        return InferenceEngine::GENERAL_ERROR;
    }

    // Hoisted out of the loop: the element count does not change while iterating.
    const size_t count = inputs[0]->size();
    if (outputs[0]->size() < count) {
        if (resp) {
            strncpy(resp->msg, "Operation cannot be executed: output blob is smaller than input blob!", sizeof(resp->msg) - 1);
            resp->msg[sizeof(resp->msg) - 1] = 0;
        }
        return InferenceEngine::GENERAL_ERROR;
    }

    // Skip the padding that precedes the data in each blob.
    const float* src_data = inputs[0]->cbuffer().as<const float*>() + inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
    float* dst_data = outputs[0]->buffer().as<float*>() + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();

    // NOTE(review): size() presumably counts logical elements only; for the
    // blocked layout with a channel count that is not a multiple of 8, confirm
    // that this loop covers the whole buffer.
    for (size_t i = 0; i < count; ++i) {
        dst_data[i] = src_data[i] + add;
    }
    return InferenceEngine::OK;
}
Register Implementation in Class¶
To register a custom kernel implementation in the Extension class, implement the following methods:
InferenceEngine::IExtension::getImplTypes returns a vector of implementation types for an operation.
// Returns the implementation types available for the node: {"CPU"} for the
// operations this extension provides a kernel for, an empty vector otherwise.
std::vector<std::string> Extension::getImplTypes(const std::shared_ptr<ngraph::Node>& node) {
    bool hasCpuKernel = std::dynamic_pointer_cast<Operation>(node) != nullptr;
#ifdef OPENCV_IMPORT_ENABLED
    // FFTOp is only considered when this macro is defined.
    hasCpuKernel = hasCpuKernel || std::dynamic_pointer_cast<FFTOp>(node) != nullptr;
#endif
    if (!hasCpuKernel) {
        return {};
    }
    return {"CPU"};
}
InferenceEngine::IExtension::getImplementation returns the kernel implementation with a specified type for an operation.
// Creates the kernel implementation of the requested type for the node.
// Returns nullptr for any type other than "CPU" and for unknown nodes.
InferenceEngine::ILayerImpl::Ptr Extension::getImplementation(const std::shared_ptr<ngraph::Node>& node, const std::string& implType) {
    // Only CPU kernels are provided.
    if (implType != "CPU") {
        return nullptr;
    }
    if (std::dynamic_pointer_cast<Operation>(node) != nullptr) {
        return std::make_shared<OpImplementation>(node);
    }
#ifdef OPENCV_IMPORT_ENABLED
    if (std::dynamic_pointer_cast<FFTOp>(node) != nullptr) {
        return std::make_shared<FFTImpl>(node);
    }
#endif
    return nullptr;
}
Load Extension with Executable Kernels to Plugin¶
Use the AddExtension
method of the general plugin interface to load your primitives:
InferenceEngine::Core core;
// Load CPU extension as a shared library
auto extension_ptr = std::make_shared<InferenceEngine::Extension>(std::string{"<shared lib path>"});
// Add extension to the CPU device
core.AddExtension(extension_ptr, "CPU");