KServe API Clients

Python Client

When creating a Python-based client application, you can use the Triton client library, tritonclient. Equivalent C++, Java, Go, and cURL examples are shown alongside each operation below.

Install the Package

pip3 install tritonclient[all]

Request Health Endpoints

Python (gRPC):

import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:9000")

server_live = client.is_server_live()

server_ready = client.is_server_ready()

model_ready = client.is_model_ready("model_name")
Python (REST):

import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")

server_live = client.is_server_live()

server_ready = client.is_server_ready()

model_ready = client.is_model_ready("model_name")
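
These checks can be combined into a simple readiness probe before sending inference requests. A minimal sketch using the gRPC client above; the one-second poll interval and the 30-attempt limit are arbitrary choices:

import time

import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:9000")

# Poll until both the server and the model report readiness, or give up after ~30 s.
for _ in range(30):
    if client.is_server_ready() and client.is_model_ready("model_name"):
        break
    time.sleep(1)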
#include "grpc_client.h"

namespace tc = triton::client;
int main() {
    std::unique_ptr<tc::InferenceServerGrpcClient> client;
    tc::InferenceServerGrpcClient::Create(&client, "localhost:9000");

    bool serverLive = client->IsServerLive(&serverLive);

    bool serverReady = client->IsServerReady(&serverReady);

    bool modelReady = client->IsModelReady(&modelReady, "model_name", "model_version");
}
#include "http_client.h"

namespace tc = triton::client;
int main() {
    std::unique_ptr<tc::InferenceServerHttpClient> client;
    tc::InferenceServerHttpClient::Create(&client, "localhost:9000");

    bool serverLive = client->IsServerLive(&serverLive);

    bool serverReady = client->IsServerReady(&serverReady);

    bool modelReady = client->IsModelReady(&modelReady, "model_name", "model_version");
}
Java:

public static void main(String[] args) {
    ManagedChannel channel = ManagedChannelBuilder
                    .forAddress("localhost", 9000)
                    .usePlaintext().build();
    GRPCInferenceServiceBlockingStub grpc_stub = GRPCInferenceServiceGrpc.newBlockingStub(channel);

    ServerLiveRequest.Builder serverLiveRequest = ServerLiveRequest.newBuilder();
    ServerLiveResponse serverLiveResponse = grpc_stub.serverLive(serverLiveRequest.build());

    boolean serverLive = serverLiveResponse.getLive();

    ServerReadyRequest.Builder serverReadyRequest = ServerReadyRequest.newBuilder();
    ServerReadyResponse serverReadyResponse = grpc_stub.serverReady(serverReadyRequest.build());

    boolean serverReady = serverReadyResponse.getReady();

    ModelReadyRequest.Builder modelReadyRequest = ModelReadyRequest.newBuilder();
    modelReadyRequest.setName("model_name");
    modelReadyRequest.setVersion("version");
    ModelReadyResponse modelReadyResponse = grpc_stub.modelReady(modelReadyRequest.build());

    boolean modelReady = modelReadyResponse.getReady();

    channel.shutdownNow();
}
Go:

func main() {
    conn, err := grpc.Dial("localhost:9000", grpc.WithInsecure())
    client := grpc_client.NewGRPCInferenceServiceClient(conn)
    ctx := context.Background()

    serverLiveRequest := grpc_client.ServerLiveRequest{}
    serverLiveResponse, err := client.ServerLive(ctx, &serverLiveRequest)

    serverReadyRequest := grpc_client.ServerReadyRequest{}
    serverReadyResponse, err := client.ServerReady(ctx, &serverReadyRequest)

    modelReadyRequest := grpc_client.ModelReadyRequest{
            Name:    "modelName",
            Version: "modelVersion",
    }
    modelReadyResponse, err := client.ModelReady(ctx, &modelReadyRequest)
}
cURL:

curl http://localhost:8000/v2/health/live
curl http://localhost:8000/v2/health/ready
curl http://localhost:8000/v2/models/model_name/ready

Request Server Metadata

Python (gRPC):

import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:9000")
server_metadata = client.get_server_metadata()
Python (REST):

import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")
server_metadata = client.get_server_metadata()
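
The returned metadata can be inspected directly. Note the difference in return types: the gRPC client returns a protobuf message, while the HTTP client returns a plain dictionary. A minimal sketch:

import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient

# gRPC client: protobuf message with name/version/extensions fields.
grpc_metadata = grpcclient.InferenceServerClient("localhost:9000").get_server_metadata()
print(grpc_metadata.name, grpc_metadata.version)

# HTTP client: plain dictionary parsed from the KServe JSON response.
http_metadata = httpclient.InferenceServerClient("localhost:8000").get_server_metadata()
print(http_metadata["name"], http_metadata["version"])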
#include "grpc_client.h"

namespace tc = triton::client;
int main() {
    std::unique_ptr<tc::InferenceServerGrpcClient> client;
    tc::InferenceServerGrpcClient::Create(&client, "localhost:9000");

    inference::ServerMetadataResponse server_metadata;
    client->ServerMetadata(&server_metadata);

    std::string name = server_metadata.name();
    std::string version = server_metadata.version();
}
#include "http_client.h"

namespace tc = triton::client;
int main() {
    std::unique_ptr<tc::InferenceServerHttpClient> client;
    tc::InferenceServerHttpClient::Create(&client, "localhost:9000");

    std::string server_metadata;
    client->ServerMetadata(&server_metadata);
}
Java:

public static void main(String[] args) {
    ManagedChannel channel = ManagedChannelBuilder
                    .forAddress("localhost", 9000)
                    .usePlaintext().build();
    GRPCInferenceServiceBlockingStub grpc_stub = GRPCInferenceServiceGrpc.newBlockingStub(channel);

    ServerMetadataRequest.Builder request = ServerMetadataRequest.newBuilder();
    ServerMetadataResponse response = grpc_stub.serverMetadata(request.build());

    channel.shutdownNow();
}
Go:

conn, err := grpc.Dial("localhost:9000", grpc.WithInsecure())
client := grpc_client.NewGRPCInferenceServiceClient(conn)
ctx := context.Background()

serverMetadataRequest := grpc_client.ServerMetadataRequest{}
serverMetadataResponse, err := client.ServerMetadata(ctx, &serverMetadataRequest)
cURL:

curl http://localhost:8000/v2

Request Model Metadata

Python (gRPC):

import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:9000")
model_metadata = client.get_model_metadata("model_name")
Python (REST):

import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")
model_metadata = client.get_model_metadata("model_name")
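
Model metadata lists the inputs and outputs the model exposes, which is handy when building requests programmatically. A minimal sketch for the HTTP client, which returns the KServe metadata JSON as a dictionary (the gRPC client exposes the same fields on a protobuf message):

for tensor in model_metadata["inputs"]:
    print("input:", tensor["name"], tensor["datatype"], tensor["shape"])
for tensor in model_metadata["outputs"]:
    print("output:", tensor["name"], tensor["datatype"], tensor["shape"])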
#include "grpc_client.h"

namespace tc = triton::client;
int main() {
    std::unique_ptr<tc::InferenceServerGrpcClient> client;
    tc::InferenceServerGrpcClient::Create(&client, "localhost:9000");

    inference::ModelMetadataResponse model_metadata;
    client->ModelMetadata(&model_metadata, "model_name", "model_version");
}
#include "http_client.h"

namespace tc = triton::client;
int main() {
    std::unique_ptr<tc::InferenceServerHttpClient> client;
    tc::InferenceServerHttpClient::Create(&client, "localhost:9000");

    std::string model_metadata;
    client->ModelMetadata(&model_metadata, "model_name", "model_version")
}
Java:

public static void main(String[] args) {
    ManagedChannel channel = ManagedChannelBuilder
                    .forAddress("localhost", 9000)
                    .usePlaintext().build();
    GRPCInferenceServiceBlockingStub grpc_stub = GRPCInferenceServiceGrpc.newBlockingStub(channel);

    ModelMetadataRequest.Builder request = ModelMetadataRequest.newBuilder();
    request.setName("model_name");
    request.setVersion("model_version");
    ModelMetadataResponse response = grpc_stub.modelMetadata(request.build());

    channel.shutdownNow();
}
Go:

conn, err := grpc.Dial("localhost:9000", grpc.WithInsecure())
client := grpc_client.NewGRPCInferenceServiceClient(conn)
ctx := context.Background()

modelMetadataRequest := grpc_client.ModelMetadataRequest{
    Name:    "modelName",
    Version: "modelVersion",
}
modelMetadataResponse, err := client.ModelMetadata(ctx, &modelMetadataRequest)
cURL:

curl http://localhost:8000/v2/models/model_name

Request Prediction on an Encoded Image

Python (gRPC):

import grpc
from tritonclient.grpc import service_pb2, service_pb2_grpc
from tritonclient.utils import *

channel = grpc.insecure_channel("localhost:9000")
grpc_stub = service_pb2_grpc.GRPCInferenceServiceStub(channel)
image_data = []
with open("image_path", 'rb') as f:
    image_data.append(f.read())
inputs = []
inputs.append(service_pb2.ModelInferRequest().InferInputTensor())
inputs[0].name = "input_name"
inputs[0].datatype = "BYTES"
inputs[0].shape.extend([1])
inputs[0].contents.bytes_contents.append(image_data[0])

outputs = []
outputs.append(service_pb2.ModelInferRequest().InferRequestedOutputTensor())
outputs[0].name = "output_name"

request = service_pb2.ModelInferRequest()
request.model_name = "model_name"
request.inputs.extend(inputs)
request.outputs.extend(outputs)
response = grpc_stub.ModelInfer(request)
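
The ModelInferResponse carries output tensors as raw bytes. A minimal sketch for decoding them, assuming the model returns a single FP32 output in raw_output_contents:

import numpy as np

# raw_output_contents entries are ordered the same way as response.outputs.
output = response.outputs[0]
raw = response.raw_output_contents[0]
output_data = np.frombuffer(raw, dtype=np.float32).reshape(list(output.shape))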
Python (REST):

import requests
import json

address = "localhost:8000"
model_name = "model_name"
url = f"http://{address}/v2/models/{model_name}/infer"
http_session = requests.session()

image_data = []
image_binary_size = []
with open("image_path", 'rb') as f:
    image_data.append(f.read())
    image_binary_size.append(len(image_data[-1]))
image_binary_size_str = ",".join(map(str, image_binary_size))
input_name = "input_name"
batch_i = len(image_data)
inference_header = {"inputs":[{"name":input_name,"shape":[batch_i],"datatype":"BYTES","parameters":{"binary_data_size":image_binary_size_str}}]}
inference_header_binary = json.dumps(inference_header).encode()

results = http_session.post(url, inference_header_binary + b''.join(image_data), headers={"Inference-Header-Content-Length":str(len(inference_header_binary))})
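
If no binary output was requested, the response body is a regular KServe JSON document. A minimal sketch for reading it; the exact output names and shapes depend on the served model:

response_json = results.json()
for output in response_json["outputs"]:
    print(output["name"], output["datatype"], output["shape"])
    # output["data"] holds the tensor values when they are not returned as binary data.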
#include "grpc_client.h"

namespace tc = triton::client;
int main() {
    std::unique_ptr<tc::InferenceServerGrpcClient> client;
    tc::InferenceServerGrpcClient::Create(&client, "localhost:9000");

    std::vector<int64_t> shape{1, 10};
    tc::InferInput* input;
    tc::InferInput::Create(&input, "input_name", shape, "FP32");
    std::shared_ptr<tc::InferInput> input_ptr;
    input_ptr.reset(input)

    std::ifstream fileImg("image_path", std::ios::binary);
    fileImg.seekg(0, std::ios::end);
    int bufferLength = fileImg.tellg();
    fileImg.seekg(0, std::ios::beg);

    char* buffer = new char[bufferLength];
    fileImg.read(buffer, bufferLength);

    std::vector<uint8_t> input_data = std::vector<uint8_t>(buffer, buffer + bufferLength);
    input_ptr->AppendRaw(input_data);

    tc::InferOptions options("model_name");
    tc::InferResult* result;
    client->Infer(&result, options, inputs);
    input->Reset();

    delete buffer;
}
#include "http_client.h"

namespace tc = triton::client;
int main() {
    std::unique_ptr<tc::InferenceServerHttpClient> client;
    tc::InferenceServerHttpClient::Create(&client, "localhost:9000");

    std::vector<int64_t> shape{1};
    tc::InferInput* input;
    tc::InferInput::Create(&input, input_name, shape, "BYTES");
    std::shared_ptr<tc::InferInput> input_ptr;
    input_ptr.reset(input)

    std::ifstream fileImg("image_path", std::ios::binary);
    fileImg.seekg(0, std::ios::end);
    int bufferLength = fileImg.tellg();
    fileImg.seekg(0, std::ios::beg);

    char* buffer = new char[bufferLength];
    fileImg.read(buffer, bufferLength);

    std::vector<uint8_t> input_data = std::vector<uint8_t>(buffer, buffer + bufferLength);
    input_ptr->AppendRaw(input_data);

    tc::InferOptions options("model_name");
    tc::InferResult* result;
    client->Infer(&result, options, inputs);
    input->Reset();

    delete buffer;
}
Java:

public static void main(String[] args) throws IOException {
    ManagedChannel channel = ManagedChannelBuilder
                    .forAddress("localhost", 9000)
                    .usePlaintext().build();
    GRPCInferenceServiceBlockingStub grpc_stub = GRPCInferenceServiceGrpc.newBlockingStub(channel);

    ModelInferRequest.Builder request = ModelInferRequest.newBuilder();
    request.setModelName("model_name");
    request.setModelVersion("model_version");

    ModelInferRequest.InferInputTensor.Builder input = ModelInferRequest.InferInputTensor
            .newBuilder();
    input.setName("input_name");
    input.setDatatype("BYTES");
    input.addShape(1);
    request.addInputs(0, input);

    FileInputStream imageStream = new FileInputStream("image_path");
    request.clearRawInputContents();
    request.addRawInputContents(ByteString.readFrom(imageStream));

    ModelInferResponse response = grpc_stub.modelInfer(request.build());

    channel.shutdownNow();
}
Go:

conn, err := grpc.Dial("localhost:9000", grpc.WithInsecure())
client := grpc_client.NewGRPCInferenceServiceClient(conn)

ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()

inferInputs := []*grpc_client.ModelInferRequest_InferInputTensor{
    &grpc_client.ModelInferRequest_InferInputTensor{
        Name:     "0",
        Datatype: "BYTES",
        Shape:    []int64{1},
    },
}

modelInferRequest := grpc_client.ModelInferRequest{
    ModelName:    "model_name",
    ModelVersion: "model_version",
    Inputs:       inferInputs,
}

// Read the encoded image and attach it as raw input content.
bytes, err := ioutil.ReadFile("image_path")
modelInferRequest.RawInputContents = append(modelInferRequest.RawInputContents, bytes)

modelInferResponse, err := client.ModelInfer(ctx, &modelInferRequest)
echo -n '{"inputs” : [{"name" : "0", "shape" : [1], "datatype" : "BYTES"}]}' > request.json
stat --format=%s request.json
66
cat ./image.jpeg >> request.json
curl --data-binary "@./request.json" -X POST http://localhost:8000/v2/models/resnet/versions/0/infer -H "Inference-Header-Content-Length: 66"

Request Prediction on a Numpy Array

Python (gRPC):

import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:9000")
data = np.arange(1.0, 1001.0, dtype=np.float32)  # values 1.0 .. 1000.0; dtype must match "FP32"
infer_input = grpcclient.InferInput("input_name", data.shape, "FP32")
infer_input.set_data_from_numpy(data)
results = client.infer("model_name", [infer_input])
Python (REST):

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")
data = np.arange(1.0, 1001.0, dtype=np.float32)  # values 1.0 .. 1000.0; dtype must match "FP32"
infer_input = httpclient.InferInput("input_name", data.shape, "FP32")
infer_input.set_data_from_numpy(data)
results = client.infer("model_name", [infer_input])
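
With both the gRPC and the HTTP client, the returned results object can be converted back into a NumPy array. A minimal sketch, assuming the model exposes an output tensor named "output_name":

output_data = results.as_numpy("output_name")
print(output_data.shape, output_data.dtype)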
#include "grpc_client.h"

namespace tc = triton::client;
int main() {
    std::unique_ptr<tc::InferenceServerGrpcClient> client;
    tc::InferenceServerGrpcClient::Create(&client, "localhost:9000");

    std::vector<int64_t> shape{1, 10};
    tc::InferInput* input;
    tc::InferInput::Create(&input, "input_name", shape, "FP32");
    std::shared_ptr<tc::InferInput> input_ptr;
    input_ptr.reset(input);

    std::vector<float> input_data(10);
    for (size_t i = 0; i < 10; ++i) {
        input_data[i] = i;
    }
    std::vector<tc::InferInput*> inputs = {input_ptr.get()};
    tc::InferOptions options("model_name");
    tc::InferResult* result;
    input_ptr->AppendRaw(reinterpret_cast<const uint8_t*>(input_data.data()), input_data.size() * sizeof(float));
    client->Infer(&result, options, inputs);
    input->Reset();
}
#include "http_client.h"

namespace tc = triton::client;
int main() {
    std::unique_ptr<tc::InferenceServerHttpClient> client;
    tc::InferenceServerHttpClient::Create(&client, "localhost:9000");

    std::vector<int64_t> shape{1, 10};
    tc::InferInput* input;
    tc::InferInput::Create(&input, "input_name", shape, "FP32");
    std::shared_ptr<tc::InferInput> input_ptr;
    input_ptr.reset(input);

    std::vector<float> input_data(10);
    for (size_t i = 0; i < 10; ++i) {
        input_data[i] = i;
    }
    std::vector<tc::InferInput*> inputs = {input_ptr.get()};
    tc::InferOptions options("model_name");
    tc::InferResult* result;
    input_ptr->AppendRaw(reinterpret_cast<const uint8_t*>(input_data.data()), input_data.size() * sizeof(float));
    client->Infer(&result, options, inputs);
    input->Reset();
}
Java:

public static void main(String[] args) {
    ManagedChannel channel = ManagedChannelBuilder
                    .forAddress("localhost", 9000)
                    .usePlaintext().build();
    GRPCInferenceServiceBlockingStub grpc_stub = GRPCInferenceServiceGrpc.newBlockingStub(channel);

    ModelInferRequest.Builder request = ModelInferRequest.newBuilder();
    request.setModelName("model_name");
    request.setModelVersion("model_version");

    List<Float> lst = Arrays.asList(0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f);
    InferTensorContents.Builder input_data = InferTensorContents.newBuilder();
    input_data.addAllFp32Contents(lst);

    ModelInferRequest.InferInputTensor.Builder input = ModelInferRequest.InferInputTensor
            .newBuilder();
    input.setName("input_name");
    input.setDatatype("FP32");
    input.addShape(1);
    input.addShape(10);
    input.setContents(input_data);

    request.addInputs(0, input);

    ModelInferResponse response = grpc_stub.modelInfer(request.build());

    channel.shutdownNow();
}
Go:

conn, err := grpc.Dial("localhost:9000", grpc.WithInsecure())
client := grpc_client.NewGRPCInferenceServiceClient(conn)

ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()

// Fill a 1x10 FP32 input tensor with example values.
inputSize := 10
inputData := make([]float32, inputSize)
for i := 0; i < inputSize; i++ {
    inputData[i] = float32(i)
}

inferInputs := []*grpc_client.ModelInferRequest_InferInputTensor{
    &grpc_client.ModelInferRequest_InferInputTensor{
        Name:     "b",
        Datatype: "FP32",
        Shape:    []int64{1, 10},
        Contents: &grpc_client.InferTensorContents{
            Fp32Contents: inputData,
        },
    },
}

modelInferRequest := grpc_client.ModelInferRequest{
    ModelName:    "model_name",
    ModelVersion: "model_version",
    Inputs:       inferInputs,
}

modelInferResponse, err := client.ModelInfer(ctx, &modelInferRequest)
cURL:

curl -X POST http://localhost:8000/v2/models/model_name/infer \
  -H 'Content-Type: application/json' \
  -d '{"inputs" : [ {"name" : "input_name", "shape" : [ 1, 10 ], "datatype" : "FP32", "data" : [1,2,3,4,5,6,7,8,9,10]} ]}'

For complete usage examples, see the KServe samples.