Bert Benchmark Python Sample

This sample demonstrates how to estimate the performance of a BERT model using the Asynchronous Inference Request API. Unlike demos, this sample does not have configurable command-line arguments. Feel free to modify the sample’s source code to try out different options.

How It Works

The sample downloads a model and a tokenizer, exports the model to ONNX format, reads the exported model and reshapes it to enforce dynamic input shapes. Then, it compiles the resulting model, downloads a dataset and runs a benchmark on the dataset.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2022 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import logging as log
from pathlib import Path
import sys
import tempfile
from time import perf_counter

import datasets
import openvino as ov
from openvino.runtime import get_version
from transformers import AutoTokenizer
from transformers.onnx import export
from transformers.onnx.features import FeaturesManager


def main():
    """Benchmark a BERT model on the SST-2 validation sentences with OpenVINO.

    Exports ``bert-base-uncased`` to ONNX, reads it with OpenVINO, reshapes
    every model input to ``[1, ?]``, compiles the model for CPU with the
    THROUGHPUT performance hint and pushes each sentence through an
    ``AsyncInferQueue``. The average sequence length, the average time per
    sentence and the total duration are written to the log.

    Raises:
        RuntimeError: re-raised (after logging) when the model cannot be
            reshaped to the dynamic shape.
    """
    log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
    log.info('OpenVINO:')
    log.info(f"{'Build ':.<39} {get_version()}")
    model_name = 'bert-base-uncased'

    # Fetch the model first, then derive its ONNX export configuration.
    hf_model = FeaturesManager.get_model_from_feature('default', model_name)
    _model_kind, make_onnx_config = FeaturesManager.check_supported_model_or_raise(hf_model, feature='default')
    export_config = make_onnx_config(hf_model.config)
    # Fetch the matching tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    core = ov.Core()

    # The .onnx file is only needed until OpenVINO has read it,
    # so it lives in a temporary directory.
    with tempfile.TemporaryDirectory() as export_dir:
        onnx_file = Path(export_dir) / f'{model_name}.onnx'
        export(tokenizer, hf_model, export_config, export_config.default_onnx_opset, onnx_file)
        model = core.read_model(onnx_file)

    # Give every input the shape [1, ?] so sentences of any length are accepted.
    try:
        dynamic_shapes = {}
        for input_port in model.inputs:
            dynamic_shapes[input_port.any_name] = ov.PartialShape([1, '?'])
        model.reshape(dynamic_shapes)
    except RuntimeError:
        log.error("Can't set dynamic shape")
        raise

    # Optimize for throughput: the best throughput is reached by running
    # several infer requests asynchronously.
    throughput_config = {'PERFORMANCE_HINT': 'THROUGHPUT'}
    # Replace CPU to pick another device, for example MULTI:CPU(4),GPU(8).
    # For the AUTO device, CUMULATIVE_THROUGHPUT can be used as PERFORMANCE_HINT.
    compiled_model = core.compile_model(model, 'CPU', throughput_config)
    # No job count is passed, so the queue chooses the number of infer requests.
    infer_queue = ov.AsyncInferQueue(compiled_model)

    sst2_sentences = datasets.load_dataset('glue', 'sst2')['validation']['sentence']

    # Warm up: submit one job per infer request in the queue.
    warm_up_inputs = dict(tokenizer('Warm up sentence is here.', return_tensors='np'))
    for _ in range(len(infer_queue)):
        infer_queue.start_async(warm_up_inputs)
    infer_queue.wait_all()

    # Benchmark: tokenization is part of the timed region.
    sum_seq_len = 0
    start = perf_counter()
    for sentence in sst2_sentences:
        model_inputs = dict(tokenizer(sentence, return_tensors='np'))
        # The size of the first tensor is used as the sequence length for the average.
        first_tensor = next(iter(model_inputs.values()))
        sum_seq_len += first_tensor.size
        infer_queue.start_async(model_inputs)
    infer_queue.wait_all()
    duration = perf_counter() - start

    log.info(f'Average sequence length: {sum_seq_len / len(sst2_sentences):.2f}')
    log.info(f'Average processing time: {duration / len(sst2_sentences) * 1e3:.2f} ms')
    log.info(f'Duration:                {duration:.2f} seconds')

# Run the benchmark only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

You can find a detailed description of each sample step in the Integration Steps section of the “Integrate OpenVINO™ Runtime with Your Application” guide.

Running

  1. Install the openvino Python package:

    python -m pip install openvino
    
  2. Install packages from requirements.txt:

    python -m pip install -r requirements.txt
    
  3. Run the sample:

    python bert_benchmark.py
    

Sample Output

The sample outputs how long it takes to process a dataset.