16 #define IE_THREAD_TBB 0
17 #define IE_THREAD_OMP 1
18 #define IE_THREAD_SEQ 2
19 #define IE_THREAD_TBB_AUTO 3
21 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
22 #define TBB_PREVIEW_LOCAL_OBSERVER 1
23 #include "tbb/task_scheduler_observer.h"
24 #include "tbb/parallel_for.h"
25 #include "tbb/task_arena.h"
27 #include "tbb/parallel_sort.h"
28 #include "tbb/parallel_reduce.h"
29 #include "tbb/blocked_range.h"
30 #include "tbb/blocked_range2d.h"
31 #include "tbb/blocked_range3d.h"
33 inline int parallel_get_max_threads() {
return tbb::this_task_arena::max_concurrency(); }
34 inline int parallel_get_num_threads() {
return parallel_get_max_threads(); }
35 inline int parallel_get_thread_num() {
return tbb::this_task_arena::current_thread_index(); }
36 inline void parallel_set_num_threads(
int n) {
return; }
37 inline int parallel_get_env_threads() {
return 0; }
38 #if IE_THREAD == IE_THREAD_TBB
39 #define PARTITIONING , tbb::static_partitioner()
43 #elif IE_THREAD == IE_THREAD_OMP
51 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
53 #endif // defined(_MSC_VER) && !defined(__INTEL_COMPILER)
54 inline int parallel_get_max_threads() {
return omp_get_max_threads(); }
55 inline int parallel_get_num_threads() {
return omp_get_num_threads(); }
56 inline int parallel_get_thread_num() {
return omp_get_thread_num(); }
57 inline void parallel_set_num_threads(
int n) { omp_set_num_threads(n); }
// OpenMP backend: parses the OMP_NUM_THREADS environment variable into env_cores.
// NOTE(review): this listing is missing lines of the body (the declaration of
// env_cores, the opening `try {`, the catch body and the return statement) —
// confirm the fallback value against the complete source.
58 inline int parallel_get_env_threads() {
// Only parse when the variable is actually set.
60 if (getenv(
"OMP_NUM_THREADS") !=
nullptr) {
// std::stoi throws on non-numeric or out-of-range text; the handler below
// catches anything derived from std::exception.
62 env_cores = std::stoi(getenv(
"OMP_NUM_THREADS"));
63 }
catch (
const std::exception&) {
70 #elif IE_THREAD == IE_THREAD_SEQ
// Thread-query helpers for the sequential backend (IE_THREAD_SEQ): there is
// exactly one thread, and its index is 0.

// Always 1.
inline int parallel_get_env_threads() {
    return 1;
}

// Always 1.
inline int parallel_get_max_threads() {
    return 1;
}

// Always 1.
inline int parallel_get_num_threads() {
    return 1;
}

// Always 0.
inline int parallel_get_thread_num() {
    return 0;
}

// No-op: the requested thread count is ignored on this backend.
inline void parallel_set_num_threads(int /*n*/) {}
// Invokes func on a team of threads, with the backend chosen by IE_THREAD.
//   nthr - requested team size; in the TBB branch 0 is replaced by
//          parallel_get_max_threads()
//   func - callable; the OpenMP branch calls it as func(thread_num, num_threads)
// NOTE(review): the template header and the bodies of the TBB and SEQ branches
// are missing from this listing — confirm their behavior in the full source.
83 void parallel_nt(
int nthr,
const F &func) {
84 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
85 if (nthr == 0) nthr = parallel_get_max_threads();
// One parallel_for iteration per logical thread index in [0, nthr).
91 tbb::parallel_for(0, nthr, [&](
int ithr) {
94 #elif IE_THREAD == IE_THREAD_OMP
// Parallel region sized by nthr; each thread reports its own number and the team size.
100 # pragma omp parallel num_threads(nthr)
101 func(parallel_get_thread_num(), parallel_get_num_threads());
102 #elif IE_THREAD == IE_THREAD_SEQ
// Variant of parallel_nt whose TBB branch passes tbb::static_partitioner to parallel_for.
//   nthr - requested team size; 0 is replaced by parallel_get_max_threads()
//   func - callable; the OpenMP branch calls it as func(thread_num, num_threads)
// NOTE(review): several lines are missing from this listing (the #else/#endif
// around `serial`, the body of the serial branch, the TBB lambda body) — confirm
// against the full source.
107 template <
typename F>
108 void parallel_nt_static(
int nthr,
const F &func) {
// `serial` is a compile-time flag: true only for the sequential backend.
109 #if IE_THREAD == IE_THREAD_SEQ
110 const bool serial =
true;
112 const bool serial =
false;
// Serial path is taken for the sequential backend or when a single thread is requested.
115 if (serial || nthr == 1) {
120 if (nthr == 0) nthr = parallel_get_max_threads();
121 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
// One parallel_for iteration per logical thread index in [0, nthr).
122 tbb::parallel_for(0, nthr, [&](
int ithr) {
125 , tbb::static_partitioner{});
127 #elif IE_THREAD == IE_THREAD_OMP
129 # pragma omp parallel num_threads(nthr)
131 func(parallel_get_thread_num(), parallel_get_num_threads());
136 template <
typename I,
typename F>
137 void parallel_sort(I begin, I end,
const F &comparator) {
138 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
139 tbb::parallel_sort(begin, end, comparator);
140 #elif IE_THREAD == IE_THREAD_OMP
142 std::sort(begin, end, comparator);
143 #elif IE_THREAD == IE_THREAD_SEQ
144 std::sort(begin, end, comparator);
// Reduction over a 1-D index range [0, D0): accumulates func(i) values.
//   D0    - extent of the range
//   input - passed to tbb::parallel_reduce as the identity/initial value in the TBB branch
//   func  - callable invoked with a single index
// NOTE(review): the accumulator setup, the TBB join step and the return are
// missing from this listing — confirm how `input` is used outside the TBB branch.
148 template <
typename T0,
typename R,
typename F>
149 R parallel_sum(
const T0 &D0,
const R &input,
const F &func) {
150 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
151 return tbb::parallel_reduce(
152 tbb::blocked_range<T0>(0, D0), input,
153 [&](
const tbb::blocked_range<T0>& r, R init)->R {
155 for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1)
// Non-TBB path iterates with the signed counterpart of T0.
166 using T0_IT =
typename std::make_signed<T0>::type;
171 #if IE_THREAD == IE_THREAD_OMP
172 #pragma omp parallel for reduction(+ : sum) schedule(static)
174 for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
// Each term is converted to R before being added to the running sum.
175 sum +=
static_cast<R
>(func(dim1));
// Reduction over a 2-D index space [0, D0) x [0, D1): accumulates func(i, j) values.
//   D0, D1 - extents of the two dimensions (D0 maps to rows, D1 to cols in the TBB range)
//   input  - passed to tbb::parallel_reduce as the identity/initial value in the TBB branch
//   func   - callable invoked with (row index, column index)
// NOTE(review): the accumulator setup, the TBB join step and the return are
// missing from this listing — confirm against the full source.
181 template <
typename T0,
typename T1,
typename R,
typename F>
182 R parallel_sum2d(
const T0 &D0,
const T1 &D1,
const R &input,
const F &func) {
183 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
184 return tbb::parallel_reduce(
185 tbb::blocked_range2d<T0, T1>(0, D0, 0, D1), input,
186 [&](
const tbb::blocked_range2d<T0, T1>& r, R init)->R {
188 for (T0 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
189 for (T1 dim1 = r.cols().begin(); dim1 < r.cols().end(); dim1++) {
190 sum += func(dim2, dim1);
// Non-TBB path iterates with the signed counterparts of T0 and T1.
202 using T0_IT =
typename std::make_signed<T0>::type;
203 using T1_IT =
typename std::make_signed<T1>::type;
209 #if IE_THREAD == IE_THREAD_OMP
210 #pragma omp parallel for collapse(2) reduction(+ : sum) schedule(static)
// NOTE(review): these bounds compare the signed counters with D0/D1 directly,
// whereas parallel_sum and parallel_sum3d cast the extent to the signed type
// first — for unsigned T0/T1 this is a signed/unsigned comparison; consider
// aligning with the sibling functions.
212 for (T0_IT dim2 = 0; dim2 < D0; dim2++) {
213 for (T1_IT dim1 = 0; dim1 < D1; dim1++) {
214 sum += func(dim2, dim1);
// Reduction over a 3-D index space [0, D0) x [0, D1) x [0, D2): accumulates func(i, j, k) values.
//   D0, D1, D2 - extents (mapped to pages, rows and cols of the TBB range)
//   input      - passed to tbb::parallel_reduce as the identity/initial value in the TBB branch
//   func       - callable invoked with (page index, row index, column index)
// NOTE(review): the accumulator setup, the TBB join step and the return are
// missing from this listing — confirm against the full source.
220 template <
typename T0,
typename T1,
typename T2,
typename R,
typename F>
221 R parallel_sum3d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const R &input,
const F &func) {
222 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
223 return tbb::parallel_reduce(
224 tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2), input,
225 [&](
const tbb::blocked_range3d<T0, T1, T2>& r, R init)->R {
227 for (T0 dim1 = r.pages().begin(); dim1 < r.pages().end(); dim1++) {
228 for (T1 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
229 for (T2 dim3 = r.cols().begin(); dim3 < r.cols().end(); dim3++) {
230 sum += func(dim1, dim2, dim3);
// Non-TBB path iterates with the signed counterparts of T0, T1 and T2.
243 using T0_IT =
typename std::make_signed<T0>::type;
244 using T1_IT =
typename std::make_signed<T1>::type;
245 using T2_IT =
typename std::make_signed<T2>::type;
252 #if IE_THREAD == IE_THREAD_OMP
253 #pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static)
// Extents are cast to the signed counter type so each comparison is signed-vs-signed.
255 for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
256 for (T1_IT dim2 = 0; dim2 < static_cast<T1_IT>(D1); dim2++) {
257 for (T2_IT dim3 = 0; dim3 < static_cast<T2_IT>(D2); dim3++) {
258 sum += func(dim1, dim2, dim3);
// Base case of parallel_it_init: with no (index, extent) pairs left to
// process, the remaining offset is returned unchanged.
template <typename T>
inline T parallel_it_init(T start) {
    return start;
}
// Recursive case of parallel_it_init: takes one (x, X) pair plus the remaining
// arguments, and first recurses on the remaining arguments so that trailing
// pairs are processed before this one.
//   start - offset being consumed; replaced by the result of the recursive call
//   x, X  - output index and its extent for this level (presumed from naming and
//           from how the for_Nd helpers pass (d, D) pairs — TODO confirm)
// NOTE(review): the rest of the body (how x is derived from start and X, and
// the return value) is missing from this listing.
268 template<
typename T,
typename Q,
typename R,
typename... Args>
269 inline T parallel_it_init(T start, Q &x,
const R &X, Args &&... tuple) {
270 start = parallel_it_init(start, static_cast<Args>(tuple)...);
// Base case of parallel_it_step: with no (index, extent) pairs left, always
// returns true.
inline bool parallel_it_step() {
    return true;
}
// Recursive case of parallel_it_step: steps the trailing (index, extent) pairs
// first and acts on this level's x only when that recursive call returns true.
// NOTE(review): the body of the `if` and the return statements are missing
// from this listing — the exact update of x and the returned flag (presumably
// a carry/wrap indicator) must be confirmed in the full source.
276 template<
typename Q,
typename R,
typename... Args>
277 inline bool parallel_it_step(Q &x,
const R &X, Args &&... tuple) {
278 if (parallel_it_step(static_cast<Args>(tuple)...)) {
// Computes the slice of n work items assigned to thread `tid` out of `team`
// threads, writing the result through n_start / n_end.
//   n       - total number of work items
//   team    - number of threads sharing the work
//   tid     - index of the thread whose slice is requested
//   n_start - output: first item of the slice
//   n_end   - output: written here as a chunk size (n1 or n2)
// NOTE(review): the body of the first branch, the definition of n2 and the end
// of the function are missing from this listing; callers loop over
// [n_start, n_end), so n_end is presumably converted to an end offset in the
// missing tail — confirm.
285 template <
typename T,
typename Q>
286 inline void splitter(
const T &n,
const Q &team,
const Q &tid, T &n_start, T &n_end) {
// Degenerate case: a single thread (or none) or no work at all.
287 if (team <= 1 || n == 0) {
// n1 is the chunk size rounded up: ceil(n / team).
291 T n1 = (n + (T)team - 1) / (T)team;
// T1 is the threshold thread index: threads below it get n1 items, the rest n2.
293 T T1 = n - n2 * (T)team;
294 n_end = (T)tid < T1 ? n1 : n2;
// Start offset: the first T1 threads are spaced n1 apart, later threads n2 apart.
295 n_start = (T)tid <= T1 ? tid * n1 : T1 * n1 + ((T)tid - T1) * n2;
// Runs this thread's share of a 1-D iteration space [0, D0).
//   ithr, nthr - index of the calling thread and size of the team
//   D0         - extent of the iteration space
//   func       - callable invoked as func(d0) for every index in the share
// splitter() is expected to fill in the half-open slice [d0, end) for thread ithr.
template <typename T0, typename F>
void for_1d(const int &ithr, const int &nthr, const T0 &D0, const F &func) {
    T0 d0{0};
    T0 end{0};
    splitter(D0, nthr, ithr, d0, end);
    for (; d0 < end; ++d0) {
        func(d0);
    }
}
// Parallel loop over the 1-D range [0, D0): every backend ends up calling
// for_1d(thread index, thread count, D0, func).
//   D0   - extent of the iteration space
//   func - callable forwarded to for_1d
// NOTE(review): several control-flow lines are missing from this listing (the
// condition guarding the single-threaded call, the else, the closing of the
// TBB_AUTO parallel_for call, #endif) — confirm against the full source.
309 template <
typename T0,
typename F>
310 void parallel_for(
const T0 &D0,
const F &func) {
311 #if IE_THREAD == IE_THREAD_TBB
312 auto work_amount =
static_cast<size_t>(D0);
313 int nthr = parallel_get_max_threads();
// Never use more threads than there are work items.
314 if (static_cast<size_t>(nthr) > work_amount)
315 nthr =
static_cast<int>(work_amount);
// Single-threaded path: run the whole range on the calling thread.
317 for_1d(0, 1, D0, func);
// Multi-threaded path: one statically partitioned iteration per thread index.
319 tbb::parallel_for(0, nthr, [&](
int ithr) {
320 for_1d(ithr, nthr, D0, func);
321 }, tbb::static_partitioner());
323 #elif IE_THREAD == IE_THREAD_TBB_AUTO
// No clamping to the work amount in this branch.
324 const int nthr = parallel_get_max_threads();
325 tbb::parallel_for(0, nthr, [&](
int ithr) {
326 for_1d(ithr, nthr, D0, func);
328 #elif IE_THREAD == IE_THREAD_OMP
// Every thread of the parallel region processes its own slice.
329 # pragma omp parallel
330 for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func);
331 #elif IE_THREAD == IE_THREAD_SEQ
332 for_1d(0, 1, D0, func);
// Runs this thread's share of a 2-D iteration space D0 x D1, flattened to
// D0 * D1 work items.
//   ithr, nthr - index of the calling thread and size of the team
//   D0, D1     - extents of the two dimensions
//   func       - user callable
// NOTE(review): the line between parallel_it_init and parallel_it_step inside
// the loop is missing from this listing; for_4d and for_5d call
// func(d0, ...) at that position, so presumably this does too — confirm.
337 template <
typename T0,
typename T1,
typename F>
338 void for_2d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
const F &func) {
// D0 is widened to size_t before multiplying.
339 const size_t work_amount = (size_t)D0 * D1;
340 if (work_amount == 0)
return;
341 size_t start{ 0 }, end{ 0 };
// splitter() fills in this thread's slice of the flattened range.
342 splitter(work_amount, nthr, ithr, start, end);
344 T0 d0{ 0 }; T1 d1{ 0 };
// Seed (d0, d1) from the linear offset `start`.
345 parallel_it_init(start, d0, D0, d1, D1);
346 for (
size_t iwork = start; iwork < end; ++iwork) {
// Advance (d0, d1) to the next point of the iteration space.
348 parallel_it_step(d0, D0, d1, D1);
// Parallel loop over the 2-D space D0 x D1: every backend ends up calling
// for_2d(thread index, thread count, D0, D1, func).
//   D0, D1 - extents of the two dimensions
//   func   - callable forwarded to for_2d
// NOTE(review): several control-flow lines are missing from this listing (the
// condition guarding the single-threaded call, the else, the closing of the
// TBB_AUTO parallel_for call, #endif) — confirm against the full source.
352 template <
typename T0,
typename T1,
typename F>
353 void parallel_for2d(
const T0 &D0,
const T1 &D1,
const F &func) {
354 #if IE_THREAD == IE_THREAD_TBB
// NOTE(review): the product is evaluated in the operands' own type and only
// then cast to size_t, whereas for_2d widens D0 first — for narrow integer
// types the multiplication here could overflow before the cast.
355 auto work_amount =
static_cast<size_t>(D0 * D1);
356 int nthr = parallel_get_max_threads();
// Never use more threads than there are work items.
357 if (static_cast<size_t>(nthr) > work_amount)
358 nthr =
static_cast<int>(work_amount);
// Single-threaded path: run the whole space on the calling thread.
360 for_2d(0, 1, D0, D1, func);
// Multi-threaded path: one statically partitioned iteration per thread index.
362 tbb::parallel_for(0, nthr, [&](
int ithr) {
363 for_2d(ithr, nthr, D0, D1, func);
364 }, tbb::static_partitioner());
366 #elif IE_THREAD == IE_THREAD_TBB_AUTO
// No clamping to the work amount in this branch.
367 const int nthr = parallel_get_max_threads();
368 tbb::parallel_for(0, nthr, [&](
int ithr) {
369 for_2d(ithr, nthr, D0, D1, func);
371 #elif IE_THREAD == IE_THREAD_OMP
// Every thread of the parallel region processes its own slice.
372 # pragma omp parallel
373 for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func);
374 #elif IE_THREAD == IE_THREAD_SEQ
375 for_2d(0, 1, D0, D1, func);
// Runs this thread's share of a 3-D iteration space D0 x D1 x D2, flattened to
// D0 * D1 * D2 work items.
//   ithr, nthr - index of the calling thread and size of the team
//   D0..D2     - extents of the three dimensions
//   func       - user callable
// NOTE(review): the line between parallel_it_init and parallel_it_step inside
// the loop is missing from this listing; for_4d and for_5d call
// func(d0, ...) at that position, so presumably this does too — confirm.
380 template <
typename T0,
typename T1,
typename T2,
typename F>
381 void for_3d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
382 const T2 &D2,
const F &func) {
// D0 is widened to size_t before multiplying.
383 const size_t work_amount = (size_t)D0 * D1 * D2;
384 if (work_amount == 0)
return;
385 size_t start{ 0 }, end{ 0 };
// splitter() fills in this thread's slice of the flattened range.
386 splitter(work_amount, nthr, ithr, start, end);
388 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 };
// Seed (d0, d1, d2) from the linear offset `start`.
389 parallel_it_init(start, d0, D0, d1, D1, d2, D2);
390 for (
size_t iwork = start; iwork < end; ++iwork) {
// Advance (d0, d1, d2) to the next point of the iteration space.
392 parallel_it_step(d0, D0, d1, D1, d2, D2);
// Parallel loop over the 3-D space D0 x D1 x D2: every backend ends up calling
// for_3d(thread index, thread count, D0, D1, D2, func).
//   D0..D2 - extents of the three dimensions
//   func   - callable forwarded to for_3d
// NOTE(review): several control-flow lines are missing from this listing (the
// condition guarding the single-threaded call, the else, the closing of the
// TBB_AUTO parallel_for call, #endif) — confirm against the full source.
396 template <
typename T0,
typename T1,
typename T2,
typename F>
397 void parallel_for3d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const F &func) {
398 #if IE_THREAD == IE_THREAD_TBB
// NOTE(review): the product is evaluated in the operands' own type and only
// then cast to size_t, whereas for_3d widens D0 first — for narrow integer
// types the multiplication here could overflow before the cast.
399 auto work_amount =
static_cast<size_t>(D0 * D1 * D2);
400 int nthr = parallel_get_max_threads();
// Never use more threads than there are work items.
401 if (static_cast<size_t>(nthr) > work_amount)
402 nthr =
static_cast<int>(work_amount);
// Single-threaded path: run the whole space on the calling thread.
404 for_3d(0, 1, D0, D1, D2, func);
// Multi-threaded path: one statically partitioned iteration per thread index.
406 tbb::parallel_for(0, nthr, [&](
int ithr) {
407 for_3d(ithr, nthr, D0, D1, D2, func);
408 }, tbb::static_partitioner());
410 #elif IE_THREAD == IE_THREAD_TBB_AUTO
// No clamping to the work amount in this branch.
411 const int nthr = parallel_get_max_threads();
412 tbb::parallel_for(0, nthr, [&](
int ithr) {
413 for_3d(ithr, nthr, D0, D1, D2, func);
415 #elif IE_THREAD == IE_THREAD_OMP
// Every thread of the parallel region processes its own slice.
416 # pragma omp parallel
417 for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func);
418 #elif IE_THREAD == IE_THREAD_SEQ
419 for_3d(0, 1, D0, D1, D2, func);
423 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename F>
424 void for_4d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
425 const T2 &D2,
const T3 &D3,
const F &func) {
426 const size_t work_amount = (size_t)D0 * D1 * D2 * D3;
427 if (work_amount == 0)
return;
428 size_t start{ 0 }, end{ 0 };
429 splitter(work_amount, nthr, ithr, start, end);
431 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 };
432 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3);
433 for (
size_t iwork = start; iwork < end; ++iwork) {
434 func(d0, d1, d2, d3);
435 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3);
// Parallel loop over the 4-D space D0 x D1 x D2 x D3: every backend ends up
// calling for_4d(thread index, thread count, D0, D1, D2, D3, func).
//   D0..D3 - extents of the four dimensions
//   func   - callable forwarded to for_4d
// NOTE(review): several control-flow lines are missing from this listing (the
// condition guarding the single-threaded call, the else, the closing of the
// TBB_AUTO parallel_for call, #endif) — confirm against the full source.
439 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename F>
440 void parallel_for4d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const T3 &D3,
const F &func) {
441 #if IE_THREAD == IE_THREAD_TBB
// NOTE(review): the product is evaluated in the operands' own type and only
// then cast to size_t, whereas for_4d widens D0 first — for narrow integer
// types the multiplication here could overflow before the cast.
442 auto work_amount =
static_cast<size_t>(D0 * D1 * D2 * D3);
443 int nthr = parallel_get_max_threads();
// Never use more threads than there are work items.
444 if (static_cast<size_t>(nthr) > work_amount)
445 nthr =
static_cast<int>(work_amount);
// Single-threaded path: run the whole space on the calling thread.
447 for_4d(0, 1, D0, D1, D2, D3, func);
// Multi-threaded path: one statically partitioned iteration per thread index.
449 tbb::parallel_for(0, nthr, [&](
int ithr) {
450 for_4d(ithr, nthr, D0, D1, D2, D3, func);
451 }, tbb::static_partitioner());
453 #elif IE_THREAD == IE_THREAD_TBB_AUTO
// No clamping to the work amount in this branch.
454 const int nthr = parallel_get_max_threads();
455 tbb::parallel_for(0, nthr, [&](
int ithr) {
456 for_4d(ithr, nthr, D0, D1, D2, D3, func);
458 #elif IE_THREAD == IE_THREAD_OMP
// Every thread of the parallel region processes its own slice.
459 # pragma omp parallel
460 for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func);
461 #elif IE_THREAD == IE_THREAD_SEQ
462 for_4d(0, 1, D0, D1, D2, D3, func);
466 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename F>
467 void for_5d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
468 const T2 &D2,
const T3 &D3,
const T4 &D4,
const F &func) {
469 const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4;
470 if (work_amount == 0)
return;
471 size_t start{ 0 }, end{ 0 };
472 splitter(work_amount, nthr, ithr, start, end);
474 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 }; T4 d4{ 0 };
475 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
476 for (
size_t iwork = start; iwork < end; ++iwork) {
477 func(d0, d1, d2, d3, d4);
478 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
// Parallel loop over the 5-D space D0 x D1 x D2 x D3 x D4: every backend ends
// up calling for_5d(thread index, thread count, D0, D1, D2, D3, D4, func).
//   D0..D4 - extents of the five dimensions
//   func   - callable forwarded to for_5d
// NOTE(review): several control-flow lines are missing from this listing (the
// condition guarding the single-threaded call, the else, the closing of the
// TBB_AUTO parallel_for call, #endif) — confirm against the full source.
482 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename F>
483 void parallel_for5d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const T3 &D3,
484 const T4 &D4,
const F &func) {
485 #if IE_THREAD == IE_THREAD_TBB
// NOTE(review): the product is evaluated in the operands' own type and only
// then cast to size_t, whereas for_5d widens D0 first — for narrow integer
// types the multiplication here could overflow before the cast.
486 auto work_amount =
static_cast<size_t>(D0 * D1 * D2 * D3 * D4);
487 int nthr = parallel_get_max_threads();
// Never use more threads than there are work items.
488 if (static_cast<size_t>(nthr) > work_amount)
489 nthr =
static_cast<int>(work_amount);
// Single-threaded path: run the whole space on the calling thread.
491 for_5d(0, 1, D0, D1, D2, D3, D4, func);
// Multi-threaded path: one statically partitioned iteration per thread index.
493 tbb::parallel_for(0, nthr, [&](
int ithr) {
494 for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
495 }, tbb::static_partitioner());
497 #elif IE_THREAD == IE_THREAD_TBB_AUTO
// No clamping to the work amount in this branch.
498 const int nthr = parallel_get_max_threads();
499 tbb::parallel_for(0, nthr, [&](
int ithr) {
500 for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
502 #elif IE_THREAD == IE_THREAD_OMP
// Every thread of the parallel region processes its own slice.
503 # pragma omp parallel
504 for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func);
505 #elif IE_THREAD == IE_THREAD_SEQ
506 for_5d(0, 1, D0, D1, D2, D3, D4, func);
Inference Engine API.
Definition: ie_argmax_layer.hpp:11