// Identifiers of the supported threading backends. The preprocessor
// conditionals in this file compare IE_THREAD against these values to pick
// an implementation.
// TBB backend.
16 #define IE_THREAD_TBB 0
// OpenMP backend.
17 #define IE_THREAD_OMP 1
// Sequential backend: the thread-count helpers return constants.
18 #define IE_THREAD_SEQ 2
// Second TBB flavour; it shares the TBB helper functions but has its own
// branch in the parallel_for* functions.
19 #define IE_THREAD_TBB_AUTO 3
21 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
22 #define TBB_PREVIEW_LOCAL_OBSERVER 1
23 #include "tbb/task_scheduler_observer.h"
24 #include "tbb/parallel_for.h"
25 #include "tbb/task_arena.h"
27 #include "tbb/parallel_reduce.h"
28 #include "tbb/blocked_range.h"
29 #include "tbb/blocked_range2d.h"
30 #include "tbb/blocked_range3d.h"
32 inline int parallel_get_max_threads() {
return tbb::this_task_arena::max_concurrency(); }
33 inline int parallel_get_num_threads() {
return parallel_get_max_threads(); }
34 inline int parallel_get_thread_num() {
return tbb::this_task_arena::current_thread_index(); }
/**
 * @brief TBB backend: accepts a thread-count request and does nothing with it.
 * @param n Requested number of threads; ignored.
 */
inline void parallel_set_num_threads(int n) {
    static_cast<void>(n);  // the request is deliberately dropped
}
/**
 * @brief TBB backend: environment-provided thread count.
 * @return Always 0.
 */
inline int parallel_get_env_threads() {
    const int env_threads = 0;
    return env_threads;
}
37 #if IE_THREAD == IE_THREAD_TBB
38 #define PARTITIONING , tbb::static_partitioner()
42 #elif IE_THREAD == IE_THREAD_OMP
49 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
51 #endif // defined(_MSC_VER) && !defined(__INTEL_COMPILER)
52 inline int parallel_get_max_threads() {
return omp_get_max_threads(); }
53 inline int parallel_get_num_threads() {
return omp_get_num_threads(); }
54 inline int parallel_get_thread_num() {
return omp_get_thread_num(); }
55 inline void parallel_set_num_threads(
int n) { omp_set_num_threads(n); }
// OpenMP backend: parses the OMP_NUM_THREADS environment variable.
// NOTE(review): only part of this definition is visible in this listing; the
// declaration of env_cores, the opening `try`, the handler body and the
// return statement are not shown.
56 inline int parallel_get_env_threads() {
// Parse only when the variable is set.
58 if (getenv(
"OMP_NUM_THREADS") !=
nullptr) {
// std::stoi throws on malformed or out-of-range input; the handler below
// catches std::exception.
60 env_cores = std::stoi(getenv(
"OMP_NUM_THREADS"));
61 }
catch (
const std::exception&) {
68 #elif IE_THREAD == IE_THREAD_SEQ
/**
 * @brief Sequential backend: environment-provided thread count.
 * @return Always 1.
 */
inline int parallel_get_env_threads() {
    const int env_threads = 1;
    return env_threads;
}
/**
 * @brief Sequential backend: maximum number of threads.
 * @return Always 1.
 */
inline int parallel_get_max_threads() {
    const int max_threads = 1;
    return max_threads;
}
/**
 * @brief Sequential backend: number of threads.
 * @return Always 1.
 */
inline int parallel_get_num_threads() {
    const int thread_count = 1;
    return thread_count;
}
/**
 * @brief Sequential backend: index of the calling thread.
 * @return Always 0.
 */
inline int parallel_get_thread_num() {
    const int thread_index = 0;
    return thread_index;
}
/**
 * @brief Sequential backend: accepts a thread-count request and does nothing with it.
 * @param n Requested number of threads; ignored.
 */
inline void parallel_set_num_threads(int n) {
    static_cast<void>(n);  // the request is deliberately dropped
}
// Runs func in parallel with a requested team size of nthr. The OpenMP branch
// shows func being called as func(thread index, thread count).
// NOTE(review): the template header declaring F, the TBB lambda body and the
// sequential branch are not visible in this listing.
80 void parallel_nt(
int nthr,
const F &func) {
81 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
// nthr == 0 selects the backend maximum.
82 if (nthr == 0) nthr = parallel_get_max_threads();
// One TBB iteration per index in [0, nthr).
88 tbb::parallel_for(0, nthr, [&](
int ithr) {
91 #elif IE_THREAD == IE_THREAD_OMP
// Every thread of the OpenMP team reports its own index and the team size.
97 # pragma omp parallel num_threads(nthr)
98 func(parallel_get_thread_num(), parallel_get_num_threads());
99 #elif IE_THREAD == IE_THREAD_SEQ
// Variant of parallel_nt whose TBB branch passes tbb::static_partitioner to
// tbb::parallel_for.
// NOTE(review): several lines of this definition (the body of the serial
// path, the lambda body, closing directives) are not visible in this listing.
104 template <
typename F>
105 void parallel_nt_static(
int nthr,
const F &func) {
// `serial` is true in the sequential build; the second definition presumably
// belongs to the #else branch, whose directive line is not visible.
106 #if IE_THREAD == IE_THREAD_SEQ
107 const bool serial =
true;
109 const bool serial =
false;
// Serial path: taken in the sequential build or when one thread is requested.
112 if (serial || nthr == 1) {
// nthr == 0 selects the backend maximum.
117 if (nthr == 0) nthr = parallel_get_max_threads();
118 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
// One TBB iteration per index in [0, nthr), with a static partitioner.
119 tbb::parallel_for(0, nthr, [&](
int ithr) {
122 , tbb::static_partitioner{});
124 #elif IE_THREAD == IE_THREAD_OMP
// Every thread of the OpenMP team reports its own index and the team size.
126 # pragma omp parallel num_threads(nthr)
128 func(parallel_get_thread_num(), parallel_get_num_threads());
// Accumulates func(i) over i in [0, D0) and returns a value of type R.
// NOTE(review): the declaration of `sum`, the reduction functor of the TBB
// branch and the return statement are not visible in this listing.
133 template <
typename T0,
typename R,
typename F>
134 R parallel_sum(
const T0 &D0,
const R &input,
const F &func) {
135 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
// TBB branch: reduce over the range [0, D0) with `input` as the identity value.
136 return tbb::parallel_reduce(
137 tbb::blocked_range<T0>(0, D0), input,
138 [&](
const tbb::blocked_range<T0>& r, R init)->R {
140 for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1)
// Signed counterpart of T0, used as the loop index type below (presumably
// because some OpenMP implementations require a signed loop index - confirm).
151 using T0_IT =
typename std::make_signed<T0>::type;
// The reduction pragma is applied only in the OpenMP build.
156 #if IE_THREAD == IE_THREAD_OMP
157 #pragma omp parallel for reduction(+ : sum) schedule(static)
159 for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
// Each term is converted to R before being added.
160 sum +=
static_cast<R
>(func(dim1));
// Accumulates func(i, j) over i in [0, D0), j in [0, D1) and returns a value
// of type R.
// NOTE(review): the declaration of `sum`, the reduction functor of the TBB
// branch and the return statement are not visible in this listing.
166 template <
typename T0,
typename T1,
typename R,
typename F>
167 R parallel_sum2d(
const T0 &D0,
const T1 &D1,
const R &input,
const F &func) {
168 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
// TBB branch: reduce over the 2-D range with `input` as the identity value.
169 return tbb::parallel_reduce(
170 tbb::blocked_range2d<T0, T1>(0, D0, 0, D1), input,
171 [&](
const tbb::blocked_range2d<T0, T1>& r, R init)->R {
// rows() iterates the first extent (D0), cols() the second (D1).
173 for (T0 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
174 for (T1 dim1 = r.cols().begin(); dim1 < r.cols().end(); dim1++) {
175 sum += func(dim2, dim1);
// Signed counterparts of the extent types, used as loop index types below.
187 using T0_IT =
typename std::make_signed<T0>::type;
188 using T1_IT =
typename std::make_signed<T1>::type;
// The reduction pragma is applied only in the OpenMP build.
194 #if IE_THREAD == IE_THREAD_OMP
195 #pragma omp parallel for collapse(2) reduction(+ : sum) schedule(static)
// NOTE(review): the bounds D0/D1 are compared to the signed indices without
// the static_cast used by parallel_sum and parallel_sum3d, so unsigned
// extents give a signed/unsigned comparison; the result is also not cast to R.
197 for (T0_IT dim2 = 0; dim2 < D0; dim2++) {
198 for (T1_IT dim1 = 0; dim1 < D1; dim1++) {
199 sum += func(dim2, dim1);
// Accumulates func(i, j, k) over i in [0, D0), j in [0, D1), k in [0, D2) and
// returns a value of type R.
// NOTE(review): the declaration of `sum`, the reduction functor of the TBB
// branch and the return statement are not visible in this listing.
205 template <
typename T0,
typename T1,
typename T2,
typename R,
typename F>
206 R parallel_sum3d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const R &input,
const F &func) {
207 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
// TBB branch: reduce over the 3-D range with `input` as the identity value.
208 return tbb::parallel_reduce(
209 tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2), input,
210 [&](
const tbb::blocked_range3d<T0, T1, T2>& r, R init)->R {
// pages() iterates D0, rows() D1 and cols() D2.
212 for (T0 dim1 = r.pages().begin(); dim1 < r.pages().end(); dim1++) {
213 for (T1 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
214 for (T2 dim3 = r.cols().begin(); dim3 < r.cols().end(); dim3++) {
215 sum += func(dim1, dim2, dim3);
// Signed counterparts of the extent types, used as loop index types below.
228 using T0_IT =
typename std::make_signed<T0>::type;
229 using T1_IT =
typename std::make_signed<T1>::type;
230 using T2_IT =
typename std::make_signed<T2>::type;
// The reduction pragma is applied only in the OpenMP build.
237 #if IE_THREAD == IE_THREAD_OMP
238 #pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static)
// Extents are cast to the signed index types before comparison.
240 for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
241 for (T1_IT dim2 = 0; dim2 < static_cast<T1_IT>(D1); dim2++) {
242 for (T2_IT dim3 = 0; dim3 < static_cast<T2_IT>(D2); dim3++) {
243 sum += func(dim1, dim2, dim3);
// Terminating overload of parallel_it_init: with no (counter, extent) pairs
// left, returns `start` unchanged.
// NOTE(review): the template header declaring T is not visible in this listing.
252 inline T parallel_it_init(T start) {
return start; }
// Recursive overload of parallel_it_init taking a counter `x`, its extent `X`
// and any further (counter, extent) pairs.
// NOTE(review): the statements that assign `x` and produce the return value
// are not visible in this listing.
253 template<
typename T,
typename Q,
typename R,
typename... Args>
254 inline T parallel_it_init(T start, Q &x,
const R &X, Args &&... tuple) {
// The trailing pairs are processed first; their result replaces `start`.
// NOTE(review): static_cast<Args>(tuple) is used where std::forward<Args>
// would be conventional; for an rvalue argument it produces a copy - confirm
// this is intended.
255 start = parallel_it_init(start, static_cast<Args>(tuple)...);
/**
 * @brief Terminating overload of parallel_it_step, used when no arguments remain.
 * @return Always true.
 */
inline bool parallel_it_step() {
    const bool result = true;
    return result;
}
// Recursive overload of parallel_it_step taking a counter `x`, its extent `X`
// and any further (counter, extent) pairs.
// NOTE(review): the body of the `if` and the return statements are not
// visible in this listing.
261 template<
typename Q,
typename R,
typename... Args>
262 inline bool parallel_it_step(Q &x,
const R &X, Args &&... tuple) {
// The trailing pairs are stepped first; this level acts only when that call
// returns true.
263 if (parallel_it_step(static_cast<Args>(tuple)...)) {
// Computes the share of `n` work items that belongs to thread `tid` of a team
// of `team` threads, writing the result to n_start / n_end.
// NOTE(review): the body of the first branch, the definition of n2 and the
// end of the function are not visible in this listing.
270 template <
typename T,
typename Q>
271 inline void splitter(
const T &n,
const Q &team,
const Q &tid, T &n_start, T &n_end) {
// Degenerate case: a single thread (or fewer) or no work at all.
272 if (team <= 1 || n == 0) {
// n1 = ceil(n / team), the larger of the two chunk sizes.
276 T n1 = (n + (T)team - 1) / (T)team;
// T1 is the thread index at which the chunk size switches from n1 to n2.
278 T T1 = n - n2 * (T)team;
// At this point n_end holds this thread's chunk length (n1 or n2);
// presumably it is turned into an end offset in lines not shown - confirm.
279 n_end = (T)tid < T1 ? n1 : n2;
// Start offset: tid chunks of n1, or T1 chunks of n1 followed by
// (tid - T1) chunks of n2.
280 n_start = (T)tid <= T1 ? tid * n1 : T1 * n1 + ((T)tid - T1) * n2;
// Calls func(i) for the part of [0, D0) assigned to thread `ithr` of `nthr`.
// NOTE(review): the closing brace of this definition is not visible in this
// listing.
287 template <
typename T0,
typename F>
288 void for_1d(
const int &ithr,
const int &nthr,
const T0 &D0,
const F &func) {
289 T0 d0{ 0 }, end{ 0 };
// splitter fills d0 / end with this thread's sub-range.
290 splitter(D0, nthr, ithr, d0, end);
291 for (; d0 < end; ++d0) func(d0);
// Runs func over the 1-D index range [0, D0), splitting the work across
// threads with for_1d according to the selected backend.
// NOTE(review): some lines (branch conditions, closing braces, the end of the
// TBB_AUTO lambda) are not visible in this listing.
294 template <
typename T0,
typename F>
295 void parallel_for(
const T0 &D0,
const F &func) {
296 #if IE_THREAD == IE_THREAD_TBB
297 auto work_amount =
static_cast<size_t>(D0);
298 int nthr = parallel_get_max_threads();
// Never use more threads than there are work items.
299 if (static_cast<size_t>(nthr) > work_amount)
300 nthr =
static_cast<int>(work_amount);
// Direct single-threaded call; presumably guarded by a check on nthr whose
// line is not shown.
302 for_1d(0, 1, D0, func);
// One TBB iteration per thread index, statically partitioned.
304 tbb::parallel_for(0, nthr, [&](
int ithr) {
305 for_1d(ithr, nthr, D0, func);
306 }, tbb::static_partitioner());
308 #elif IE_THREAD == IE_THREAD_TBB_AUTO
// No clamping to the work amount in this branch.
309 const int nthr = parallel_get_max_threads();
310 tbb::parallel_for(0, nthr, [&](
int ithr) {
311 for_1d(ithr, nthr, D0, func);
313 #elif IE_THREAD == IE_THREAD_OMP
// Every OpenMP thread processes its own slice.
314 # pragma omp parallel
315 for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func);
316 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential build: one "thread" handles the whole range.
317 for_1d(0, 1, D0, func);
// Processes the part of the flattened D0 x D1 index space assigned to thread
// `ithr` of `nthr`.
// NOTE(review): the loop body's call to func and the closing braces are not
// visible in this listing.
322 template <
typename T0,
typename T1,
typename F>
323 void for_2d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
const F &func) {
// The product is widened to size_t before multiplying.
324 const size_t work_amount = (size_t)D0 * D1;
325 if (work_amount == 0)
return;
326 size_t start{ 0 }, end{ 0 };
// splitter fills start / end with this thread's range of flat indices.
327 splitter(work_amount, nthr, ithr, start, end);
329 T0 d0{ 0 }; T1 d1{ 0 };
// Initialise the per-dimension counters from the flat start index.
330 parallel_it_init(start, d0, D0, d1, D1);
331 for (
size_t iwork = start; iwork < end; ++iwork) {
// Advance the per-dimension counters to the next index tuple.
333 parallel_it_step(d0, D0, d1, D1);
// Runs func over the 2-D index space D0 x D1, splitting the work across
// threads with for_2d according to the selected backend.
// NOTE(review): some lines (branch conditions, closing braces, the end of the
// TBB_AUTO lambda) are not visible in this listing.
337 template <
typename T0,
typename T1,
typename F>
338 void parallel_for2d(
const T0 &D0,
const T1 &D1,
const F &func) {
339 #if IE_THREAD == IE_THREAD_TBB
// NOTE(review): D0 * D1 is multiplied in the operands' own type and only then
// cast to size_t, whereas for_2d widens D0 first; with narrow extent types
// the product could overflow before the cast - confirm.
340 auto work_amount =
static_cast<size_t>(D0 * D1);
341 int nthr = parallel_get_max_threads();
// Never use more threads than there are work items.
342 if (static_cast<size_t>(nthr) > work_amount)
343 nthr =
static_cast<int>(work_amount);
// Direct single-threaded call; presumably guarded by a check on nthr whose
// line is not shown.
345 for_2d(0, 1, D0, D1, func);
// One TBB iteration per thread index, statically partitioned.
347 tbb::parallel_for(0, nthr, [&](
int ithr) {
348 for_2d(ithr, nthr, D0, D1, func);
349 }, tbb::static_partitioner());
351 #elif IE_THREAD == IE_THREAD_TBB_AUTO
// No clamping to the work amount in this branch.
352 const int nthr = parallel_get_max_threads();
353 tbb::parallel_for(0, nthr, [&](
int ithr) {
354 for_2d(ithr, nthr, D0, D1, func);
356 #elif IE_THREAD == IE_THREAD_OMP
// Every OpenMP thread processes its own slice.
357 # pragma omp parallel
358 for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func);
359 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential build: one "thread" handles the whole index space.
360 for_2d(0, 1, D0, D1, func);
// Processes the part of the flattened D0 x D1 x D2 index space assigned to
// thread `ithr` of `nthr`.
// NOTE(review): the loop body's call to func and the closing braces are not
// visible in this listing.
365 template <
typename T0,
typename T1,
typename T2,
typename F>
366 void for_3d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
367 const T2 &D2,
const F &func) {
// The product is widened to size_t before multiplying.
368 const size_t work_amount = (size_t)D0 * D1 * D2;
369 if (work_amount == 0)
return;
370 size_t start{ 0 }, end{ 0 };
// splitter fills start / end with this thread's range of flat indices.
371 splitter(work_amount, nthr, ithr, start, end);
373 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 };
// Initialise the per-dimension counters from the flat start index.
374 parallel_it_init(start, d0, D0, d1, D1, d2, D2);
375 for (
size_t iwork = start; iwork < end; ++iwork) {
// Advance the per-dimension counters to the next index tuple.
377 parallel_it_step(d0, D0, d1, D1, d2, D2);
// Runs func over the 3-D index space D0 x D1 x D2, splitting the work across
// threads with for_3d according to the selected backend.
// NOTE(review): some lines (branch conditions, closing braces, the end of the
// TBB_AUTO lambda) are not visible in this listing.
381 template <
typename T0,
typename T1,
typename T2,
typename F>
382 void parallel_for3d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const F &func) {
383 #if IE_THREAD == IE_THREAD_TBB
// NOTE(review): the product is formed in the operands' own type before the
// cast to size_t, whereas for_3d widens D0 first; with narrow extent types it
// could overflow before the cast - confirm.
384 auto work_amount =
static_cast<size_t>(D0 * D1 * D2);
385 int nthr = parallel_get_max_threads();
// Never use more threads than there are work items.
386 if (static_cast<size_t>(nthr) > work_amount)
387 nthr =
static_cast<int>(work_amount);
// Direct single-threaded call; presumably guarded by a check on nthr whose
// line is not shown.
389 for_3d(0, 1, D0, D1, D2, func);
// One TBB iteration per thread index, statically partitioned.
391 tbb::parallel_for(0, nthr, [&](
int ithr) {
392 for_3d(ithr, nthr, D0, D1, D2, func);
393 }, tbb::static_partitioner());
395 #elif IE_THREAD == IE_THREAD_TBB_AUTO
// No clamping to the work amount in this branch.
396 const int nthr = parallel_get_max_threads();
397 tbb::parallel_for(0, nthr, [&](
int ithr) {
398 for_3d(ithr, nthr, D0, D1, D2, func);
400 #elif IE_THREAD == IE_THREAD_OMP
// Every OpenMP thread processes its own slice.
401 # pragma omp parallel
402 for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func);
403 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential build: one "thread" handles the whole index space.
404 for_3d(0, 1, D0, D1, D2, func);
// Calls func(d0, d1, d2, d3) for the part of the flattened
// D0 x D1 x D2 x D3 index space assigned to thread `ithr` of `nthr`.
// NOTE(review): the closing braces of this definition are not visible in this
// listing.
408 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename F>
409 void for_4d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
410 const T2 &D2,
const T3 &D3,
const F &func) {
// The product is widened to size_t before multiplying.
411 const size_t work_amount = (size_t)D0 * D1 * D2 * D3;
412 if (work_amount == 0)
return;
413 size_t start{ 0 }, end{ 0 };
// splitter fills start / end with this thread's range of flat indices.
414 splitter(work_amount, nthr, ithr, start, end);
416 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 };
// Initialise the per-dimension counters from the flat start index.
417 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3);
418 for (
size_t iwork = start; iwork < end; ++iwork) {
// Visit the current index tuple, then advance the counters.
419 func(d0, d1, d2, d3);
420 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3);
// Runs func over the 4-D index space D0 x D1 x D2 x D3, splitting the work
// across threads with for_4d according to the selected backend.
// NOTE(review): some lines (branch conditions, closing braces, the end of the
// TBB_AUTO lambda) are not visible in this listing.
424 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename F>
425 void parallel_for4d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const T3 &D3,
const F &func) {
426 #if IE_THREAD == IE_THREAD_TBB
// NOTE(review): the product is formed in the operands' own type before the
// cast to size_t, whereas for_4d widens D0 first; with narrow extent types it
// could overflow before the cast - confirm.
427 auto work_amount =
static_cast<size_t>(D0 * D1 * D2 * D3);
428 int nthr = parallel_get_max_threads();
// Never use more threads than there are work items.
429 if (static_cast<size_t>(nthr) > work_amount)
430 nthr =
static_cast<int>(work_amount);
// Direct single-threaded call; presumably guarded by a check on nthr whose
// line is not shown.
432 for_4d(0, 1, D0, D1, D2, D3, func);
// One TBB iteration per thread index, statically partitioned.
434 tbb::parallel_for(0, nthr, [&](
int ithr) {
435 for_4d(ithr, nthr, D0, D1, D2, D3, func);
436 }, tbb::static_partitioner());
438 #elif IE_THREAD == IE_THREAD_TBB_AUTO
// No clamping to the work amount in this branch.
439 const int nthr = parallel_get_max_threads();
440 tbb::parallel_for(0, nthr, [&](
int ithr) {
441 for_4d(ithr, nthr, D0, D1, D2, D3, func);
443 #elif IE_THREAD == IE_THREAD_OMP
// Every OpenMP thread processes its own slice.
444 # pragma omp parallel
445 for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func);
446 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential build: one "thread" handles the whole index space.
447 for_4d(0, 1, D0, D1, D2, D3, func);
// Calls func(d0, d1, d2, d3, d4) for the part of the flattened
// D0 x D1 x D2 x D3 x D4 index space assigned to thread `ithr` of `nthr`.
// NOTE(review): the closing braces of this definition are not visible in this
// listing.
451 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename F>
452 void for_5d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
453 const T2 &D2,
const T3 &D3,
const T4 &D4,
const F &func) {
// The product is widened to size_t before multiplying.
454 const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4;
455 if (work_amount == 0)
return;
456 size_t start{ 0 }, end{ 0 };
// splitter fills start / end with this thread's range of flat indices.
457 splitter(work_amount, nthr, ithr, start, end);
459 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 }; T4 d4{ 0 };
// Initialise the per-dimension counters from the flat start index.
460 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
461 for (
size_t iwork = start; iwork < end; ++iwork) {
// Visit the current index tuple, then advance the counters.
462 func(d0, d1, d2, d3, d4);
463 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
// Runs func over the 5-D index space D0 x D1 x D2 x D3 x D4, splitting the
// work across threads with for_5d according to the selected backend.
// NOTE(review): some lines (branch conditions, closing braces, the end of the
// TBB_AUTO lambda) are not visible in this listing.
467 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename F>
468 void parallel_for5d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const T3 &D3,
469 const T4 &D4,
const F &func) {
470 #if IE_THREAD == IE_THREAD_TBB
// NOTE(review): the product is formed in the operands' own type before the
// cast to size_t, whereas for_5d widens D0 first; with narrow extent types it
// could overflow before the cast - confirm.
471 auto work_amount =
static_cast<size_t>(D0 * D1 * D2 * D3 * D4);
472 int nthr = parallel_get_max_threads();
// Never use more threads than there are work items.
473 if (static_cast<size_t>(nthr) > work_amount)
474 nthr =
static_cast<int>(work_amount);
// Direct single-threaded call; presumably guarded by a check on nthr whose
// line is not shown.
476 for_5d(0, 1, D0, D1, D2, D3, D4, func);
// One TBB iteration per thread index, statically partitioned.
478 tbb::parallel_for(0, nthr, [&](
int ithr) {
479 for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
480 }, tbb::static_partitioner());
482 #elif IE_THREAD == IE_THREAD_TBB_AUTO
// No clamping to the work amount in this branch.
483 const int nthr = parallel_get_max_threads();
484 tbb::parallel_for(0, nthr, [&](
int ithr) {
485 for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
487 #elif IE_THREAD == IE_THREAD_OMP
// Every OpenMP thread processes its own slice.
488 # pragma omp parallel
489 for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func);
490 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential build: one "thread" handles the whole index space.
491 for_5d(0, 1, D0, D1, D2, D3, D4, func);
Definition: ie_argmax_layer.hpp:11