14 #define IE_THREAD_TBB 0
15 #define IE_THREAD_OMP 1
16 #define IE_THREAD_SEQ 2
18 #if IE_THREAD == IE_THREAD_TBB
19 #define TBB_PREVIEW_LOCAL_OBSERVER 1
20 #include "tbb/task_scheduler_observer.h"
21 #include "tbb/parallel_for.h"
22 #include "tbb/task_arena.h"
24 #include "tbb/parallel_reduce.h"
25 #include "tbb/blocked_range.h"
26 #include "tbb/blocked_range2d.h"
27 #include "tbb/blocked_range3d.h"
29 inline int parallel_get_max_threads() {
return tbb::this_task_arena::max_concurrency(); }
30 inline int parallel_get_num_threads() {
return parallel_get_max_threads(); }
31 inline int parallel_get_thread_num() {
return tbb::this_task_arena::current_thread_index(); }
/**
 * @brief Thread-count setter for the TBB backend; intentionally a no-op.
 * @param n Requested thread count. It is ignored by this backend.
 */
inline void parallel_set_num_threads(int n) {
    // Explicitly discard the argument so that including this header does not
    // raise unused-parameter warnings in strict (-Wextra / -Werror) builds.
    (void)n;
}
/**
 * @brief Environment-configured thread count for the TBB backend.
 * @return Always 0; this backend does not consult any environment variable here.
 */
inline int parallel_get_env_threads() {
    const int no_env_setting = 0;
    return no_env_setting;
}
35 #elif IE_THREAD == IE_THREAD_OMP
42 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
44 #endif // defined(_MSC_VER) && !defined(__INTEL_COMPILER)
45 inline int parallel_get_max_threads() {
return omp_get_max_threads(); }
46 inline int parallel_get_num_threads() {
return omp_get_num_threads(); }
47 inline int parallel_get_thread_num() {
return omp_get_thread_num(); }
48 inline void parallel_set_num_threads(
int n) { omp_set_num_threads(n); }
// OpenMP backend: reads a thread count from the OMP_NUM_THREADS environment variable.
// NOTE(review): the embedded numbering skips lines 50, 52 and 55-60, so the declaration of
// env_cores, the opening of the try block (presumably), the catch body and the return
// statement are not visible here.
49 inline int parallel_get_env_threads() {
// Attempt the conversion only when the variable is present in the environment.
51 if (getenv(
"OMP_NUM_THREADS") !=
nullptr) {
// std::stoi throws when the text cannot be converted; the catch clause below handles std::exception.
// NOTE(review): getenv is called a second time here; the pointer from the check above could be reused.
53 env_cores = std::stoi(getenv(
"OMP_NUM_THREADS"));
54 }
catch (
const std::exception&) {
61 #elif IE_THREAD == IE_THREAD_SEQ
/**
 * @brief Environment-configured thread count for the sequential backend.
 * @return Always 1.
 */
inline int parallel_get_env_threads() {
    const int single_thread = 1;
    return single_thread;
}
/**
 * @brief Maximum thread count for the sequential backend.
 * @return Always 1.
 */
inline int parallel_get_max_threads() {
    const int single_thread = 1;
    return single_thread;
}
/**
 * @brief Thread count for the sequential backend.
 * @return Always 1.
 */
inline int parallel_get_num_threads() {
    const int single_thread = 1;
    return single_thread;
}
/**
 * @brief Index of the calling thread for the sequential backend.
 * @return Always 0.
 */
inline int parallel_get_thread_num() {
    const int only_thread_index = 0;
    return only_thread_index;
}
/**
 * @brief Thread-count setter for the sequential backend; intentionally a no-op.
 * @param n Requested thread count. It is ignored by this backend.
 */
inline void parallel_set_num_threads(int n) {
    // Explicitly discard the argument so that including this header does not
    // raise unused-parameter warnings in strict (-Wextra / -Werror) builds.
    (void)n;
}
// Runs func(thread_index, thread_count) on a team of nthr threads using the configured backend.
// NOTE(review): the template header (line 72) and several body lines are absent from this
// listing; only the visible statements are described.
73 void parallel_nt(
int nthr, F func) {
74 #if IE_THREAD == IE_THREAD_TBB
// nthr == 0 is replaced by the backend maximum.
75 if (nthr == 0) nthr = parallel_get_max_threads();
// One parallel_for iteration per thread index in [0, nthr).
81 tbb::parallel_for(0, nthr, [&](
int ithr) {
84 #elif IE_THREAD == IE_THREAD_OMP
// Every thread of the OpenMP team calls func with its own index and the team size.
90 # pragma omp parallel num_threads(nthr)
91 func(parallel_get_thread_num(), parallel_get_num_threads());
92 #elif IE_THREAD == IE_THREAD_SEQ
// Variant of parallel_nt that passes tbb::static_partitioner to tbb::parallel_for in the TBB build.
// NOTE(review): the template header and several body lines (including #else/#endif and the
// serial-path body) are absent from this listing.
98 void parallel_nt_static(
int nthr, F func) {
// `serial` is a compile-time flag: true for the sequential backend, false otherwise.
99 #if IE_THREAD == IE_THREAD_SEQ
100 const bool serial =
true;
102 const bool serial =
false;
// Sequential build, or an explicit request for one thread, takes the serial path.
105 if (serial || nthr == 1) {
// nthr == 0 is replaced by the backend maximum.
110 if (nthr == 0) nthr = parallel_get_max_threads();
111 #if IE_THREAD == IE_THREAD_TBB
// One iteration per thread index in [0, nthr), split with the static partitioner below.
112 tbb::parallel_for(0, nthr, [&](
int ithr) {
115 , tbb::static_partitioner{});
117 #elif IE_THREAD == IE_THREAD_OMP
// Every thread of the OpenMP team calls func with its own index and the team size.
119 # pragma omp parallel num_threads(nthr)
121 func(parallel_get_thread_num(), parallel_get_num_threads());
// Sums func(i) over i in [0, D0) and returns the total as R.
// T0: index type; R: accumulator/result type; F: callable taking one index.
// NOTE(review): `input` is taken by non-const reference here, while parallel_sum2d and
// parallel_sum3d take it by value; the signatures look like they should agree.
// NOTE(review): several body lines are absent from this listing.
126 template <
typename T0,
typename R,
typename F>
127 R parallel_sum(
const T0 D0, R &input, F func) {
128 #if IE_THREAD == IE_THREAD_TBB
// TBB path: reduce over the 1-D range [0, D0), passing `input` as the identity argument.
129 return tbb::parallel_reduce(
130 tbb::blocked_range<T0>(0, D0), input,
131 [&](
const tbb::blocked_range<T0>& r, R init)->R {
133 for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1)
// Non-TBB path: iterate with the signed counterpart of T0
// (presumably because some OpenMP implementations require a signed loop index - TODO confirm).
144 using T0_IT =
typename std::make_signed<T0>::type;
149 #if IE_THREAD == IE_THREAD_OMP
150 #pragma omp parallel for reduction(+ : sum) schedule(static)
152 for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
// Each term is converted to R before it is accumulated.
153 sum +=
static_cast<R
>(func(dim1));
// Sums func(i, j) over i in [0, D0) and j in [0, D1) and returns the total as R.
// T0/T1: index types; R: accumulator/result type; F: callable taking two indices.
// NOTE(review): several body lines are absent from this listing.
159 template <
typename T0,
typename T1,
typename R,
typename F>
160 R parallel_sum2d(
const T0 D0,
const T1 D1, R input, F func) {
161 #if IE_THREAD == IE_THREAD_TBB
// TBB path: reduce over the 2-D range, passing `input` as the identity argument.
162 return tbb::parallel_reduce(
163 tbb::blocked_range2d<T0, T1>(0, D0, 0, D1), input,
164 [&](
const tbb::blocked_range2d<T0, T1>& r, R init)->R {
// rows() spans the D0 dimension and cols() the D1 dimension, matching the constructor arguments above.
166 for (T0 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
167 for (T1 dim1 = r.cols().begin(); dim1 < r.cols().end(); dim1++) {
168 sum += func(dim2, dim1);
// Non-TBB path: signed counterparts of the index types are used as loop counters.
180 using T0_IT =
typename std::make_signed<T0>::type;
181 using T1_IT =
typename std::make_signed<T1>::type;
187 #if IE_THREAD == IE_THREAD_OMP
188 #pragma omp parallel for collapse(2) reduction(+ : sum) schedule(static)
// NOTE(review): these bounds compare a signed counter with D0/D1 directly, whereas parallel_sum
// and parallel_sum3d wrap the bound in static_cast<..._IT>(); with unsigned T0/T1 this is a
// mixed-sign comparison. Consider aligning with the siblings.
190 for (T0_IT dim2 = 0; dim2 < D0; dim2++) {
191 for (T1_IT dim1 = 0; dim1 < D1; dim1++) {
192 sum += func(dim2, dim1);
// Sums func(i, j, k) over i in [0, D0), j in [0, D1) and k in [0, D2) and returns the total as R.
// T0/T1/T2: index types; R: accumulator/result type; F: callable taking three indices.
// NOTE(review): several body lines are absent from this listing.
198 template <
typename T0,
typename T1,
typename T2,
typename R,
typename F>
199 R parallel_sum3d(
const T0 D0,
const T1 D1,
const T2 D2, R input, F func) {
200 #if IE_THREAD == IE_THREAD_TBB
// TBB path: reduce over the 3-D range, passing `input` as the identity argument.
201 return tbb::parallel_reduce(
202 tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2), input,
203 [&](
const tbb::blocked_range3d<T0, T1, T2>& r, R init)->R {
// pages()/rows()/cols() span D0/D1/D2 respectively, matching the constructor arguments above.
205 for (T0 dim1 = r.pages().begin(); dim1 < r.pages().end(); dim1++) {
206 for (T1 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
207 for (T2 dim3 = r.cols().begin(); dim3 < r.cols().end(); dim3++) {
208 sum += func(dim1, dim2, dim3);
// Non-TBB path: signed counterparts of the index types are used as loop counters.
221 using T0_IT =
typename std::make_signed<T0>::type;
222 using T1_IT =
typename std::make_signed<T1>::type;
223 using T2_IT =
typename std::make_signed<T2>::type;
230 #if IE_THREAD == IE_THREAD_OMP
231 #pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static)
// Bounds are cast to the signed counter type so both sides of each comparison have the same signedness.
233 for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
234 for (T1_IT dim2 = 0; dim2 < static_cast<T1_IT>(D1); dim2++) {
235 for (T2_IT dim3 = 0; dim3 < static_cast<T2_IT>(D2); dim3++) {
236 sum += func(dim1, dim2, dim3);
// Recursion terminator for parallel_it_init: with no (index, extent) pairs left, returns start unchanged.
// NOTE(review): the template header for T (line 244) is absent from this listing.
245 inline T parallel_it_init(T start) {
return start; }
// Variadic overload taking a flat offset followed by (index reference, extent) pairs.
// x is the index written for this level, X its extent; `tuple` holds the remaining pairs.
// NOTE(review): the lines after the recursive call are absent from this listing; presumably
// they derive x from start and X and return the remaining quotient - TODO confirm.
246 template<
typename T,
typename Q,
typename R,
typename... Args>
247 inline T parallel_it_init(T start, Q &x,
const R &X, Args &&... tuple) {
// The trailing pairs are processed first; their result replaces start for this level.
248 start = parallel_it_init(start, static_cast<Args>(tuple)...);
/**
 * @brief Recursion terminator for parallel_it_step: no (index, extent) pairs remain.
 * @return Always true.
 */
inline bool parallel_it_step() {
    const bool carry = true;
    return carry;
}
// Variadic overload taking (index reference, extent) pairs; x/X belong to this level and
// `tuple` holds the remaining pairs.
// NOTE(review): the body of the if-branch and the return statements are absent from this
// listing; presumably x is advanced and wrapped at X when the inner levels report true - TODO confirm.
254 template<
typename Q,
typename R,
typename... Args>
255 inline bool parallel_it_step(Q &x,
const R &X, Args &&... tuple) {
// The trailing pairs are stepped first; this level reacts only when that call returns true.
256 if (parallel_it_step(static_cast<Args>(tuple)...)) {
// Computes the share of n work items assigned to thread `tid` out of `team` threads,
// writing the result through n_start and n_end.
// NOTE(review): the body of the early-exit branch, the definition of n2 (line 270) and the
// lines after 273 are absent from this listing.
263 template <
typename T,
typename Q>
264 inline void splitter(T n, Q team, Q tid, T &n_start, T &n_end) {
// A single-thread team or an empty workload takes a separate branch (body not visible here).
265 if (team <= 1 || n == 0) {
// n1 is n / team rounded up.
269 T n1 = (n + (T)team - 1) / (T)team;
// T1 is what remains of n after giving every thread n2 items.
271 T T1 = n - n2 * (T)team;
// Threads with tid < T1 are sized n1, the others n2. At this point n_end holds a length;
// presumably a later line adds n_start to turn it into an end offset - TODO confirm.
272 n_end = (T)tid < T1 ? n1 : n2;
// Start offset: the first T1 threads are laid out n1 apart, the rest follow in steps of n2.
// NOTE(review): `tid * n1` multiplies without the (T) cast used on the other tid operands.
273 n_start = (T)tid <= T1 ? tid * n1 : T1 * n1 + ((T)tid - T1) * n2;
// Runs func(d0) for the slice of [0, D0) that splitter assigns to thread ithr of nthr.
// ithr: index of the calling thread; nthr: team size; D0: extent; func: callable taking one index.
// NOTE(review): the closing brace of this function is absent from this listing.
280 template <
typename T0,
typename F>
281 void for_1d(
const int ithr,
const int nthr,
const T0 &D0, F func) {
282 T0 d0{ 0 }, end{ 0 };
// splitter fills d0/end with this thread's range.
283 splitter(D0, nthr, ithr, d0, end);
284 for (; d0 < end; ++d0) func(d0);
// Runs func over the index range [0, D0) by calling for_1d on every thread of the configured backend.
// NOTE(review): the lambda terminator, #endif and closing brace are absent from this listing.
287 template <
typename T0,
typename F>
288 void parallel_for(
const T0 &D0, F func) {
289 #if IE_THREAD == IE_THREAD_TBB
// TBB: one parallel_for iteration per thread index in [0, nthr); each handles its own slice.
290 const int nthr = parallel_get_max_threads();
291 tbb::parallel_for(0, nthr, [&](
int ithr) {
292 for_1d(ithr, nthr, D0, func);
294 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every thread of the team handles the slice matching its index and the team size.
295 # pragma omp parallel
296 for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func);
297 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single "thread" 0 of 1 covers the whole range.
298 for_1d(0, 1, D0, func);
// Handles the share of the D0 x D1 index space assigned to thread ithr of nthr.
// ithr: index of the calling thread; nthr: team size; D0/D1: extents; func: user callable.
// NOTE(review): line 313 (presumably the func(d0, d1) call, by analogy with for_4d) and the
// closing braces are absent from this listing.
303 template <
typename T0,
typename T1,
typename F>
304 void for_2d(
const int ithr,
const int nthr,
const T0 &D0,
const T1 &D1, F func) {
// The index space is flattened into work_amount items; nothing to do when it is empty.
305 const size_t work_amount = (size_t)D0 * D1;
306 if (work_amount == 0)
return;
307 size_t start{ 0 }, end{ 0 };
// splitter fills start/end with this thread's flat range.
308 splitter(work_amount, nthr, ithr, start, end);
310 T0 d0{ 0 }; T1 d1{ 0 };
// Seeds d0/d1 from the flat offset `start`.
311 parallel_it_init(start, d0, D0, d1, D1);
312 for (
size_t iwork = start; iwork < end; ++iwork) {
// Advances the (d0, d1) indices for the next flat item.
314 parallel_it_step(d0, D0, d1, D1);
// Runs func over the D0 x D1 index space by calling for_2d on every thread of the configured backend.
// NOTE(review): the lambda terminator, #endif and closing brace are absent from this listing.
318 template <
typename T0,
typename T1,
typename F>
319 void parallel_for2d(
const T0 &D0,
const T1 &D1, F func) {
320 #if IE_THREAD == IE_THREAD_TBB
// TBB: one parallel_for iteration per thread index in [0, nthr); each handles its own share.
321 const int nthr = parallel_get_max_threads();
322 tbb::parallel_for(0, nthr, [&](
int ithr) {
323 for_2d(ithr, nthr, D0, D1, func);
325 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every thread of the team handles the share matching its index and the team size.
326 # pragma omp parallel
327 for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func);
328 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single "thread" 0 of 1 covers the whole space.
329 for_2d(0, 1, D0, D1, func);
// Handles the share of the D0 x D1 x D2 index space assigned to thread ithr of nthr.
// ithr: index of the calling thread; nthr: team size; D0..D2: extents; func: user callable.
// NOTE(review): line 345 (presumably the func(d0, d1, d2) call, by analogy with for_4d) and
// the closing braces are absent from this listing.
334 template <
typename T0,
typename T1,
typename T2,
typename F>
335 void for_3d(
const int ithr,
const int nthr,
const T0 &D0,
const T1 &D1,
336 const T2 &D2, F func) {
// The index space is flattened into work_amount items; nothing to do when it is empty.
337 const size_t work_amount = (size_t)D0 * D1 * D2;
338 if (work_amount == 0)
return;
339 size_t start{ 0 }, end{ 0 };
// splitter fills start/end with this thread's flat range.
340 splitter(work_amount, nthr, ithr, start, end);
342 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 };
// Seeds d0..d2 from the flat offset `start`.
343 parallel_it_init(start, d0, D0, d1, D1, d2, D2);
344 for (
size_t iwork = start; iwork < end; ++iwork) {
// Advances the (d0, d1, d2) indices for the next flat item.
346 parallel_it_step(d0, D0, d1, D1, d2, D2);
// Runs func over the D0 x D1 x D2 index space by calling for_3d on every thread of the configured backend.
// NOTE(review): the lambda terminator, #endif and closing brace are absent from this listing.
350 template <
typename T0,
typename T1,
typename T2,
typename F>
351 void parallel_for3d(
const T0 &D0,
const T1 &D1,
const T2 &D2, F func) {
352 #if IE_THREAD == IE_THREAD_TBB
// TBB: one parallel_for iteration per thread index in [0, nthr); each handles its own share.
353 const int nthr = parallel_get_max_threads();
354 tbb::parallel_for(0, nthr, [&](
int ithr) {
355 for_3d(ithr, nthr, D0, D1, D2, func);
357 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every thread of the team handles the share matching its index and the team size.
358 # pragma omp parallel
359 for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func);
360 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single "thread" 0 of 1 covers the whole space.
361 for_3d(0, 1, D0, D1, D2, func);
// Calls func(d0, d1, d2, d3) for the share of the D0 x D1 x D2 x D3 index space assigned to
// thread ithr of nthr.
// ithr: index of the calling thread; nthr: team size; D0..D3: extents; func: callable taking four indices.
// NOTE(review): the closing braces are absent from this listing.
365 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename F>
366 void for_4d(
const int ithr,
const int nthr,
const T0 &D0,
const T1 &D1,
367 const T2 &D2,
const T3 &D3, F func) {
// The index space is flattened into work_amount items; nothing to do when it is empty.
368 const size_t work_amount = (size_t)D0 * D1 * D2 * D3;
369 if (work_amount == 0)
return;
370 size_t start{ 0 }, end{ 0 };
// splitter fills start/end with this thread's flat range.
371 splitter(work_amount, nthr, ithr, start, end);
373 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 };
// Seeds d0..d3 from the flat offset `start`.
374 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3);
375 for (
size_t iwork = start; iwork < end; ++iwork) {
// Visit the current index tuple, then advance it for the next flat item.
376 func(d0, d1, d2, d3);
377 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3);
// Runs func over the D0 x D1 x D2 x D3 index space by calling for_4d on every thread of the
// configured backend.
// NOTE(review): the lambda terminator, #endif and closing brace are absent from this listing.
381 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename F>
382 void parallel_for4d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const T3 &D3, F func) {
383 #if IE_THREAD == IE_THREAD_TBB
// TBB: one parallel_for iteration per thread index in [0, nthr); each handles its own share.
384 const int nthr = parallel_get_max_threads();
385 tbb::parallel_for(0, nthr, [&](
int ithr) {
386 for_4d(ithr, nthr, D0, D1, D2, D3, func);
388 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every thread of the team handles the share matching its index and the team size.
389 # pragma omp parallel
390 for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func);
391 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single "thread" 0 of 1 covers the whole space.
392 for_4d(0, 1, D0, D1, D2, D3, func);
// Calls func(d0, d1, d2, d3, d4) for the share of the D0 x ... x D4 index space assigned to
// thread ithr of nthr.
// ithr: index of the calling thread; nthr: team size; D0..D4: extents; func: callable taking five indices.
// NOTE(review): the closing braces are absent from this listing.
396 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename F>
397 void for_5d(
const int ithr,
const int nthr,
const T0 &D0,
const T1 &D1,
398 const T2 &D2,
const T3 &D3,
const T4 &D4, F func) {
// The index space is flattened into work_amount items; nothing to do when it is empty.
399 const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4;
400 if (work_amount == 0)
return;
401 size_t start{ 0 }, end{ 0 };
// splitter fills start/end with this thread's flat range.
402 splitter(work_amount, nthr, ithr, start, end);
404 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 }; T4 d4{ 0 };
// Seeds d0..d4 from the flat offset `start`.
405 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
406 for (
size_t iwork = start; iwork < end; ++iwork) {
// Visit the current index tuple, then advance it for the next flat item.
407 func(d0, d1, d2, d3, d4);
408 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
// Runs func over the D0 x ... x D4 index space by calling for_5d on every thread of the
// configured backend.
// NOTE(review): the lambda terminator, #endif and closing brace are absent from this listing.
412 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename F>
413 void parallel_for5d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const T3 &D3,
414 const T4 &D4, F func) {
415 #if IE_THREAD == IE_THREAD_TBB
// TBB: one parallel_for iteration per thread index in [0, nthr); each handles its own share.
416 const int nthr = parallel_get_max_threads();
417 tbb::parallel_for(0, nthr, [&](
int ithr) {
418 for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
420 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every thread of the team handles the share matching its index and the team size.
421 # pragma omp parallel
422 for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func);
423 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single "thread" 0 of 1 covers the whole space.
424 for_5d(0, 1, D0, D1, D2, D3, D4, func);
// Calls func(d0, ..., d5) for the share of the D0 x ... x D5 index space assigned to
// thread ithr of nthr.
// ithr: index of the calling thread; nthr: team size; D0..D5: extents; func: callable taking six indices.
// NOTE(review): line 439 (the rest of the parallel_it_init argument list) and the closing
// braces are absent from this listing.
429 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename T5,
typename F>
430 void for_6d(
const int ithr,
const int nthr,
const T0 &D0,
const T1 &D1,
431 const T2 &D2,
const T3 &D3,
const T4 &D4,
const T5 &D5, F func) {
// The index space is flattened into work_amount items; nothing to do when it is empty.
432 const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4 * D5;
433 if (work_amount == 0)
return;
434 size_t start{ 0 }, end{ 0 };
// splitter fills start/end with this thread's flat range.
435 splitter(work_amount, nthr, ithr, start, end);
437 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 }; T4 d4{ 0 }; T5 d5{ 0 };
// Seeds the indices from the flat offset `start` (the call continues on a line not shown).
438 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4,
440 for (
size_t iwork = start; iwork < end; ++iwork) {
// Visit the current index tuple, then advance it for the next flat item.
441 func(d0, d1, d2, d3, d4, d5);
442 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5);
// Runs func over the D0 x ... x D5 index space by calling for_6d on every thread of the
// configured backend.
// NOTE(review): the lambda terminator, #endif and closing brace are absent from this listing.
446 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename T5,
typename F>
447 void parallel_for6d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const T3 &D3,
448 const T4 &D4,
const T5 &D5, F func) {
449 #if IE_THREAD == IE_THREAD_TBB
// TBB: one parallel_for iteration per thread index in [0, nthr); each handles its own share.
450 const int nthr = parallel_get_max_threads();
451 tbb::parallel_for(0, nthr, [&](
int ithr) {
452 for_6d(ithr, nthr, D0, D1, D2, D3, D4, D5, func);
454 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every thread of the team handles the share matching its index and the team size.
455 # pragma omp parallel
456 for_6d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, D5, func);
457 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single "thread" 0 of 1 covers the whole space.
458 for_6d(0, 1, D0, D1, D2, D3, D4, D5, func);
Definition: ie_argmax_layer.hpp:11