// Numeric identifiers of the supported threading back-ends. The IE_THREAD macro is
// compared against these values by the preprocessor branches in this file, so they
// must stay macros (usable inside #if) and must stay distinct.
#define IE_THREAD_TBB 0  // Intel TBB back-end
#define IE_THREAD_OMP 1  // OpenMP back-end
#define IE_THREAD_SEQ 2  // sequential (single-threaded) back-end
20 #if IE_THREAD == IE_THREAD_TBB
21 #define TBB_PREVIEW_LOCAL_OBSERVER 1
22 #include "tbb/task_scheduler_observer.h"
23 #include "tbb/parallel_for.h"
24 #include "tbb/task_arena.h"
26 #include "tbb/parallel_reduce.h"
27 #include "tbb/blocked_range.h"
28 #include "tbb/blocked_range2d.h"
29 #include "tbb/blocked_range3d.h"
31 inline int parallel_get_max_threads() {
return tbb::this_task_arena::max_concurrency(); }
32 inline int parallel_get_num_threads() {
return parallel_get_max_threads(); }
33 inline int parallel_get_thread_num() {
return tbb::this_task_arena::current_thread_index(); }
/// @brief Requests a thread count (TBB back-end).
///
/// This branch performs no action: the request is ignored.
/// @param n Requested number of threads (unused).
inline void parallel_set_num_threads(int n) {
    static_cast<void>(n);  // intentionally unused; silences -Wunused-parameter
}
/// @brief Thread count requested through the environment (TBB back-end).
/// @return Always 0 in this branch; no environment variable is consulted.
inline int parallel_get_env_threads() {
    return 0;
}
37 #elif IE_THREAD == IE_THREAD_OMP
44 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
46 #endif // defined(_MSC_VER) && !defined(__INTEL_COMPILER)
47 inline int parallel_get_max_threads() {
return omp_get_max_threads(); }
48 inline int parallel_get_num_threads() {
return omp_get_num_threads(); }
49 inline int parallel_get_thread_num() {
return omp_get_thread_num(); }
50 inline void parallel_set_num_threads(
int n) { omp_set_num_threads(n); }
// OpenMP back-end: reads the thread count requested through the OMP_NUM_THREADS
// environment variable.
// NOTE(review): parts of this definition (the declaration of env_cores, the `try`
// keyword, the handler body and the return statement) are not visible in this excerpt.
51 inline int parallel_get_env_threads() {
// Only attempt to parse when the variable is actually set.
53 if (getenv(
"OMP_NUM_THREADS") !=
nullptr) {
// std::stoi throws when the text cannot be converted; the handler below catches that.
55 env_cores = std::stoi(getenv(
"OMP_NUM_THREADS"));
56 }
catch (
const std::exception&) {
63 #elif IE_THREAD == IE_THREAD_SEQ
/// @brief Thread count requested through the environment (sequential back-end).
/// @return Always 1 in this branch.
inline int parallel_get_env_threads() {
    return 1;
}
/// @brief Maximum number of threads usable by parallel regions (sequential back-end).
/// @return Always 1 in this branch.
inline int parallel_get_max_threads() {
    return 1;
}
/// @brief Number of threads in the current team (sequential back-end).
/// @return Always 1 in this branch.
inline int parallel_get_num_threads() {
    return 1;
}
/// @brief Index of the calling thread (sequential back-end).
/// @return Always 0 in this branch.
inline int parallel_get_thread_num() {
    return 0;
}
/// @brief Requests a thread count (sequential back-end).
///
/// This branch performs no action: the request is ignored.
/// @param n Requested number of threads (unused).
inline void parallel_set_num_threads(int n) {
    static_cast<void>(n);  // intentionally unused; silences -Wunused-parameter
}
// Invokes func on a team of nthr threads using the back-end selected by IE_THREAD.
// NOTE(review): the template header and parts of every branch are not visible in this
// excerpt, so only the statements shown below are described.
75 void parallel_nt(
int nthr,
const F &func) {
76 #if IE_THREAD == IE_THREAD_TBB
// nthr == 0 is replaced by the value of parallel_get_max_threads().
77 if (nthr == 0) nthr = parallel_get_max_threads();
// One TBB iteration per logical thread index in [0, nthr).
83 tbb::parallel_for(0, nthr, [&](
int ithr) {
86 #elif IE_THREAD == IE_THREAD_OMP
// Each OpenMP team member calls func with its own thread index and the team size.
92 # pragma omp parallel num_threads(nthr)
93 func(parallel_get_thread_num(), parallel_get_num_threads());
94 #elif IE_THREAD == IE_THREAD_SEQ
// Variant of parallel_nt whose TBB branch passes tbb::static_partitioner to parallel_for.
// NOTE(review): the template header, the body of the serial shortcut and several
// preprocessor lines are not visible in this excerpt.
100 void parallel_nt_static(
int nthr,
const F &func) {
// `serial` is a compile-time flag: true for the sequential back-end, false otherwise.
101 #if IE_THREAD == IE_THREAD_SEQ
102 const bool serial =
true;
104 const bool serial =
false;
// Shortcut taken for the sequential back-end or when a single thread is requested.
107 if (serial || nthr == 1) {
// nthr == 0 is replaced by the value of parallel_get_max_threads().
112 if (nthr == 0) nthr = parallel_get_max_threads();
113 #if IE_THREAD == IE_THREAD_TBB
// One TBB iteration per logical thread index in [0, nthr), statically partitioned.
114 tbb::parallel_for(0, nthr, [&](
int ithr) {
117 , tbb::static_partitioner{});
119 #elif IE_THREAD == IE_THREAD_OMP
// Each OpenMP team member calls func with its own thread index and the team size.
121 # pragma omp parallel num_threads(nthr)
123 func(parallel_get_thread_num(), parallel_get_num_threads());
// Reduction over a 1-D index range: accumulates func(i) for i in [0, D0).
// `input` is handed to tbb::parallel_reduce as its identity argument in the TBB branch.
// NOTE(review): the declaration/initialisation of `sum`, the join functor and the return
// statements are not visible in this excerpt.
128 template <
typename T0,
typename R,
typename F>
129 R parallel_sum(
const T0 &D0,
const R &input,
const F &func) {
130 #if IE_THREAD == IE_THREAD_TBB
131 return tbb::parallel_reduce(
132 tbb::blocked_range<T0>(0, D0), input,
133 [&](
const tbb::blocked_range<T0>& r, R init)->R {
// Walk the sub-range assigned to this TBB task.
135 for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1)
// Signed counterpart of T0 used as the loop counter type below
// (presumably required by some OpenMP implementations - TODO confirm).
146 using T0_IT =
typename std::make_signed<T0>::type;
151 #if IE_THREAD == IE_THREAD_OMP
152 #pragma omp parallel for reduction(+ : sum) schedule(static)
// Non-TBB path: plain loop (parallelised by the pragma above when OpenMP is selected).
154 for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
155 sum +=
static_cast<R
>(func(dim1));
// Reduction over a 2-D index space: accumulates func(i, j) for i in [0, D0), j in [0, D1).
// `input` is handed to tbb::parallel_reduce as its identity argument in the TBB branch.
// NOTE(review): the declaration/initialisation of `sum`, the join functor and the return
// statements are not visible in this excerpt.
161 template <
typename T0,
typename T1,
typename R,
typename F>
162 R parallel_sum2d(
const T0 &D0,
const T1 &D1,
const R &input,
const F &func) {
163 #if IE_THREAD == IE_THREAD_TBB
164 return tbb::parallel_reduce(
165 tbb::blocked_range2d<T0, T1>(0, D0, 0, D1), input,
166 [&](
const tbb::blocked_range2d<T0, T1>& r, R init)->R {
// rows() spans the first dimension (D0), cols() the second (D1).
168 for (T0 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
169 for (T1 dim1 = r.cols().begin(); dim1 < r.cols().end(); dim1++) {
170 sum += func(dim2, dim1);
// Signed counterparts of the extent types, used as loop counter types below.
182 using T0_IT =
typename std::make_signed<T0>::type;
183 using T1_IT =
typename std::make_signed<T1>::type;
189 #if IE_THREAD == IE_THREAD_OMP
190 #pragma omp parallel for collapse(2) reduction(+ : sum) schedule(static)
// NOTE(review): unlike parallel_sum and parallel_sum3d, the bounds here are compared
// without static_cast to the signed counter type, so an unsigned T0/T1 yields a
// signed/unsigned comparison - consider aligning with the other overloads.
192 for (T0_IT dim2 = 0; dim2 < D0; dim2++) {
193 for (T1_IT dim1 = 0; dim1 < D1; dim1++) {
194 sum += func(dim2, dim1);
// Reduction over a 3-D index space: accumulates func(i, j, k) for i in [0, D0),
// j in [0, D1), k in [0, D2).
// `input` is handed to tbb::parallel_reduce as its identity argument in the TBB branch.
// NOTE(review): the declaration/initialisation of `sum`, the join functor and the return
// statements are not visible in this excerpt.
200 template <
typename T0,
typename T1,
typename T2,
typename R,
typename F>
201 R parallel_sum3d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const R &input,
const F &func) {
202 #if IE_THREAD == IE_THREAD_TBB
203 return tbb::parallel_reduce(
204 tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2), input,
205 [&](
const tbb::blocked_range3d<T0, T1, T2>& r, R init)->R {
// pages() spans D0, rows() spans D1, cols() spans D2.
207 for (T0 dim1 = r.pages().begin(); dim1 < r.pages().end(); dim1++) {
208 for (T1 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
209 for (T2 dim3 = r.cols().begin(); dim3 < r.cols().end(); dim3++) {
210 sum += func(dim1, dim2, dim3);
// Signed counterparts of the extent types, used as loop counter types below.
223 using T0_IT =
typename std::make_signed<T0>::type;
224 using T1_IT =
typename std::make_signed<T1>::type;
225 using T2_IT =
typename std::make_signed<T2>::type;
232 #if IE_THREAD == IE_THREAD_OMP
233 #pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static)
// Non-TBB path: nested loops with the extents cast to the signed counter types.
235 for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
236 for (T1_IT dim2 = 0; dim2 < static_cast<T1_IT>(D1); dim2++) {
237 for (T2_IT dim3 = 0; dim3 < static_cast<T2_IT>(D2); dim3++) {
238 sum += func(dim1, dim2, dim3);
// Base case of the parallel_it_init recursion: with no (index, extent) pairs left,
// the start value is returned unchanged.
// NOTE(review): the template header introducing T is not visible in this excerpt.
247 inline T parallel_it_init(T start) {
return start; }
// Recursive case of parallel_it_init: takes one (x, X) pair followed by further pairs.
// NOTE(review): only the first statement of the body is visible in this excerpt; how x is
// derived from start and X is therefore not documented here.
248 template<
typename T,
typename Q,
typename R,
typename... Args>
249 inline T parallel_it_init(T start, Q &x,
const R &X, Args &&... tuple) {
// Recurse on the trailing pairs first, so the last pair is processed before this one.
250 start = parallel_it_init(start, static_cast<Args>(tuple)...);
/// @brief Base case of the parallel_it_step recursion (no index/extent pairs left).
/// @return Always true.
inline bool parallel_it_step() {
    return true;
}
// Recursive case of parallel_it_step: takes one (x, X) pair followed by further pairs.
// NOTE(review): only the opening condition of the body is visible in this excerpt; the
// update applied to x and the returned value are therefore not documented here.
256 template<
typename Q,
typename R,
typename... Args>
257 inline bool parallel_it_step(Q &x,
const R &X, Args &&... tuple) {
// Step the trailing pairs first; this pair is only touched when that call returns true.
258 if (parallel_it_step(static_cast<Args>(tuple)...)) {
// Divides n work items among `team` workers and writes the share of worker `tid`
// into n_start / n_end.
// NOTE(review): the body of the trivial-case branch, the definition of n2 and the tail of
// the function are not visible in this excerpt.
265 template <
typename T,
typename Q>
266 inline void splitter(
const T &n,
const Q &team,
const Q &tid, T &n_start, T &n_end) {
// Trivial case: at most one worker, or nothing to split.
267 if (team <= 1 || n == 0) {
// n1 = ceil(n / team): the larger of the two chunk sizes.
271 T n1 = (n + (T)team - 1) / (T)team;
// T1 = number of items left after giving every worker n2 items; workers with
// tid < T1 are assigned n1 items below, the others n2.
273 T T1 = n - n2 * (T)team;
// At this point n_end holds the chunk size for this worker, not yet an end offset.
274 n_end = (T)tid < T1 ? n1 : n2;
// NOTE(review): `tid * n1` multiplies without the (T) cast used elsewhere in this
// expression - consider `(T)tid * n1` for consistency.
275 n_start = (T)tid <= T1 ? tid * n1 : T1 * n1 + ((T)tid - T1) * n2;
// Executes the slice of a 1-D loop over [0, D0) that belongs to worker `ithr` of `nthr`.
// func is called once per index in that slice.
// NOTE(review): the closing brace of this function is not visible in this excerpt.
282 template <
typename T0,
typename F>
283 void for_1d(
const int &ithr,
const int &nthr,
const T0 &D0,
const F &func) {
284 T0 d0{ 0 }, end{ 0 };
// splitter fills d0/end with this worker's bounds.
285 splitter(D0, nthr, ithr, d0, end);
286 for (; d0 < end; ++d0) func(d0);
// Runs a 1-D loop over [0, D0), delegating each worker's slice to for_1d.
// NOTE(review): the end of the TBB lambda, the closing #endif and the closing brace are
// not visible in this excerpt.
289 template <
typename T0,
typename F>
290 void parallel_for(
const T0 &D0,
const F &func) {
291 #if IE_THREAD == IE_THREAD_TBB
// TBB: one iteration per logical worker index in [0, nthr).
292 const int nthr = parallel_get_max_threads();
293 tbb::parallel_for(0, nthr, [&](
int ithr) {
294 for_1d(ithr, nthr, D0, func);
296 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every team member processes its own slice.
297 # pragma omp parallel
298 for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func);
299 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single worker covers the whole range.
300 for_1d(0, 1, D0, func);
// Executes the slice of a 2-D loop nest (D0 x D1) that belongs to worker `ithr` of `nthr`.
// The nest is treated as a flat range of D0 * D1 items which splitter divides.
// NOTE(review): at least one line of the loop body and the closing braces are not
// visible in this excerpt.
305 template <
typename T0,
typename T1,
typename F>
306 void for_2d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
const F &func) {
307 const size_t work_amount = (size_t)D0 * D1;
// Nothing to do when any extent is zero.
308 if (work_amount == 0)
return;
309 size_t start{ 0 }, end{ 0 };
310 splitter(work_amount, nthr, ithr, start, end);
// Presumably converts the flat offset `start` into per-dimension indices - helper body
// not visible here, TODO confirm.
312 T0 d0{ 0 }; T1 d1{ 0 };
313 parallel_it_init(start, d0, D0, d1, D1);
314 for (
size_t iwork = start; iwork < end; ++iwork) {
// Presumably advances (d0, d1) to the next item - TODO confirm.
316 parallel_it_step(d0, D0, d1, D1);
// Runs a 2-D loop nest (D0 x D1), delegating each worker's slice to for_2d.
// NOTE(review): the end of the TBB lambda, the closing #endif and the closing brace are
// not visible in this excerpt.
320 template <
typename T0,
typename T1,
typename F>
321 void parallel_for2d(
const T0 &D0,
const T1 &D1,
const F &func) {
322 #if IE_THREAD == IE_THREAD_TBB
// TBB: one iteration per logical worker index in [0, nthr).
323 const int nthr = parallel_get_max_threads();
324 tbb::parallel_for(0, nthr, [&](
int ithr) {
325 for_2d(ithr, nthr, D0, D1, func);
327 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every team member processes its own slice.
328 # pragma omp parallel
329 for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func);
330 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single worker covers the whole range.
331 for_2d(0, 1, D0, D1, func);
// Executes the slice of a 3-D loop nest (D0 x D1 x D2) that belongs to worker `ithr`
// of `nthr`. The nest is treated as a flat range of D0 * D1 * D2 items.
// NOTE(review): at least one line of the loop body and the closing braces are not
// visible in this excerpt.
336 template <
typename T0,
typename T1,
typename T2,
typename F>
337 void for_3d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
338 const T2 &D2,
const F &func) {
339 const size_t work_amount = (size_t)D0 * D1 * D2;
// Nothing to do when any extent is zero.
340 if (work_amount == 0)
return;
341 size_t start{ 0 }, end{ 0 };
342 splitter(work_amount, nthr, ithr, start, end);
// Presumably converts the flat offset `start` into per-dimension indices - helper body
// not visible here, TODO confirm.
344 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 };
345 parallel_it_init(start, d0, D0, d1, D1, d2, D2);
346 for (
size_t iwork = start; iwork < end; ++iwork) {
// Presumably advances (d0, d1, d2) to the next item - TODO confirm.
348 parallel_it_step(d0, D0, d1, D1, d2, D2);
// Runs a 3-D loop nest (D0 x D1 x D2), delegating each worker's slice to for_3d.
// NOTE(review): the end of the TBB lambda, the closing #endif and the closing brace are
// not visible in this excerpt.
352 template <
typename T0,
typename T1,
typename T2,
typename F>
353 void parallel_for3d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const F &func) {
354 #if IE_THREAD == IE_THREAD_TBB
// TBB: one iteration per logical worker index in [0, nthr).
355 const int nthr = parallel_get_max_threads();
356 tbb::parallel_for(0, nthr, [&](
int ithr) {
357 for_3d(ithr, nthr, D0, D1, D2, func);
359 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every team member processes its own slice.
360 # pragma omp parallel
361 for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func);
362 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single worker covers the whole range.
363 for_3d(0, 1, D0, D1, D2, func);
// Executes the slice of a 4-D loop nest (D0 x D1 x D2 x D3) that belongs to worker
// `ithr` of `nthr`. The nest is treated as a flat range of D0 * D1 * D2 * D3 items.
// NOTE(review): the closing braces are not visible in this excerpt.
367 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename F>
368 void for_4d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
369 const T2 &D2,
const T3 &D3,
const F &func) {
370 const size_t work_amount = (size_t)D0 * D1 * D2 * D3;
// Nothing to do when any extent is zero.
371 if (work_amount == 0)
return;
372 size_t start{ 0 }, end{ 0 };
373 splitter(work_amount, nthr, ithr, start, end);
// Presumably converts the flat offset `start` into per-dimension indices - helper body
// not visible here, TODO confirm.
375 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 };
376 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3);
377 for (
size_t iwork = start; iwork < end; ++iwork) {
// Call func for the current indices, then move on to the next item.
378 func(d0, d1, d2, d3);
379 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3);
// Runs a 4-D loop nest (D0 x D1 x D2 x D3), delegating each worker's slice to for_4d.
// NOTE(review): the end of the TBB lambda, the closing #endif and the closing brace are
// not visible in this excerpt.
383 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename F>
384 void parallel_for4d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const T3 &D3,
const F &func) {
385 #if IE_THREAD == IE_THREAD_TBB
// TBB: one iteration per logical worker index in [0, nthr).
386 const int nthr = parallel_get_max_threads();
387 tbb::parallel_for(0, nthr, [&](
int ithr) {
388 for_4d(ithr, nthr, D0, D1, D2, D3, func);
390 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every team member processes its own slice.
391 # pragma omp parallel
392 for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func);
393 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single worker covers the whole range.
394 for_4d(0, 1, D0, D1, D2, D3, func);
// Executes the slice of a 5-D loop nest (D0 x ... x D4) that belongs to worker `ithr`
// of `nthr`. The nest is treated as a flat range of D0 * D1 * D2 * D3 * D4 items.
// NOTE(review): the closing braces are not visible in this excerpt.
398 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename F>
399 void for_5d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
400 const T2 &D2,
const T3 &D3,
const T4 &D4,
const F &func) {
401 const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4;
// Nothing to do when any extent is zero.
402 if (work_amount == 0)
return;
403 size_t start{ 0 }, end{ 0 };
404 splitter(work_amount, nthr, ithr, start, end);
// Presumably converts the flat offset `start` into per-dimension indices - helper body
// not visible here, TODO confirm.
406 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 }; T4 d4{ 0 };
407 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
408 for (
size_t iwork = start; iwork < end; ++iwork) {
// Call func for the current indices, then move on to the next item.
409 func(d0, d1, d2, d3, d4);
410 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
// Runs a 5-D loop nest (D0 x ... x D4), delegating each worker's slice to for_5d.
// NOTE(review): the end of the TBB lambda, the closing #endif and the closing brace are
// not visible in this excerpt.
414 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename F>
415 void parallel_for5d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const T3 &D3,
416 const T4 &D4,
const F &func) {
417 #if IE_THREAD == IE_THREAD_TBB
// TBB: one iteration per logical worker index in [0, nthr).
418 const int nthr = parallel_get_max_threads();
419 tbb::parallel_for(0, nthr, [&](
int ithr) {
420 for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
422 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every team member processes its own slice.
423 # pragma omp parallel
424 for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func);
425 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single worker covers the whole range.
426 for_5d(0, 1, D0, D1, D2, D3, D4, func);
// Executes the slice of a 6-D loop nest (D0 x ... x D5) that belongs to worker `ithr`
// of `nthr`. The nest is treated as a flat range of D0 * D1 * D2 * D3 * D4 * D5 items.
// NOTE(review): func is taken by value here, whereas for_1d..for_5d take `const F &` -
// consider aligning the signatures. The continuation of the parallel_it_init call and the
// closing braces are not visible in this excerpt.
431 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename T5,
typename F>
432 void for_6d(
const int &ithr,
const int &nthr,
const T0 &D0,
const T1 &D1,
433 const T2 &D2,
const T3 &D3,
const T4 &D4,
const T5 &D5, F func) {
434 const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4 * D5;
// Nothing to do when any extent is zero.
435 if (work_amount == 0)
return;
436 size_t start{ 0 }, end{ 0 };
437 splitter(work_amount, nthr, ithr, start, end);
// Presumably converts the flat offset `start` into per-dimension indices - helper body
// not visible here, TODO confirm.
439 T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 }; T4 d4{ 0 }; T5 d5{ 0 };
440 parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4,
442 for (
size_t iwork = start; iwork < end; ++iwork) {
// Call func for the current indices, then move on to the next item.
443 func(d0, d1, d2, d3, d4, d5);
444 parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5);
// Runs a 6-D loop nest (D0 x ... x D5), delegating each worker's slice to for_6d.
// NOTE(review): func is taken by value here, whereas the lower-rank parallel_for*
// overloads take `const F &`. The end of the TBB lambda, the closing #endif and the
// closing brace are not visible in this excerpt.
448 template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename T5,
typename F>
449 void parallel_for6d(
const T0 &D0,
const T1 &D1,
const T2 &D2,
const T3 &D3,
450 const T4 &D4,
const T5 &D5, F func) {
451 #if IE_THREAD == IE_THREAD_TBB
// TBB: one iteration per logical worker index in [0, nthr).
452 const int nthr = parallel_get_max_threads();
453 tbb::parallel_for(0, nthr, [&](
int ithr) {
454 for_6d(ithr, nthr, D0, D1, D2, D3, D4, D5, func);
456 #elif IE_THREAD == IE_THREAD_OMP
// OpenMP: every team member processes its own slice.
457 # pragma omp parallel
458 for_6d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, D5, func);
459 #elif IE_THREAD == IE_THREAD_SEQ
// Sequential: a single worker covers the whole range.
460 for_6d(0, 1, D0, D1, D2, D3, D4, D5, func);
Definition: ie_argmax_layer.hpp:11