ie_parallel.hpp
Go to the documentation of this file.
1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 
5 /**
6  * @brief Contains declarations and definitions for sequential and multi-threading implementations.
7  * Multi-threading support is implemented in two variants: using the Threading Building Blocks library and OpenMP* product.
8  * To build a particular implementation, use the corresponding identifier: IE_THREAD_TBB, IE_THREAD_OMP or IE_THREAD_SEQ.
9  * @file ie_parallel.hpp
10  */
11 
12 #pragma once
13 
/** @brief Value of IE_THREAD that selects the Threading Building Blocks implementation. */
#define IE_THREAD_TBB 0
/** @brief Value of IE_THREAD that selects the OpenMP implementation. */
#define IE_THREAD_OMP 1
/** @brief Value of IE_THREAD that selects the sequential (single-threaded) implementation. */
#define IE_THREAD_SEQ 2
17 
18 #if IE_THREAD == IE_THREAD_TBB
19 #define TBB_PREVIEW_LOCAL_OBSERVER 1
20 #include "tbb/task_scheduler_observer.h"
21 #include "tbb/parallel_for.h"
22 #include "tbb/task_arena.h"
23 
24 #include "tbb/parallel_reduce.h"
25 #include "tbb/blocked_range.h"
26 #include "tbb/blocked_range2d.h"
27 #include "tbb/blocked_range3d.h"
28 
29 inline int parallel_get_max_threads() { return tbb::this_task_arena::max_concurrency(); }
30 inline int parallel_get_num_threads() { return parallel_get_max_threads(); }
31 inline int parallel_get_thread_num() { return tbb::this_task_arena::current_thread_index(); }
32 inline void parallel_set_num_threads(int n) { return; }
33 inline int parallel_get_env_threads() { return 0; }
34 
35 #elif IE_THREAD == IE_THREAD_OMP
36 #include <cstdlib>
37 #include <string>
38 #include <omp.h>
39 
40 
41 /* MSVC still supports omp 2.0 only */
42 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
43 # define collapse(x)
44 #endif // defined(_MSC_VER) && !defined(__INTEL_COMPILER)
45 inline int parallel_get_max_threads() { return omp_get_max_threads(); }
46 inline int parallel_get_num_threads() { return omp_get_num_threads(); }
47 inline int parallel_get_thread_num() { return omp_get_thread_num(); }
48 inline void parallel_set_num_threads(int n) { omp_set_num_threads(n); }
/**
 * @brief Reads the thread count requested through the OMP_NUM_THREADS environment variable.
 * @return The integer std::stoi parses from the variable, or 0 when the variable is not set
 *         or std::stoi throws while parsing it.
 */
inline int parallel_get_env_threads() {
    const char *requested = getenv("OMP_NUM_THREADS");
    if (requested == nullptr) {
        return 0;
    }

    try {
        return std::stoi(requested);
    } catch (const std::exception&) {
        // Unparsable or out-of-range value: report "nothing requested".
        return 0;
    }
}
60 
61 #elif IE_THREAD == IE_THREAD_SEQ
/** @brief Sequential build: always reports one thread requested by the environment. */
inline int parallel_get_env_threads() {
    return 1;
}

/** @brief Sequential build: at most one thread is available. */
inline int parallel_get_max_threads() {
    return 1;
}

/** @brief Sequential build: the team always consists of one thread. */
inline int parallel_get_num_threads() {
    return 1;
}

/** @brief Sequential build: the calling thread always has index 0. */
inline int parallel_get_thread_num() {
    return 0;
}

/** @brief Sequential build: intentionally a no-op, the requested count @p n is ignored. */
inline void parallel_set_num_threads(int n) {
    (void)n;
}
67 #endif
68 
69 
70 namespace InferenceEngine {
71 
72 template <typename F>
73 void parallel_nt(int nthr, F func) {
74 #if IE_THREAD == IE_THREAD_TBB
75  if (nthr == 0) nthr = parallel_get_max_threads();
76  if (nthr == 1) {
77  func(0, 1);
78  return;
79  }
80 
81  tbb::parallel_for(0, nthr, [&](int ithr) {
82  func(ithr, nthr);
83  });
84 #elif IE_THREAD == IE_THREAD_OMP
85  if (nthr == 1) {
86  func(0, 1);
87  return;
88  }
89 
90 # pragma omp parallel num_threads(nthr)
91  func(parallel_get_thread_num(), parallel_get_num_threads());
92 #elif IE_THREAD == IE_THREAD_SEQ
93  func(0, 1);
94 #endif
95 }
96 
97 template <typename F>
98 void parallel_nt_static(int nthr, F func) {
99 #if IE_THREAD == IE_THREAD_SEQ
100  const bool serial = true;
101 #else
102  const bool serial = false;
103 #endif
104 
105  if (serial || nthr == 1) {
106  func(0, 1);
107  return;
108  }
109 
110  if (nthr == 0) nthr = parallel_get_max_threads();
111 #if IE_THREAD == IE_THREAD_TBB
112  tbb::parallel_for(0, nthr, [&](int ithr) {
113  func(ithr, nthr);
114  }
115  , tbb::static_partitioner{});
116 
117 #elif IE_THREAD == IE_THREAD_OMP
118 
119 # pragma omp parallel num_threads(nthr)
120  {
121  func(parallel_get_thread_num(), parallel_get_num_threads());
122  }
123 #endif
124 }
125 
126 template <typename T0, typename R, typename F>
127 R parallel_sum(const T0 D0, R &input, F func) {
128 #if IE_THREAD == IE_THREAD_TBB
129  return tbb::parallel_reduce(
130  tbb::blocked_range<T0>(0, D0), input,
131  [&](const tbb::blocked_range<T0>& r, R init)->R {
132  R sum = init;
133  for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1)
134  sum += func(dim1);
135  return sum;
136  },
137  [](R x, R y)->R {
138  return x + y;
139  });
140 #else
141  R sum = input;
142 
143 #ifdef _MSC_VER
144  using T0_IT = typename std::make_signed<T0>::type;
145 #else
146  using T0_IT = T0;
147 #endif
148 
149 #if IE_THREAD == IE_THREAD_OMP
150  #pragma omp parallel for reduction(+ : sum) schedule(static)
151 #endif
152  for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
153  sum += static_cast<R>(func(dim1));
154  }
155  return sum;
156 #endif
157 }
158 
159 template <typename T0, typename T1, typename R, typename F>
160 R parallel_sum2d(const T0 D0, const T1 D1, R input, F func) {
161 #if IE_THREAD == IE_THREAD_TBB
162  return tbb::parallel_reduce(
163  tbb::blocked_range2d<T0, T1>(0, D0, 0, D1), input,
164  [&](const tbb::blocked_range2d<T0, T1>& r, R init)->R {
165  R sum = init;
166  for (T0 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
167  for (T1 dim1 = r.cols().begin(); dim1 < r.cols().end(); dim1++) {
168  sum += func(dim2, dim1);
169  }
170  }
171  return sum;
172  },
173  [](R x, R y)->R {
174  return x + y;
175  });
176 #else
177  R sum = input;
178 
179 #ifdef _MSC_VER
180  using T0_IT = typename std::make_signed<T0>::type;
181  using T1_IT = typename std::make_signed<T1>::type;
182 #else
183  using T0_IT = T0;
184  using T1_IT = T1;
185 #endif
186 
187 #if IE_THREAD == IE_THREAD_OMP
188  #pragma omp parallel for collapse(2) reduction(+ : sum) schedule(static)
189 #endif
190  for (T0_IT dim2 = 0; dim2 < D0; dim2++) {
191  for (T1_IT dim1 = 0; dim1 < D1; dim1++) {
192  sum += func(dim2, dim1);
193  }
194  }
195  return sum;
196 #endif
197 }
198 template <typename T0, typename T1, typename T2, typename R, typename F>
199 R parallel_sum3d(const T0 D0, const T1 D1, const T2 D2, R input, F func) {
200 #if IE_THREAD == IE_THREAD_TBB
201  return tbb::parallel_reduce(
202  tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2), input,
203  [&](const tbb::blocked_range3d<T0, T1, T2>& r, R init)->R {
204  R sum = init;
205  for (T0 dim1 = r.pages().begin(); dim1 < r.pages().end(); dim1++) {
206  for (T1 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
207  for (T2 dim3 = r.cols().begin(); dim3 < r.cols().end(); dim3++) {
208  sum += func(dim1, dim2, dim3);
209  }
210  }
211  }
212  return sum;
213  },
214  [](R x, R y)->R {
215  return x + y;
216  });
217 #else
218  R sum = input;
219 
220 #ifdef _MSC_VER
221  using T0_IT = typename std::make_signed<T0>::type;
222  using T1_IT = typename std::make_signed<T1>::type;
223  using T2_IT = typename std::make_signed<T2>::type;
224 #else
225  using T0_IT = T0;
226  using T1_IT = T1;
227  using T2_IT = T2;
228 #endif
229 
230 #if IE_THREAD == IE_THREAD_OMP
231  #pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static)
232 #endif
233  for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
234  for (T1_IT dim2 = 0; dim2 < static_cast<T1_IT>(D1); dim2++) {
235  for (T2_IT dim3 = 0; dim3 < static_cast<T2_IT>(D2); dim3++) {
236  sum += func(dim1, dim2, dim3);
237  }
238  }
239  }
240  return sum;
241 #endif
242 }
243 
/** @brief Recursion terminator: with no (index, extent) pairs left, the linear offset is returned as is. */
template <typename T>
inline T parallel_it_init(T start) {
    return start;
}

/**
 * @brief Decomposes a linear offset into per-dimension indices.
 *
 * The arguments after @p start are (index, extent) pairs. The trailing pairs are processed
 * first; each pair receives the remainder of the offset left by the pairs after it, and
 * passes the quotient on to the pair before it.
 *
 * @param start Linear offset to decompose.
 * @param x     Receives the index for this pair.
 * @param X     Extent of this pair's dimension.
 * @param tuple Remaining (index, extent) pairs.
 * @return The quotient left after this pair's extent has been divided out.
 */
template <typename T, typename Q, typename R, typename... Args>
inline T parallel_it_init(T start, Q &x, const R &X, Args &&... tuple) {
    const T carried = parallel_it_init(start, static_cast<Args>(tuple)...);
    x = carried % X;
    return carried / X;
}
252 
/** @brief Recursion terminator: with no (index, extent) pairs left, a carry is always reported. */
inline bool parallel_it_step() {
    return true;
}

/**
 * @brief Advances a multi-dimensional index by one position.
 *
 * The arguments are (index, extent) pairs; the last pair is incremented first and a
 * wrap-around to 0 carries into the pair before it.
 *
 * @param x     Index of this pair, incremented modulo @p X when the following pairs wrapped.
 * @param X     Extent of this pair's dimension.
 * @param tuple Remaining (index, extent) pairs.
 * @return true when this index wrapped around to 0, false otherwise.
 */
template <typename Q, typename R, typename... Args>
inline bool parallel_it_step(Q &x, const R &X, Args &&... tuple) {
    const bool carry = parallel_it_step(static_cast<Args>(tuple)...);
    if (!carry) {
        return false;
    }

    x = (x + 1) % X;
    return x == 0;
}
262 
/**
 * @brief Computes the half-open sub-range [n_start, n_end) of n items assigned to one worker.
 *
 * With team <= 1 or n == 0 the whole range [0, n) is returned. Otherwise the first workers
 * get chunks of ceil(n / team) items and the remaining ones get chunks one item shorter.
 *
 * @param n       Total number of items.
 * @param team    Number of workers.
 * @param tid     Index of the worker the range is computed for.
 * @param n_start Receives the first item of the worker's range.
 * @param n_end   Receives the item one past the end of the worker's range.
 */
template <typename T, typename Q>
inline void splitter(T n, Q team, Q tid, T &n_start, T &n_end) {
    const bool whole_range = (team <= 1) || (n == 0);
    if (whole_range) {
        n_start = 0;
        n_end = n;
    } else {
        const T chunk_hi = (n + static_cast<T>(team) - 1) / static_cast<T>(team);
        const T chunk_lo = chunk_hi - 1;
        // Number of workers that receive the larger chunk.
        const T hi_count = n - chunk_lo * static_cast<T>(team);

        // n_end temporarily holds the chunk length; the start is added below.
        n_end = static_cast<T>(tid) < hi_count ? chunk_hi : chunk_lo;
        n_start = static_cast<T>(tid) <= hi_count
                      ? tid * chunk_hi
                      : hi_count * chunk_hi + (static_cast<T>(tid) - hi_count) * chunk_lo;
    }

    n_end += n_start;
}
278 
279 
/**
 * @brief Runs func(i) for every i in the part of [0, D0) that splitter() assigns to worker @p ithr.
 *
 * @param ithr Index of the calling worker.
 * @param nthr Total number of workers.
 * @param D0   Extent of the iteration space.
 * @param func Callable invoked with the current index.
 */
template <typename T0, typename F>
void for_1d(const int ithr, const int nthr, const T0 &D0, F func) {
    T0 cur{ 0 };
    T0 stop{ 0 };
    splitter(D0, nthr, ithr, cur, stop);

    while (cur < stop) {
        func(cur);
        ++cur;
    }
}
286 
287 template <typename T0, typename F>
288 void parallel_for(const T0 &D0, F func) {
289 #if IE_THREAD == IE_THREAD_TBB
290  const int nthr = parallel_get_max_threads();
291  tbb::parallel_for(0, nthr, [&](int ithr) {
292  for_1d(ithr, nthr, D0, func);
293  });
294 #elif IE_THREAD == IE_THREAD_OMP
295  # pragma omp parallel
296  for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func);
297 #elif IE_THREAD == IE_THREAD_SEQ
298  for_1d(0, 1, D0, func);
299 #endif
300 }
301 
302 
303 template <typename T0, typename T1, typename F>
304 void for_2d(const int ithr, const int nthr, const T0 &D0, const T1 &D1, F func) {
305  const size_t work_amount = (size_t)D0 * D1;
306  if (work_amount == 0) return;
307  size_t start{ 0 }, end{ 0 };
308  splitter(work_amount, nthr, ithr, start, end);
309 
310  T0 d0{ 0 }; T1 d1{ 0 };
311  parallel_it_init(start, d0, D0, d1, D1);
312  for (size_t iwork = start; iwork < end; ++iwork) {
313  func(d0, d1);
314  parallel_it_step(d0, D0, d1, D1);
315  }
316 }
317 
318 template <typename T0, typename T1, typename F>
319 void parallel_for2d(const T0 &D0, const T1 &D1, F func) {
320 #if IE_THREAD == IE_THREAD_TBB
321  const int nthr = parallel_get_max_threads();
322  tbb::parallel_for(0, nthr, [&](int ithr) {
323  for_2d(ithr, nthr, D0, D1, func);
324  });
325 #elif IE_THREAD == IE_THREAD_OMP
326  # pragma omp parallel
327  for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func);
328 #elif IE_THREAD == IE_THREAD_SEQ
329  for_2d(0, 1, D0, D1, func);
330 #endif
331 }
332 
333 
334 template <typename T0, typename T1, typename T2, typename F>
335 void for_3d(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
336  const T2 &D2, F func) {
337  const size_t work_amount = (size_t)D0 * D1 * D2;
338  if (work_amount == 0) return;
339  size_t start{ 0 }, end{ 0 };
340  splitter(work_amount, nthr, ithr, start, end);
341 
342  T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 };
343  parallel_it_init(start, d0, D0, d1, D1, d2, D2);
344  for (size_t iwork = start; iwork < end; ++iwork) {
345  func(d0, d1, d2);
346  parallel_it_step(d0, D0, d1, D1, d2, D2);
347  }
348 }
349 
350 template <typename T0, typename T1, typename T2, typename F>
351 void parallel_for3d(const T0 &D0, const T1 &D1, const T2 &D2, F func) {
352 #if IE_THREAD == IE_THREAD_TBB
353  const int nthr = parallel_get_max_threads();
354  tbb::parallel_for(0, nthr, [&](int ithr) {
355  for_3d(ithr, nthr, D0, D1, D2, func);
356  });
357 #elif IE_THREAD == IE_THREAD_OMP
358  # pragma omp parallel
359  for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func);
360 #elif IE_THREAD == IE_THREAD_SEQ
361  for_3d(0, 1, D0, D1, D2, func);
362 #endif
363 }
364 
365 template <typename T0, typename T1, typename T2, typename T3, typename F>
366 void for_4d(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
367  const T2 &D2, const T3 &D3, F func) {
368  const size_t work_amount = (size_t)D0 * D1 * D2 * D3;
369  if (work_amount == 0) return;
370  size_t start{ 0 }, end{ 0 };
371  splitter(work_amount, nthr, ithr, start, end);
372 
373  T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 };
374  parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3);
375  for (size_t iwork = start; iwork < end; ++iwork) {
376  func(d0, d1, d2, d3);
377  parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3);
378  }
379 }
380 
381 template <typename T0, typename T1, typename T2, typename T3, typename F>
382 void parallel_for4d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3, F func) {
383 #if IE_THREAD == IE_THREAD_TBB
384  const int nthr = parallel_get_max_threads();
385  tbb::parallel_for(0, nthr, [&](int ithr) {
386  for_4d(ithr, nthr, D0, D1, D2, D3, func);
387  });
388 #elif IE_THREAD == IE_THREAD_OMP
389  # pragma omp parallel
390  for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func);
391 #elif IE_THREAD == IE_THREAD_SEQ
392  for_4d(0, 1, D0, D1, D2, D3, func);
393 #endif
394 }
395 
396 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
397 void for_5d(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
398  const T2 &D2, const T3 &D3, const T4 &D4, F func) {
399  const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4;
400  if (work_amount == 0) return;
401  size_t start{ 0 }, end{ 0 };
402  splitter(work_amount, nthr, ithr, start, end);
403 
404  T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 }; T4 d4{ 0 };
405  parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
406  for (size_t iwork = start; iwork < end; ++iwork) {
407  func(d0, d1, d2, d3, d4);
408  parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
409  }
410 }
411 
412 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
413 void parallel_for5d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3,
414  const T4 &D4, F func) {
415 #if IE_THREAD == IE_THREAD_TBB
416  const int nthr = parallel_get_max_threads();
417  tbb::parallel_for(0, nthr, [&](int ithr) {
418  for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
419  });
420 #elif IE_THREAD == IE_THREAD_OMP
421  # pragma omp parallel
422  for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func);
423 #elif IE_THREAD == IE_THREAD_SEQ
424  for_5d(0, 1, D0, D1, D2, D3, D4, func);
425 #endif
426 }
427 
428 
429 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename F>
430 void for_6d(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
431  const T2 &D2, const T3 &D3, const T4 &D4, const T5 &D5, F func) {
432  const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4 * D5;
433  if (work_amount == 0) return;
434  size_t start{ 0 }, end{ 0 };
435  splitter(work_amount, nthr, ithr, start, end);
436 
437  T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 }; T4 d4{ 0 }; T5 d5{ 0 };
438  parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4,
439  d5, D5);
440  for (size_t iwork = start; iwork < end; ++iwork) {
441  func(d0, d1, d2, d3, d4, d5);
442  parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5);
443  }
444 }
445 
446 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename F>
447 void parallel_for6d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3,
448  const T4 &D4, const T5 &D5, F func) {
449 #if IE_THREAD == IE_THREAD_TBB
450  const int nthr = parallel_get_max_threads();
451  tbb::parallel_for(0, nthr, [&](int ithr) {
452  for_6d(ithr, nthr, D0, D1, D2, D3, D4, D5, func);
453  });
454 #elif IE_THREAD == IE_THREAD_OMP
455 # pragma omp parallel
456  for_6d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, D5, func);
457 #elif IE_THREAD == IE_THREAD_SEQ
458  for_6d(0, 1, D0, D1, D2, D3, D4, D5, func);
459 #endif
460 }
461 
462 } // namespace InferenceEngine
463 
Definition: ie_argmax_layer.hpp:11