| |
| |
| @@ -27,9 +27,13 @@ |
| |
| #include <limits.h> |
| #include <stdlib.h> |
| +#include <string.h> |
| #include "libgomp.h" |
| |
| |
| +ialias (GOMP_loop_runtime_next) |
| +ialias_redirect (GOMP_taskgroup_reduction_register) |
| + |
| /* Initialize the given work share construct from the given arguments. */ |
| |
| static inline void |
| @@ -79,12 +83,12 @@ gomp_loop_init (struct gomp_work_share * |
| } |
| |
| /* The *_start routines are called when first encountering a loop construct |
| - that is not bound directly to a parallel construct. The first thread |
| + that is not bound directly to a parallel construct. The first thread |
| that arrives will create the work-share construct; subsequent threads |
| will see the construct exists and allocate work from it. |
| |
| START, END, INCR are the bounds of the loop; due to the restrictions of |
| - OpenMP, these values must be the same in every thread. This is not |
| + OpenMP, these values must be the same in every thread. This is not |
| verified (nor is it entirely verifiable, since START is not necessarily |
| retained intact in the work-share data structure). CHUNK_SIZE is the |
| scheduling parameter; again this must be identical in all threads. |
| @@ -101,7 +105,7 @@ gomp_loop_static_start (long start, long |
| struct gomp_thread *thr = gomp_thread (); |
| |
| thr->ts.static_trip = 0; |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_init (thr->ts.work_share, start, end, incr, |
| GFS_STATIC, chunk_size); |
| @@ -123,7 +127,7 @@ gomp_loop_dynamic_start (long start, lon |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_init (thr->ts.work_share, start, end, incr, |
| GFS_DYNAMIC, chunk_size); |
| @@ -151,7 +155,7 @@ gomp_loop_guided_start (long start, long |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_init (thr->ts.work_share, start, end, incr, |
| GFS_GUIDED, chunk_size); |
| @@ -174,7 +178,7 @@ GOMP_loop_runtime_start (long start, lon |
| long *istart, long *iend) |
| { |
| struct gomp_task_icv *icv = gomp_icv (false); |
| - switch (icv->run_sched_var) |
| + switch (icv->run_sched_var & ~GFS_MONOTONIC) |
| { |
| case GFS_STATIC: |
| return gomp_loop_static_start (start, end, incr, |
| @@ -197,6 +201,100 @@ GOMP_loop_runtime_start (long start, lon |
| } |
| } |
| |
| +/* Resolve SCHED, as passed by the caller, to the schedule kind that is |
| + actually used, and return it.  The GFS_MONOTONIC bit is always cleared. |
| + GFS_STATIC, GFS_DYNAMIC and GFS_GUIDED are returned as is and *CHUNK_SIZE |
| + is left alone.  For GFS_RUNTIME and GFS_AUTO the kind is read from the |
| + run_sched_var ICV and *CHUNK_SIZE is overwritten: with the ICV's |
| + run_sched_chunk_size for static, dynamic and guided, or with 0 when the |
| + ICV itself says GFS_AUTO, which is mapped to GFS_STATIC.  Any other value, |
| + either in SCHED or in the ICV, aborts.  */ |
| +static long |
| +gomp_adjust_sched (long sched, long *chunk_size) |
| +{ |
| + sched &= ~GFS_MONOTONIC; |
| + switch (sched) |
| + { |
| + case GFS_STATIC: |
| + case GFS_DYNAMIC: |
| + case GFS_GUIDED: |
| + return sched; |
| + /* GFS_RUNTIME is used for runtime schedule without monotonic |
| + or nonmonotonic modifiers on the clause. |
| + GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic |
| + modifier. */ |
| + case GFS_RUNTIME: |
| + /* GFS_AUTO is used for runtime schedule with nonmonotonic |
| + modifier. */ |
| + case GFS_AUTO: |
| + { |
| + struct gomp_task_icv *icv = gomp_icv (false); |
| + /* Both cases above share this path: the ICV decides the kind.  */ |
| + sched = icv->run_sched_var & ~GFS_MONOTONIC; |
| + switch (sched) |
| + { |
| + case GFS_STATIC: |
| + case GFS_DYNAMIC: |
| + case GFS_GUIDED: |
| + *chunk_size = icv->run_sched_chunk_size; |
| + break; |
| + case GFS_AUTO: |
| + /* An "auto" ICV is implemented as static with chunk size 0.  */ |
| + sched = GFS_STATIC; |
| + *chunk_size = 0; |
| + break; |
| + default: |
| + abort (); |
| + } |
| + return sched; |
| + } |
| + default: |
| + abort (); |
| + } |
| +} |
| + |
| +/* Start a worksharing loop over START, END, INCR whose schedule is given by |
| + SCHED and CHUNK_SIZE (resolved through gomp_adjust_sched). |
| + |
| + REDUCTIONS, if non-NULL, is registered as task reduction data for the |
| + construct.  MEM, if non-NULL, requests a scratch area: on entry *MEM holds |
| + the requested size in bytes, on return it holds the work share's |
| + ordered_team_ids pointer.  Sizes larger than the tail of struct |
| + gomp_work_share starting at inline_ordered_team_ids get a |
| + gomp_malloc_cleared allocation; smaller ones reuse the existing |
| + ordered_team_ids buffer, which is zeroed with memset. |
| + |
| + If ISTART is NULL the function returns true without fetching an iteration |
| + block; otherwise it returns the result of GOMP_loop_runtime_next.  */ |
| +bool |
| +GOMP_loop_start (long start, long end, long incr, long sched, |
| + long chunk_size, long *istart, long *iend, |
| + uintptr_t *reductions, void **mem) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + |
| + thr->ts.static_trip = 0; |
| + if (reductions) |
| + gomp_workshare_taskgroup_start (); |
| + /* A true result means this thread has to initialize the work share.  */ |
| + if (gomp_work_share_start (0)) |
| + { |
| + sched = gomp_adjust_sched (sched, &chunk_size); |
| + gomp_loop_init (thr->ts.work_share, start, end, incr, |
| + sched, chunk_size); |
| + if (reductions) |
| + { |
| + GOMP_taskgroup_reduction_register (reductions); |
| + thr->task->taskgroup->workshare = true; |
| + /* Saved so that the other threads can find it in the else branch.  */ |
| + thr->ts.work_share->task_reductions = reductions; |
| + } |
| + if (mem) |
| + { |
| + uintptr_t size = (uintptr_t) *mem; |
| + if (size > (sizeof (struct gomp_work_share) |
| + - offsetof (struct gomp_work_share, |
| + inline_ordered_team_ids))) |
| + thr->ts.work_share->ordered_team_ids |
| + = gomp_malloc_cleared (size); |
| + else |
| + memset (thr->ts.work_share->ordered_team_ids, '\0', size); |
| + *mem = (void *) thr->ts.work_share->ordered_team_ids; |
| + } |
| + gomp_work_share_init_done (); |
| + } |
| + else |
| + { |
| + /* The work share was not initialized by this thread: register this |
| + thread's reductions against the ones stored in the work share and |
| + pick up the scratch pointer stored there.  */ |
| + if (reductions) |
| + { |
| + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; |
| + gomp_workshare_task_reduction_register (reductions, |
| + first_reductions); |
| + } |
| + if (mem) |
| + *mem = (void *) thr->ts.work_share->ordered_team_ids; |
| + } |
| + |
| + /* A NULL ISTART means no iteration block is fetched here.  */ |
| + if (!istart) |
| + return true; |
| + return ialias_call (GOMP_loop_runtime_next) (istart, iend); |
| +} |
| + |
| /* The *_ordered_*_start routines are similar. The only difference is that |
| this work-share construct is initialized to expect an ORDERED section. */ |
| |
| @@ -207,7 +305,7 @@ gomp_loop_ordered_static_start (long sta |
| struct gomp_thread *thr = gomp_thread (); |
| |
| thr->ts.static_trip = 0; |
| - if (gomp_work_share_start (true)) |
| + if (gomp_work_share_start (1)) |
| { |
| gomp_loop_init (thr->ts.work_share, start, end, incr, |
| GFS_STATIC, chunk_size); |
| @@ -225,7 +323,7 @@ gomp_loop_ordered_dynamic_start (long st |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (true)) |
| + if (gomp_work_share_start (1)) |
| { |
| gomp_loop_init (thr->ts.work_share, start, end, incr, |
| GFS_DYNAMIC, chunk_size); |
| @@ -250,7 +348,7 @@ gomp_loop_ordered_guided_start (long sta |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (true)) |
| + if (gomp_work_share_start (1)) |
| { |
| gomp_loop_init (thr->ts.work_share, start, end, incr, |
| GFS_GUIDED, chunk_size); |
| @@ -273,7 +371,7 @@ GOMP_loop_ordered_runtime_start (long st |
| long *istart, long *iend) |
| { |
| struct gomp_task_icv *icv = gomp_icv (false); |
| - switch (icv->run_sched_var) |
| + switch (icv->run_sched_var & ~GFS_MONOTONIC) |
| { |
| case GFS_STATIC: |
| return gomp_loop_ordered_static_start (start, end, incr, |
| @@ -297,6 +395,81 @@ GOMP_loop_ordered_runtime_start (long st |
| } |
| } |
| |
| +/* Start a worksharing loop with an ORDERED section whose schedule is given |
| + by SCHED and CHUNK_SIZE (resolved through gomp_adjust_sched). |
| + |
| + REDUCTIONS, if non-NULL, is registered as task reduction data.  MEM, if |
| + non-NULL, requests extra space: on entry *MEM holds its size, which is |
| + added to the count passed to gomp_work_share_start; on return *MEM holds |
| + the address just past the per-thread entries of ordered_team_ids, rounded |
| + up to the alignment of long long. |
| + |
| + The first iteration block is stored in *ISTART and *IEND.  For the static |
| + schedule the return value is the negation of gomp_iter_static_next; for |
| + dynamic and guided it is the result of the corresponding *_next_locked |
| + routine, called with the work share lock held.  */ |
| +bool |
| +GOMP_loop_ordered_start (long start, long end, long incr, long sched, |
| + long chunk_size, long *istart, long *iend, |
| + uintptr_t *reductions, void **mem) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + size_t ordered = 1; |
| + bool ret; |
| + |
| + thr->ts.static_trip = 0; |
| + if (reductions) |
| + gomp_workshare_taskgroup_start (); |
| + if (mem) |
| + ordered += (uintptr_t) *mem; |
| + /* A true result means this thread has to initialize the work share.  */ |
| + if (gomp_work_share_start (ordered)) |
| + { |
| + sched = gomp_adjust_sched (sched, &chunk_size); |
| + gomp_loop_init (thr->ts.work_share, start, end, incr, |
| + sched, chunk_size); |
| + if (reductions) |
| + { |
| + GOMP_taskgroup_reduction_register (reductions); |
| + thr->task->taskgroup->workshare = true; |
| + thr->ts.work_share->task_reductions = reductions; |
| + } |
| + if (sched == GFS_STATIC) |
| + gomp_ordered_static_init (); |
| + else |
| + /* Taken before init_done and released at the end of the function, |
| + after the first block has been fetched.  */ |
| + gomp_mutex_lock (&thr->ts.work_share->lock); |
| + gomp_work_share_init_done (); |
| + } |
| + else |
| + { |
| + if (reductions) |
| + { |
| + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; |
| + gomp_workshare_task_reduction_register (reductions, |
| + first_reductions); |
| + } |
| + /* Use the schedule stored in the work share rather than the SCHED |
| + argument, which has not been adjusted on this path.  */ |
| + sched = thr->ts.work_share->sched; |
| + if (sched != GFS_STATIC) |
| + gomp_mutex_lock (&thr->ts.work_share->lock); |
| + } |
| + |
| + if (mem) |
| + { |
| + /* Skip one ordered_team_ids element per team thread (one if there is |
| + no team), then round up to __alignof__ (long long).  */ |
| + uintptr_t p |
| + = (uintptr_t) (thr->ts.work_share->ordered_team_ids |
| + + (thr->ts.team ? thr->ts.team->nthreads : 1)); |
| + p += __alignof__ (long long) - 1; |
| + p &= ~(__alignof__ (long long) - 1); |
| + *mem = (void *) p; |
| + } |
| + |
| + switch (sched) |
| + { |
| + case GFS_STATIC: |
| + case GFS_AUTO: |
| + return !gomp_iter_static_next (istart, iend); |
| + case GFS_DYNAMIC: |
| + ret = gomp_iter_dynamic_next_locked (istart, iend); |
| + break; |
| + case GFS_GUIDED: |
| + ret = gomp_iter_guided_next_locked (istart, iend); |
| + break; |
| + default: |
| + abort (); |
| + } |
| + |
| + /* Only the dynamic and guided cases reach this point.  */ |
| + if (ret) |
| + gomp_ordered_first (); |
| + gomp_mutex_unlock (&thr->ts.work_share->lock); |
| + return ret; |
| +} |
| + |
| /* The *_doacross_*_start routines are similar. The only difference is that |
| this work-share construct is initialized to expect an ORDERED(N) - DOACROSS |
| section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 |
| @@ -310,11 +483,11 @@ gomp_loop_doacross_static_start (unsigne |
| struct gomp_thread *thr = gomp_thread (); |
| |
| thr->ts.static_trip = 0; |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, |
| GFS_STATIC, chunk_size); |
| - gomp_doacross_init (ncounts, counts, chunk_size); |
| + gomp_doacross_init (ncounts, counts, chunk_size, 0); |
| gomp_work_share_init_done (); |
| } |
| |
| @@ -328,11 +501,11 @@ gomp_loop_doacross_dynamic_start (unsign |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, |
| GFS_DYNAMIC, chunk_size); |
| - gomp_doacross_init (ncounts, counts, chunk_size); |
| + gomp_doacross_init (ncounts, counts, chunk_size, 0); |
| gomp_work_share_init_done (); |
| } |
| |
| @@ -354,11 +527,11 @@ gomp_loop_doacross_guided_start (unsigne |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, |
| GFS_GUIDED, chunk_size); |
| - gomp_doacross_init (ncounts, counts, chunk_size); |
| + gomp_doacross_init (ncounts, counts, chunk_size, 0); |
| gomp_work_share_init_done (); |
| } |
| |
| @@ -378,7 +551,7 @@ GOMP_loop_doacross_runtime_start (unsign |
| long *istart, long *iend) |
| { |
| struct gomp_task_icv *icv = gomp_icv (false); |
| - switch (icv->run_sched_var) |
| + switch (icv->run_sched_var & ~GFS_MONOTONIC) |
| { |
| case GFS_STATIC: |
| return gomp_loop_doacross_static_start (ncounts, counts, |
| @@ -402,8 +575,52 @@ GOMP_loop_doacross_runtime_start (unsign |
| } |
| } |
| |
| -/* The *_next routines are called when the thread completes processing of |
| - the iteration block currently assigned to it. If the work-share |
| +/* Start a doacross worksharing loop that iterates from 0 to COUNTS[0] with |
| + increment 1, using the schedule given by SCHED and CHUNK_SIZE (resolved |
| + through gomp_adjust_sched).  NCOUNTS and COUNTS are handed to |
| + gomp_doacross_init. |
| + |
| + REDUCTIONS, if non-NULL, is registered as task reduction data.  MEM, if |
| + non-NULL, requests extra space: on entry *MEM holds its size, which is |
| + passed to gomp_doacross_init; on return *MEM is set to the work share's |
| + doacross->extra. |
| + |
| + Returns the result of GOMP_loop_runtime_next on ISTART and IEND.  */ |
| +bool |
| +GOMP_loop_doacross_start (unsigned ncounts, long *counts, long sched, |
| + long chunk_size, long *istart, long *iend, |
| + uintptr_t *reductions, void **mem) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + |
| + thr->ts.static_trip = 0; |
| + if (reductions) |
| + gomp_workshare_taskgroup_start (); |
| + /* A true result means this thread has to initialize the work share.  */ |
| + if (gomp_work_share_start (0)) |
| + { |
| + size_t extra = 0; |
| + if (mem) |
| + extra = (uintptr_t) *mem; |
| + sched = gomp_adjust_sched (sched, &chunk_size); |
| + gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, |
| + sched, chunk_size); |
| + gomp_doacross_init (ncounts, counts, chunk_size, extra); |
| + if (reductions) |
| + { |
| + GOMP_taskgroup_reduction_register (reductions); |
| + thr->task->taskgroup->workshare = true; |
| + thr->ts.work_share->task_reductions = reductions; |
| + } |
| + gomp_work_share_init_done (); |
| + } |
| + else |
| + { |
| + if (reductions) |
| + { |
| + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; |
| + gomp_workshare_task_reduction_register (reductions, |
| + first_reductions); |
| + } |
| + /* NOTE(review): SCHED is not read again below in this function.  */ |
| + sched = thr->ts.work_share->sched; |
| + } |
| + |
| + if (mem) |
| + *mem = thr->ts.work_share->doacross->extra; |
| + |
| + return ialias_call (GOMP_loop_runtime_next) (istart, iend); |
| +} |
| + |
| +/* The *_next routines are called when the thread completes processing of |
| + the iteration block currently assigned to it. If the work-share |
| construct is bound directly to a parallel construct, then the iteration |
| bounds may have been set up before the parallel. In which case, this |
| may be the first iteration for the thread. |
| @@ -456,7 +673,7 @@ bool |
| GOMP_loop_runtime_next (long *istart, long *iend) |
| { |
| struct gomp_thread *thr = gomp_thread (); |
| - |
| + |
| switch (thr->ts.work_share->sched) |
| { |
| case GFS_STATIC: |
| @@ -534,7 +751,7 @@ bool |
| GOMP_loop_ordered_runtime_next (long *istart, long *iend) |
| { |
| struct gomp_thread *thr = gomp_thread (); |
| - |
| + |
| switch (thr->ts.work_share->sched) |
| { |
| case GFS_STATIC: |
| @@ -563,7 +780,7 @@ gomp_parallel_loop_start (void (*fn) (vo |
| num_threads = gomp_resolve_num_threads (num_threads, 0); |
| team = gomp_new_team (num_threads); |
| gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size); |
| - gomp_team_start (fn, data, num_threads, flags, team); |
| + gomp_team_start (fn, data, num_threads, flags, team, NULL); |
| } |
| |
| void |
| @@ -600,7 +817,8 @@ GOMP_parallel_loop_runtime_start (void ( |
| { |
| struct gomp_task_icv *icv = gomp_icv (false); |
| gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, |
| - icv->run_sched_var, icv->run_sched_chunk_size, 0); |
| + icv->run_sched_var & ~GFS_MONOTONIC, |
| + icv->run_sched_chunk_size, 0); |
| } |
| |
| ialias_redirect (GOMP_parallel_end) |
| @@ -638,11 +856,28 @@ GOMP_parallel_loop_guided (void (*fn) (v |
| GOMP_parallel_end (); |
| } |
| |
| +/* Combined parallel and loop entry point for the runtime schedule.  The |
| + schedule kind is the run_sched_var ICV with the GFS_MONOTONIC bit cleared |
| + and the chunk size is the run_sched_chunk_size ICV.  After the team has |
| + been set up through gomp_parallel_loop_start, FN (DATA) is called |
| + directly from this function and then GOMP_parallel_end is called.  */ |
| +void |
| +GOMP_parallel_loop_runtime (void (*fn) (void *), void *data, |
| + unsigned num_threads, long start, long end, |
| + long incr, unsigned flags) |
| +{ |
| + struct gomp_task_icv *icv = gomp_icv (false); |
| + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, |
| + icv->run_sched_var & ~GFS_MONOTONIC, |
| + icv->run_sched_chunk_size, flags); |
| + fn (data); |
| + GOMP_parallel_end (); |
| +} |
| + |
| #ifdef HAVE_ATTRIBUTE_ALIAS |
| extern __typeof(GOMP_parallel_loop_dynamic) GOMP_parallel_loop_nonmonotonic_dynamic |
| __attribute__((alias ("GOMP_parallel_loop_dynamic"))); |
| extern __typeof(GOMP_parallel_loop_guided) GOMP_parallel_loop_nonmonotonic_guided |
| __attribute__((alias ("GOMP_parallel_loop_guided"))); |
| +extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_nonmonotonic_runtime |
| + __attribute__((alias ("GOMP_parallel_loop_runtime"))); |
| +extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_maybe_nonmonotonic_runtime |
| + __attribute__((alias ("GOMP_parallel_loop_runtime"))); |
| #else |
| void |
| GOMP_parallel_loop_nonmonotonic_dynamic (void (*fn) (void *), void *data, |
| @@ -667,21 +902,35 @@ GOMP_parallel_loop_nonmonotonic_guided ( |
| fn (data); |
| GOMP_parallel_end (); |
| } |
| -#endif |
| |
| void |
| -GOMP_parallel_loop_runtime (void (*fn) (void *), void *data, |
| - unsigned num_threads, long start, long end, |
| - long incr, unsigned flags) |
| +GOMP_parallel_loop_nonmonotonic_runtime (void (*fn) (void *), void *data, |
| + unsigned num_threads, long start, |
| + long end, long incr, unsigned flags) |
| { |
| struct gomp_task_icv *icv = gomp_icv (false); |
| gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, |
| - icv->run_sched_var, icv->run_sched_chunk_size, |
| - flags); |
| + icv->run_sched_var & ~GFS_MONOTONIC, |
| + icv->run_sched_chunk_size, flags); |
| fn (data); |
| GOMP_parallel_end (); |
| } |
| |
| +/* Out-of-line definition used when HAVE_ATTRIBUTE_ALIAS is not defined; |
| + with attribute aliases this symbol is an alias of |
| + GOMP_parallel_loop_runtime.  It performs the same steps: start the team |
| + with the schedule from the run_sched_var ICV (GFS_MONOTONIC bit cleared) |
| + and the run_sched_chunk_size ICV, call FN (DATA), then call |
| + GOMP_parallel_end.  */ |
| +void |
| +GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*fn) (void *), void *data, |
| + unsigned num_threads, long start, |
| + long end, long incr, |
| + unsigned flags) |
| +{ |
| + struct gomp_task_icv *icv = gomp_icv (false); |
| + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, |
| + icv->run_sched_var & ~GFS_MONOTONIC, |
| + icv->run_sched_chunk_size, flags); |
| + fn (data); |
| + GOMP_parallel_end (); |
| +} |
| +#endif |
| + |
| /* The GOMP_loop_end* routines are called after the thread is told that |
| all loop iterations are complete. The first two versions synchronize |
| all threads; the nowait version does not. */ |
| @@ -721,6 +970,10 @@ extern __typeof(gomp_loop_dynamic_start) |
| __attribute__((alias ("gomp_loop_dynamic_start"))); |
| extern __typeof(gomp_loop_guided_start) GOMP_loop_nonmonotonic_guided_start |
| __attribute__((alias ("gomp_loop_guided_start"))); |
| +extern __typeof(GOMP_loop_runtime_start) GOMP_loop_nonmonotonic_runtime_start |
| + __attribute__((alias ("GOMP_loop_runtime_start"))); |
| +extern __typeof(GOMP_loop_runtime_start) GOMP_loop_maybe_nonmonotonic_runtime_start |
| + __attribute__((alias ("GOMP_loop_runtime_start"))); |
| |
| extern __typeof(gomp_loop_ordered_static_start) GOMP_loop_ordered_static_start |
| __attribute__((alias ("gomp_loop_ordered_static_start"))); |
| @@ -746,6 +999,10 @@ extern __typeof(gomp_loop_dynamic_next) |
| __attribute__((alias ("gomp_loop_dynamic_next"))); |
| extern __typeof(gomp_loop_guided_next) GOMP_loop_nonmonotonic_guided_next |
| __attribute__((alias ("gomp_loop_guided_next"))); |
| +extern __typeof(GOMP_loop_runtime_next) GOMP_loop_nonmonotonic_runtime_next |
| + __attribute__((alias ("GOMP_loop_runtime_next"))); |
| +extern __typeof(GOMP_loop_runtime_next) GOMP_loop_maybe_nonmonotonic_runtime_next |
| + __attribute__((alias ("GOMP_loop_runtime_next"))); |
| |
| extern __typeof(gomp_loop_ordered_static_next) GOMP_loop_ordered_static_next |
| __attribute__((alias ("gomp_loop_ordered_static_next"))); |
| @@ -791,6 +1048,20 @@ GOMP_loop_nonmonotonic_guided_start (lon |
| } |
| |
| +/* Forward to GOMP_loop_runtime_start with the arguments unchanged. |
| + NOTE(review): presumably the variant for builds without attribute alias |
| + support, since the same symbol is also declared as an alias in this |
| + file -- confirm against the enclosing preprocessor conditional.  */ |
| bool |
| +GOMP_loop_nonmonotonic_runtime_start (long start, long end, long incr, |
| + long *istart, long *iend) |
| +{ |
| + return GOMP_loop_runtime_start (start, end, incr, istart, iend); |
| +} |
| + |
| +/* Forward to GOMP_loop_runtime_start with the arguments unchanged. |
| + NOTE(review): presumably the variant for builds without attribute alias |
| + support, since the same symbol is also declared as an alias in this |
| + file -- confirm against the enclosing preprocessor conditional.  */ |
| +bool |
| +GOMP_loop_maybe_nonmonotonic_runtime_start (long start, long end, long incr, |
| + long *istart, long *iend) |
| +{ |
| + return GOMP_loop_runtime_start (start, end, incr, istart, iend); |
| +} |
| + |
| +bool |
| GOMP_loop_ordered_static_start (long start, long end, long incr, |
| long chunk_size, long *istart, long *iend) |
| { |
| @@ -869,6 +1140,18 @@ GOMP_loop_nonmonotonic_guided_next (long |
| } |
| |
| +/* Forward to GOMP_loop_runtime_next with the arguments unchanged. |
| + NOTE(review): presumably the variant for builds without attribute alias |
| + support, since the same symbol is also declared as an alias in this |
| + file -- confirm against the enclosing preprocessor conditional.  */ |
| bool |
| +GOMP_loop_nonmonotonic_runtime_next (long *istart, long *iend) |
| +{ |
| + return GOMP_loop_runtime_next (istart, iend); |
| +} |
| + |
| +/* Forward to GOMP_loop_runtime_next with the arguments unchanged. |
| + NOTE(review): presumably the variant for builds without attribute alias |
| + support, since the same symbol is also declared as an alias in this |
| + file -- confirm against the enclosing preprocessor conditional.  */ |
| +bool |
| +GOMP_loop_maybe_nonmonotonic_runtime_next (long *istart, long *iend) |
| +{ |
| + return GOMP_loop_runtime_next (istart, iend); |
| +} |
| + |
| +bool |
| GOMP_loop_ordered_static_next (long *istart, long *iend) |
| { |
| return gomp_loop_ordered_static_next (istart, iend); |
| |
| |
| @@ -49,3 +49,14 @@ GOMP_PLUGIN_acc_thread (void) |
| struct goacc_thread *thr = goacc_thread (); |
| return thr ? thr->target_tls : NULL; |
| } |
| + |
| +/* Return entry I of goacc_default_dims.  I must be smaller than |
| + GOMP_DIM_MAX; any other value is reported through gomp_fatal.  */ |
| +int |
| +GOMP_PLUGIN_acc_default_dim (unsigned int i) |
| +{ |
| + if (i >= GOMP_DIM_MAX) |
| + { |
| + /* I is unsigned, so it is printed with %u rather than %d.  */ |
| + gomp_fatal ("invalid dimension argument: %u", i); |
| + return -1; |
| + } |
| + return goacc_default_dims[i]; |
| +} |
| |
| |
| @@ -1,4 +1,4 @@ |
| -/* Copyright (C) 2005-2018 Free Software Foundation, Inc. |
| +/* Copyright (C) 2005-2019 Free Software Foundation, Inc. |
| Contributed by Richard Henderson <rth@redhat.com>. |
| |
| This file is part of the GNU Offloading and Multi Processing Library |
| @@ -31,6 +31,7 @@ |
| |
| #include <stdbool.h> |
| #include <stddef.h> |
| +#include "gstdint.h" |
| |
| /* barrier.c */ |
| |
| @@ -56,6 +57,12 @@ extern bool GOMP_loop_nonmonotonic_dynam |
| long *, long *); |
| extern bool GOMP_loop_nonmonotonic_guided_start (long, long, long, long, |
| long *, long *); |
| +extern bool GOMP_loop_nonmonotonic_runtime_start (long, long, long, |
| + long *, long *); |
| +extern bool GOMP_loop_maybe_nonmonotonic_runtime_start (long, long, long, |
| + long *, long *); |
| +extern bool GOMP_loop_start (long, long, long, long, long, long *, long *, |
| + uintptr_t *, void **); |
| |
| extern bool GOMP_loop_ordered_static_start (long, long, long, long, |
| long *, long *); |
| @@ -64,6 +71,8 @@ extern bool GOMP_loop_ordered_dynamic_st |
| extern bool GOMP_loop_ordered_guided_start (long, long, long, long, |
| long *, long *); |
| extern bool GOMP_loop_ordered_runtime_start (long, long, long, long *, long *); |
| +extern bool GOMP_loop_ordered_start (long, long, long, long, long, long *, |
| + long *, uintptr_t *, void **); |
| |
| extern bool GOMP_loop_static_next (long *, long *); |
| extern bool GOMP_loop_dynamic_next (long *, long *); |
| @@ -71,6 +80,8 @@ extern bool GOMP_loop_guided_next (long |
| extern bool GOMP_loop_runtime_next (long *, long *); |
| extern bool GOMP_loop_nonmonotonic_dynamic_next (long *, long *); |
| extern bool GOMP_loop_nonmonotonic_guided_next (long *, long *); |
| +extern bool GOMP_loop_nonmonotonic_runtime_next (long *, long *); |
| +extern bool GOMP_loop_maybe_nonmonotonic_runtime_next (long *, long *); |
| |
| extern bool GOMP_loop_ordered_static_next (long *, long *); |
| extern bool GOMP_loop_ordered_dynamic_next (long *, long *); |
| @@ -85,6 +96,8 @@ extern bool GOMP_loop_doacross_guided_st |
| long *); |
| extern bool GOMP_loop_doacross_runtime_start (unsigned, long *, long *, |
| long *); |
| +extern bool GOMP_loop_doacross_start (unsigned, long *, long, long, long *, |
| + long *, uintptr_t *, void **); |
| |
| extern void GOMP_parallel_loop_static_start (void (*)(void *), void *, |
| unsigned, long, long, long, long); |
| @@ -112,6 +125,13 @@ extern void GOMP_parallel_loop_nonmonoto |
| extern void GOMP_parallel_loop_nonmonotonic_guided (void (*)(void *), void *, |
| unsigned, long, long, |
| long, long, unsigned); |
| +extern void GOMP_parallel_loop_nonmonotonic_runtime (void (*)(void *), void *, |
| + unsigned, long, long, |
| + long, unsigned); |
| +extern void GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*)(void *), |
| + void *, unsigned, |
| + long, long, |
| + long, unsigned); |
| |
| extern void GOMP_loop_end (void); |
| extern void GOMP_loop_end_nowait (void); |
| @@ -154,6 +174,21 @@ extern bool GOMP_loop_ull_nonmonotonic_g |
| unsigned long long, |
| unsigned long long *, |
| unsigned long long *); |
| +extern bool GOMP_loop_ull_nonmonotonic_runtime_start (bool, unsigned long long, |
| + unsigned long long, |
| + unsigned long long, |
| + unsigned long long *, |
| + unsigned long long *); |
| +extern bool GOMP_loop_ull_maybe_nonmonotonic_runtime_start (bool, |
| + unsigned long long, |
| + unsigned long long, |
| + unsigned long long, |
| + unsigned long long *, |
| + unsigned long long *); |
| +extern bool GOMP_loop_ull_start (bool, unsigned long long, unsigned long long, |
| + unsigned long long, long, unsigned long long, |
| + unsigned long long *, unsigned long long *, |
| + uintptr_t *, void **); |
| |
| extern bool GOMP_loop_ull_ordered_static_start (bool, unsigned long long, |
| unsigned long long, |
| @@ -178,6 +213,13 @@ extern bool GOMP_loop_ull_ordered_runtim |
| unsigned long long, |
| unsigned long long *, |
| unsigned long long *); |
| +extern bool GOMP_loop_ull_ordered_start (bool, unsigned long long, |
| + unsigned long long, |
| + unsigned long long, long, |
| + unsigned long long, |
| + unsigned long long *, |
| + unsigned long long *, |
| + uintptr_t *, void **); |
| |
| extern bool GOMP_loop_ull_static_next (unsigned long long *, |
| unsigned long long *); |
| @@ -191,6 +233,10 @@ extern bool GOMP_loop_ull_nonmonotonic_d |
| unsigned long long *); |
| extern bool GOMP_loop_ull_nonmonotonic_guided_next (unsigned long long *, |
| unsigned long long *); |
| +extern bool GOMP_loop_ull_nonmonotonic_runtime_next (unsigned long long *, |
| + unsigned long long *); |
| +extern bool GOMP_loop_ull_maybe_nonmonotonic_runtime_next (unsigned long long *, |
| + unsigned long long *); |
| |
| extern bool GOMP_loop_ull_ordered_static_next (unsigned long long *, |
| unsigned long long *); |
| @@ -220,6 +266,11 @@ extern bool GOMP_loop_ull_doacross_runti |
| unsigned long long *, |
| unsigned long long *, |
| unsigned long long *); |
| +extern bool GOMP_loop_ull_doacross_start (unsigned, unsigned long long *, |
| + long, unsigned long long, |
| + unsigned long long *, |
| + unsigned long long *, |
| + uintptr_t *, void **); |
| |
| /* ordered.c */ |
| |
| @@ -235,6 +286,8 @@ extern void GOMP_doacross_ull_wait (unsi |
| extern void GOMP_parallel_start (void (*) (void *), void *, unsigned); |
| extern void GOMP_parallel_end (void); |
| extern void GOMP_parallel (void (*) (void *), void *, unsigned, unsigned); |
| +extern unsigned GOMP_parallel_reductions (void (*) (void *), void *, unsigned, |
| + unsigned); |
| extern bool GOMP_cancel (int, bool); |
| extern bool GOMP_cancellation_point (int); |
| |
| @@ -251,13 +304,19 @@ extern void GOMP_taskloop_ull (void (*) |
| unsigned long long, unsigned long long, |
| unsigned long long); |
| extern void GOMP_taskwait (void); |
| +extern void GOMP_taskwait_depend (void **); |
| extern void GOMP_taskyield (void); |
| extern void GOMP_taskgroup_start (void); |
| extern void GOMP_taskgroup_end (void); |
| +extern void GOMP_taskgroup_reduction_register (uintptr_t *); |
| +extern void GOMP_taskgroup_reduction_unregister (uintptr_t *); |
| +extern void GOMP_task_reduction_remap (size_t, size_t, void **); |
| +extern void GOMP_workshare_task_reduction_unregister (bool); |
| |
| /* sections.c */ |
| |
| extern unsigned GOMP_sections_start (unsigned); |
| +extern unsigned GOMP_sections2_start (unsigned, uintptr_t *, void **); |
| extern unsigned GOMP_sections_next (void); |
| extern void GOMP_parallel_sections_start (void (*) (void *), void *, |
| unsigned, unsigned); |
| @@ -293,6 +352,11 @@ extern void GOMP_target_enter_exit_data |
| void **); |
| extern void GOMP_teams (unsigned int, unsigned int); |
| |
| +/* teams.c */ |
| + |
| +extern void GOMP_teams_reg (void (*) (void *), void *, unsigned, unsigned, |
| + unsigned); |
| + |
| /* oacc-parallel.c */ |
| |
| extern void GOACC_parallel_keyed (int, void (*) (void *), size_t, |
| |
| |
| @@ -26,6 +26,8 @@ |
| /* This is a generic stub implementation of a CPU affinity setting. */ |
| |
| #include "libgomp.h" |
| +#include <string.h> |
| +#include <stdio.h> |
| |
| void |
| gomp_init_affinity (void) |
| @@ -138,5 +140,17 @@ gomp_get_place_proc_ids_8 (int place_num |
| (void) ids; |
| } |
| |
| +/* Generic stub: PLACE is not used.  Formats the string "0-N", with N being |
| + gomp_available_cpus - 1, when more than one CPU is available, and "0" |
| + otherwise, then hands it to gomp_display_string together with BUFFER, |
| + SIZE and RET.  */ |
| +void |
| +gomp_display_affinity_place (char *buffer, size_t size, size_t *ret, |
| + int place) |
| +{ |
| + /* Room for "0-", the decimal digits of an unsigned long and the NUL.  */ |
| + char buf[sizeof (long) * 3 + 4]; |
| + if (gomp_available_cpus > 1) |
| + sprintf (buf, "0-%lu", gomp_available_cpus - 1); |
| + else |
| + strcpy (buf, "0"); |
| + gomp_display_string (buffer, size, ret, buf, strlen (buf)); |
| +} |
| + |
| ialias(omp_get_place_num_procs) |
| ialias(omp_get_place_proc_ids) |
| |
| |
| @@ -26,8 +26,11 @@ |
| /* This file handles the SECTIONS construct. */ |
| |
| #include "libgomp.h" |
| +#include <string.h> |
| |
| |
| +ialias_redirect (GOMP_taskgroup_reduction_register) |
| + |
| /* Initialize the given work share construct from the given arguments. */ |
| |
| static inline void |
| @@ -72,7 +75,7 @@ GOMP_sections_start (unsigned count) |
| struct gomp_thread *thr = gomp_thread (); |
| long s, e, ret; |
| |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_sections_init (thr->ts.work_share, count); |
| gomp_work_share_init_done (); |
| @@ -95,6 +98,66 @@ GOMP_sections_start (unsigned count) |
| return ret; |
| } |
| |
| +/* Start a SECTIONS construct with COUNT sections, optionally with task |
| + reductions and a scratch area. |
| + |
| + REDUCTIONS, if non-NULL, is registered as task reduction data for the |
| + construct.  MEM, if non-NULL, requests a scratch area: on entry *MEM holds |
| + the requested size in bytes, on return it holds the work share's |
| + ordered_team_ids pointer.  Sizes larger than the tail of struct |
| + gomp_work_share starting at inline_ordered_team_ids get a |
| + gomp_malloc_cleared allocation; smaller ones reuse the existing |
| + ordered_team_ids buffer, which is zeroed with memset. |
| + |
| + Returns the start value delivered by the dynamic iterator, or 0 if the |
| + iterator reports that nothing is left.  */ |
| +unsigned |
| +GOMP_sections2_start (unsigned count, uintptr_t *reductions, void **mem) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + long s, e, ret; |
| + |
| + if (reductions) |
| + gomp_workshare_taskgroup_start (); |
| + /* A true result means this thread has to initialize the work share.  */ |
| + if (gomp_work_share_start (0)) |
| + { |
| + gomp_sections_init (thr->ts.work_share, count); |
| + if (reductions) |
| + { |
| + GOMP_taskgroup_reduction_register (reductions); |
| + thr->task->taskgroup->workshare = true; |
| + /* Saved so that the other threads can find it in the else branch.  */ |
| + thr->ts.work_share->task_reductions = reductions; |
| + } |
| + if (mem) |
| + { |
| + uintptr_t size = (uintptr_t) *mem; |
| + if (size > (sizeof (struct gomp_work_share) |
| + - offsetof (struct gomp_work_share, |
| + inline_ordered_team_ids))) |
| + thr->ts.work_share->ordered_team_ids |
| + = gomp_malloc_cleared (size); |
| + else |
| + memset (thr->ts.work_share->ordered_team_ids, '\0', size); |
| + *mem = (void *) thr->ts.work_share->ordered_team_ids; |
| + } |
| + gomp_work_share_init_done (); |
| + } |
| + else |
| + { |
| + /* The work share was not initialized by this thread: register this |
| + thread's reductions against the ones stored in the work share and |
| + pick up the scratch pointer stored there.  */ |
| + if (reductions) |
| + { |
| + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; |
| + gomp_workshare_task_reduction_register (reductions, |
| + first_reductions); |
| + } |
| + if (mem) |
| + *mem = (void *) thr->ts.work_share->ordered_team_ids; |
| + } |
| + |
| + /* Without sync builtins the locked iterator is used under the work |
| + share lock.  */ |
| +#ifdef HAVE_SYNC_BUILTINS |
| + if (gomp_iter_dynamic_next (&s, &e)) |
| + ret = s; |
| + else |
| + ret = 0; |
| +#else |
| + gomp_mutex_lock (&thr->ts.work_share->lock); |
| + if (gomp_iter_dynamic_next_locked (&s, &e)) |
| + ret = s; |
| + else |
| + ret = 0; |
| + gomp_mutex_unlock (&thr->ts.work_share->lock); |
| +#endif |
| + |
| + return ret; |
| +} |
| + |
| /* This routine is called when the thread completes processing of the |
| section currently assigned to it. If the work-share construct is |
| bound directly to a parallel construct, then the construct may have |
| @@ -140,7 +203,7 @@ GOMP_parallel_sections_start (void (*fn) |
| num_threads = gomp_resolve_num_threads (num_threads, count); |
| team = gomp_new_team (num_threads); |
| gomp_sections_init (&team->work_shares[0], count); |
| - gomp_team_start (fn, data, num_threads, 0, team); |
| + gomp_team_start (fn, data, num_threads, 0, team, NULL); |
| } |
| |
| ialias_redirect (GOMP_parallel_end) |
| @@ -154,7 +217,7 @@ GOMP_parallel_sections (void (*fn) (void |
| num_threads = gomp_resolve_num_threads (num_threads, count); |
| team = gomp_new_team (num_threads); |
| gomp_sections_init (&team->work_shares[0], count); |
| - gomp_team_start (fn, data, num_threads, flags, team); |
| + gomp_team_start (fn, data, num_threads, flags, team, NULL); |
| fn (data); |
| GOMP_parallel_end (); |
| } |
| |
| |
| @@ -396,6 +396,56 @@ gomp_get_place_proc_ids_8 (int place_num |
| *ids++ = i; |
| } |
| |
| +/* Append a textual list of the CPUs of a place to the output described by |
| + BUFFER, SIZE and RET, using gomp_display_string. |
| + |
| + The CPU set is gomp_places_list[PLACE] when PLACE is a valid index, else |
| + gomp_cpusetp when that is non-NULL.  With neither available the string is |
| + "0-N", with N being gomp_available_cpus - 1, or just "0" for a single CPU. |
| + |
| + A CPU set is printed as comma separated runs of consecutive CPU numbers: |
| + a run of one CPU as the number alone, a longer run as FIRST-LAST, for |
| + example "0-3,8".  */ |
| +void |
| +gomp_display_affinity_place (char *buffer, size_t size, size_t *ret, |
| + int place) |
| +{ |
| + cpu_set_t *cpusetp; |
| + char buf[sizeof (long) * 3 + 4]; |
| + if (place >= 0 && place < gomp_places_list_len) |
| + cpusetp = (cpu_set_t *) gomp_places_list[place]; |
| + else if (gomp_cpusetp) |
| + cpusetp = gomp_cpusetp; |
| + else |
| + { |
| + if (gomp_available_cpus > 1) |
| + sprintf (buf, "0-%lu", gomp_available_cpus - 1); |
| + else |
| + strcpy (buf, "0"); |
| + gomp_display_string (buffer, size, ret, buf, strlen (buf)); |
| + return; |
| + } |
| + |
| + /* MAX is the number of bits in the set.  START holds the first CPU of the |
| + current run; while it still equals MAX no run has been printed yet.  */ |
| + unsigned long i, max = 8 * gomp_cpuset_size, start; |
| + bool prev_set = false; |
| + start = max; |
| + /* The extra iteration with I == MAX acts as an unset bit so that a run |
| + reaching the last bit is closed as well.  */ |
| + for (i = 0; i <= max; i++) |
| + { |
| + bool this_set; |
| + if (i == max) |
| + this_set = false; |
| + else |
| + this_set = CPU_ISSET_S (i, gomp_cpuset_size, cpusetp); |
| + if (this_set != prev_set) |
| + { |
| + prev_set = this_set; |
| + if (this_set) |
| + { |
| + /* A run begins: print its first CPU, preceded by a comma unless |
| + it is the first run.  */ |
| + char *p = buf; |
| + if (start != max) |
| + *p++ = ','; |
| + sprintf (p, "%lu", i); |
| + start = i; |
| + } |
| + else if (i == start + 1) |
| + /* The run had a single CPU, which has been printed already.  */ |
| + continue; |
| + else |
| + sprintf (buf, "-%lu", i - 1); |
| + gomp_display_string (buffer, size, ret, buf, strlen (buf)); |
| + } |
| + } |
| +} |
| + |
| ialias(omp_get_place_num_procs) |
| ialias(omp_get_place_proc_ids) |
| |
| |
| |
| @@ -45,8 +45,8 @@ sys_futex0(int *addr, int op, int val) |
| "=r"(r8), "=r"(r10) |
| : "r"(r15), "r"(out0), "r"(out1), "r"(out2), "r"(out3) |
| : "memory", "out4", "out5", "out6", "out7", |
| - /* Non-stacked integer registers, minus r8, r10, r15. */ |
| - "r2", "r3", "r9", "r11", "r12", "r13", "r14", "r16", "r17", "r18", |
| + /* Non-stacked integer registers, minus r8, r10, r12, r15. */ |
| + "r2", "r3", "r9", "r11", "r13", "r14", "r16", "r17", "r18", |
| "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", |
| "r28", "r29", "r30", "r31", |
| /* Predicate registers. */ |
| |
| |
| @@ -0,0 +1,57 @@ |
| +/* Copyright (C) 2015-2019 Free Software Foundation, Inc. |
| + Contributed by Alexander Monakov <amonakov@ispras.ru> |
| + |
| + This file is part of the GNU Offloading and Multi Processing Library |
| + (libgomp). |
| + |
| + Libgomp is free software; you can redistribute it and/or modify it |
| + under the terms of the GNU General Public License as published by |
| + the Free Software Foundation; either version 3, or (at your option) |
| + any later version. |
| + |
| + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY |
| + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| + FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
| + more details. |
| + |
| + Under Section 7 of GPL version 3, you are granted additional |
| + permissions described in the GCC Runtime Library Exception, version |
| + 3.1, as published by the Free Software Foundation. |
| + |
| + You should have received a copy of the GNU General Public License and |
| + a copy of the GCC Runtime Library Exception along with this program; |
| + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| + <http://www.gnu.org/licenses/>. */ |
| + |
| +/* This file defines OpenMP API entry points that accelerator targets are |
| + expected to replace. */ |
| + |
| +#include "libgomp.h" |
| + |
| +void |
| +GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams, |
| + unsigned int thread_limit, unsigned int flags) |
| +{ |
| + (void) fn; /* Empty stub: every argument is explicitly ignored. */ |
| + (void) data; |
| + (void) flags; |
| + (void) num_teams; |
| + (void) thread_limit; |
| +} |
| + |
| +int |
| +omp_get_num_teams (void) |
| +{ |
| + return gomp_num_teams_var + 1; /* Presumably the variable holds the count minus one; TODO confirm. */ |
| +} |
| + |
| +int |
| +omp_get_team_num (void) |
| +{ |
| + int ctaid; /* Written by the asm below from the %ctaid.x register. */ |
| + asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid)); |
| + return ctaid; |
| +} |
| + |
| +ialias (omp_get_num_teams) |
| +ialias (omp_get_team_num) |
| |
| |
| @@ -116,7 +116,8 @@ gomp_thread_start (struct gomp_thread_po |
| |
| void |
| gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, |
| - unsigned flags, struct gomp_team *team) |
| + unsigned flags, struct gomp_team *team, |
| + struct gomp_taskgroup *taskgroup) |
| { |
| struct gomp_thread *thr, *nthr; |
| struct gomp_task *task; |
| @@ -147,6 +148,7 @@ gomp_team_start (void (*fn) (void *), vo |
| nthreads_var = icv->nthreads_var; |
| gomp_init_task (thr->task, task, icv); |
| team->implicit_task[0].icv.nthreads_var = nthreads_var; |
| + team->implicit_task[0].taskgroup = taskgroup; |
| |
| if (nthreads == 1) |
| return; |
| @@ -166,6 +168,7 @@ gomp_team_start (void (*fn) (void *), vo |
| nthr->task = &team->implicit_task[i]; |
| gomp_init_task (nthr->task, task, icv); |
| team->implicit_task[i].icv.nthreads_var = nthreads_var; |
| + team->implicit_task[i].taskgroup = taskgroup; |
| nthr->fn = fn; |
| nthr->data = data; |
| team->ordered_release[i] = &nthr->release; |
| @@ -174,5 +177,11 @@ gomp_team_start (void (*fn) (void *), vo |
| gomp_simple_barrier_wait (&pool->threads_dock); |
| } |
| |
| +int |
| +gomp_pause_host (void) |
| +{ |
| + return -1; /* Unconditional; no other work is done here. */ |
| +} |
| + |
| #include "../../team.c" |
| #endif |
| |
| |
| @@ -1,358 +0,0 @@ |
| -/* OpenACC constructs |
| - |
| - Copyright (C) 2014-2018 Free Software Foundation, Inc. |
| - |
| - Contributed by Mentor Embedded. |
| - |
| - This file is part of the GNU Offloading and Multi Processing Library |
| - (libgomp). |
| - |
| - Libgomp is free software; you can redistribute it and/or modify it |
| - under the terms of the GNU General Public License as published by |
| - the Free Software Foundation; either version 3, or (at your option) |
| - any later version. |
| - |
| - Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY |
| - WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| - FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
| - more details. |
| - |
| - Under Section 7 of GPL version 3, you are granted additional |
| - permissions described in the GCC Runtime Library Exception, version |
| - 3.1, as published by the Free Software Foundation. |
| - |
| - You should have received a copy of the GNU General Public License and |
| - a copy of the GCC Runtime Library Exception along with this program; |
| - see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| - <http://www.gnu.org/licenses/>. */ |
| - |
| -#include "libgomp_g.h" |
| - |
| -__asm__ (".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1);\n" |
| - ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1);\n" |
| - ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1);\n" |
| - ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1);\n" |
| - "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_num_threads\n" |
| - ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n" |
| - "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_thread_num\n" |
| - ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n" |
| - "// BEGIN GLOBAL FUNCTION DECL: abort\n" |
| - ".extern .func abort;\n" |
| - ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1)\n" |
| - "{\n" |
| - ".reg .u32 %ar1;\n" |
| - ".reg .u32 %retval;\n" |
| - ".reg .u64 %hr10;\n" |
| - ".reg .u32 %r22;\n" |
| - ".reg .u32 %r23;\n" |
| - ".reg .u32 %r24;\n" |
| - ".reg .u32 %r25;\n" |
| - ".reg .u32 %r26;\n" |
| - ".reg .u32 %r27;\n" |
| - ".reg .u32 %r28;\n" |
| - ".reg .u32 %r29;\n" |
| - ".reg .pred %r30;\n" |
| - ".reg .u32 %r31;\n" |
| - ".reg .pred %r32;\n" |
| - ".reg .u32 %r33;\n" |
| - ".reg .pred %r34;\n" |
| - ".local .align 8 .b8 %frame[4];\n" |
| - "ld.param.u32 %ar1,[%in_ar1];\n" |
| - "mov.u32 %r27,%ar1;\n" |
| - "st.local.u32 [%frame],%r27;\n" |
| - "ld.local.u32 %r28,[%frame];\n" |
| - "mov.u32 %r29,1;\n" |
| - "setp.eq.u32 %r30,%r28,%r29;\n" |
| - "@%r30 bra $L4;\n" |
| - "mov.u32 %r31,2;\n" |
| - "setp.eq.u32 %r32,%r28,%r31;\n" |
| - "@%r32 bra $L5;\n" |
| - "mov.u32 %r33,0;\n" |
| - "setp.eq.u32 %r34,%r28,%r33;\n" |
| - "@!%r34 bra $L8;\n" |
| - "mov.u32 %r23,%tid.x;\n" |
| - "mov.u32 %r22,%r23;\n" |
| - "bra $L7;\n" |
| - "$L4:\n" |
| - "mov.u32 %r24,%tid.y;\n" |
| - "mov.u32 %r22,%r24;\n" |
| - "bra $L7;\n" |
| - "$L5:\n" |
| - "mov.u32 %r25,%tid.z;\n" |
| - "mov.u32 %r22,%r25;\n" |
| - "bra $L7;\n" |
| - "$L8:\n" |
| - "{\n" |
| - "{\n" |
| - "call abort;\n" |
| - "}\n" |
| - "}\n" |
| - "$L7:\n" |
| - "mov.u32 %r26,%r22;\n" |
| - "mov.u32 %retval,%r26;\n" |
| - "st.param.u32 [%out_retval],%retval;\n" |
| - "ret;\n" |
| - "}\n" |
| - ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1)\n" |
| - "{\n" |
| - ".reg .u32 %ar1;\n" |
| - ".reg .u32 %retval;\n" |
| - ".reg .u64 %hr10;\n" |
| - ".reg .u32 %r22;\n" |
| - ".reg .u32 %r23;\n" |
| - ".reg .u32 %r24;\n" |
| - ".reg .u32 %r25;\n" |
| - ".reg .u32 %r26;\n" |
| - ".reg .u32 %r27;\n" |
| - ".reg .u32 %r28;\n" |
| - ".reg .u32 %r29;\n" |
| - ".reg .pred %r30;\n" |
| - ".reg .u32 %r31;\n" |
| - ".reg .pred %r32;\n" |
| - ".reg .u32 %r33;\n" |
| - ".reg .pred %r34;\n" |
| - ".local .align 8 .b8 %frame[4];\n" |
| - "ld.param.u32 %ar1,[%in_ar1];\n" |
| - "mov.u32 %r27,%ar1;\n" |
| - "st.local.u32 [%frame],%r27;\n" |
| - "ld.local.u32 %r28,[%frame];\n" |
| - "mov.u32 %r29,1;\n" |
| - "setp.eq.u32 %r30,%r28,%r29;\n" |
| - "@%r30 bra $L11;\n" |
| - "mov.u32 %r31,2;\n" |
| - "setp.eq.u32 %r32,%r28,%r31;\n" |
| - "@%r32 bra $L12;\n" |
| - "mov.u32 %r33,0;\n" |
| - "setp.eq.u32 %r34,%r28,%r33;\n" |
| - "@!%r34 bra $L15;\n" |
| - "mov.u32 %r23,%ntid.x;\n" |
| - "mov.u32 %r22,%r23;\n" |
| - "bra $L14;\n" |
| - "$L11:\n" |
| - "mov.u32 %r24,%ntid.y;\n" |
| - "mov.u32 %r22,%r24;\n" |
| - "bra $L14;\n" |
| - "$L12:\n" |
| - "mov.u32 %r25,%ntid.z;\n" |
| - "mov.u32 %r22,%r25;\n" |
| - "bra $L14;\n" |
| - "$L15:\n" |
| - "{\n" |
| - "{\n" |
| - "call abort;\n" |
| - "}\n" |
| - "}\n" |
| - "$L14:\n" |
| - "mov.u32 %r26,%r22;\n" |
| - "mov.u32 %retval,%r26;\n" |
| - "st.param.u32 [%out_retval],%retval;\n" |
| - "ret;\n" |
| - "}\n" |
| - ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1)\n" |
| - "{\n" |
| - ".reg .u32 %ar1;\n" |
| - ".reg .u32 %retval;\n" |
| - ".reg .u64 %hr10;\n" |
| - ".reg .u32 %r22;\n" |
| - ".reg .u32 %r23;\n" |
| - ".reg .u32 %r24;\n" |
| - ".reg .u32 %r25;\n" |
| - ".reg .u32 %r26;\n" |
| - ".reg .u32 %r27;\n" |
| - ".reg .u32 %r28;\n" |
| - ".reg .u32 %r29;\n" |
| - ".reg .pred %r30;\n" |
| - ".reg .u32 %r31;\n" |
| - ".reg .pred %r32;\n" |
| - ".reg .u32 %r33;\n" |
| - ".reg .pred %r34;\n" |
| - ".local .align 8 .b8 %frame[4];\n" |
| - "ld.param.u32 %ar1,[%in_ar1];\n" |
| - "mov.u32 %r27,%ar1;\n" |
| - "st.local.u32 [%frame],%r27;\n" |
| - "ld.local.u32 %r28,[%frame];\n" |
| - "mov.u32 %r29,1;\n" |
| - "setp.eq.u32 %r30,%r28,%r29;\n" |
| - "@%r30 bra $L18;\n" |
| - "mov.u32 %r31,2;\n" |
| - "setp.eq.u32 %r32,%r28,%r31;\n" |
| - "@%r32 bra $L19;\n" |
| - "mov.u32 %r33,0;\n" |
| - "setp.eq.u32 %r34,%r28,%r33;\n" |
| - "@!%r34 bra $L22;\n" |
| - "mov.u32 %r23,%ctaid.x;\n" |
| - "mov.u32 %r22,%r23;\n" |
| - "bra $L21;\n" |
| - "$L18:\n" |
| - "mov.u32 %r24,%ctaid.y;\n" |
| - "mov.u32 %r22,%r24;\n" |
| - "bra $L21;\n" |
| - "$L19:\n" |
| - "mov.u32 %r25,%ctaid.z;\n" |
| - "mov.u32 %r22,%r25;\n" |
| - "bra $L21;\n" |
| - "$L22:\n" |
| - "{\n" |
| - "{\n" |
| - "call abort;\n" |
| - "}\n" |
| - "}\n" |
| - "$L21:\n" |
| - "mov.u32 %r26,%r22;\n" |
| - "mov.u32 %retval,%r26;\n" |
| - "st.param.u32 [%out_retval],%retval;\n" |
| - "ret;\n" |
| - "}\n" |
| - ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1)\n" |
| - "{\n" |
| - ".reg .u32 %ar1;\n" |
| - ".reg .u32 %retval;\n" |
| - ".reg .u64 %hr10;\n" |
| - ".reg .u32 %r22;\n" |
| - ".reg .u32 %r23;\n" |
| - ".reg .u32 %r24;\n" |
| - ".reg .u32 %r25;\n" |
| - ".reg .u32 %r26;\n" |
| - ".reg .u32 %r27;\n" |
| - ".reg .u32 %r28;\n" |
| - ".reg .u32 %r29;\n" |
| - ".reg .pred %r30;\n" |
| - ".reg .u32 %r31;\n" |
| - ".reg .pred %r32;\n" |
| - ".reg .u32 %r33;\n" |
| - ".reg .pred %r34;\n" |
| - ".local .align 8 .b8 %frame[4];\n" |
| - "ld.param.u32 %ar1,[%in_ar1];\n" |
| - "mov.u32 %r27,%ar1;\n" |
| - "st.local.u32 [%frame],%r27;\n" |
| - "ld.local.u32 %r28,[%frame];\n" |
| - "mov.u32 %r29,1;\n" |
| - "setp.eq.u32 %r30,%r28,%r29;\n" |
| - "@%r30 bra $L25;\n" |
| - "mov.u32 %r31,2;\n" |
| - "setp.eq.u32 %r32,%r28,%r31;\n" |
| - "@%r32 bra $L26;\n" |
| - "mov.u32 %r33,0;\n" |
| - "setp.eq.u32 %r34,%r28,%r33;\n" |
| - "@!%r34 bra $L29;\n" |
| - "mov.u32 %r23,%nctaid.x;\n" |
| - "mov.u32 %r22,%r23;\n" |
| - "bra $L28;\n" |
| - "$L25:\n" |
| - "mov.u32 %r24,%nctaid.y;\n" |
| - "mov.u32 %r22,%r24;\n" |
| - "bra $L28;\n" |
| - "$L26:\n" |
| - "mov.u32 %r25,%nctaid.z;\n" |
| - "mov.u32 %r22,%r25;\n" |
| - "bra $L28;\n" |
| - "$L29:\n" |
| - "{\n" |
| - "{\n" |
| - "call abort;\n" |
| - "}\n" |
| - "}\n" |
| - "$L28:\n" |
| - "mov.u32 %r26,%r22;\n" |
| - "mov.u32 %retval,%r26;\n" |
| - "st.param.u32 [%out_retval],%retval;\n" |
| - "ret;\n" |
| - "}\n" |
| - "// BEGIN GLOBAL FUNCTION DEF: GOACC_get_num_threads\n" |
| - ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads\n" |
| - "{\n" |
| - ".reg .u32 %retval;\n" |
| - ".reg .u64 %hr10;\n" |
| - ".reg .u32 %r22;\n" |
| - ".reg .u32 %r23;\n" |
| - ".reg .u32 %r24;\n" |
| - ".reg .u32 %r25;\n" |
| - ".reg .u32 %r26;\n" |
| - ".reg .u32 %r27;\n" |
| - ".reg .u32 %r28;\n" |
| - ".reg .u32 %r29;\n" |
| - "mov.u32 %r26,0;\n" |
| - "{\n" |
| - ".param .u32 %retval_in;\n" |
| - "{\n" |
| - ".param .u32 %out_arg0;\n" |
| - "st.param.u32 [%out_arg0],%r26;\n" |
| - "call (%retval_in),GOACC_ntid,(%out_arg0);\n" |
| - "}\n" |
| - "ld.param.u32 %r27,[%retval_in];\n" |
| - "}\n" |
| - "mov.u32 %r22,%r27;\n" |
| - "mov.u32 %r28,0;\n" |
| - "{\n" |
| - ".param .u32 %retval_in;\n" |
| - "{\n" |
| - ".param .u32 %out_arg0;\n" |
| - "st.param.u32 [%out_arg0],%r28;\n" |
| - "call (%retval_in),GOACC_nctaid,(%out_arg0);\n" |
| - "}\n" |
| - "ld.param.u32 %r29,[%retval_in];\n" |
| - "}\n" |
| - "mov.u32 %r23,%r29;\n" |
| - "mul.lo.u32 %r24,%r22,%r23;\n" |
| - "mov.u32 %r25,%r24;\n" |
| - "mov.u32 %retval,%r25;\n" |
| - "st.param.u32 [%out_retval],%retval;\n" |
| - "ret;\n" |
| - "}\n" |
| - "// BEGIN GLOBAL FUNCTION DEF: GOACC_get_thread_num\n" |
| - ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num\n" |
| - "{\n" |
| - ".reg .u32 %retval;\n" |
| - ".reg .u64 %hr10;\n" |
| - ".reg .u32 %r22;\n" |
| - ".reg .u32 %r23;\n" |
| - ".reg .u32 %r24;\n" |
| - ".reg .u32 %r25;\n" |
| - ".reg .u32 %r26;\n" |
| - ".reg .u32 %r27;\n" |
| - ".reg .u32 %r28;\n" |
| - ".reg .u32 %r29;\n" |
| - ".reg .u32 %r30;\n" |
| - ".reg .u32 %r31;\n" |
| - ".reg .u32 %r32;\n" |
| - ".reg .u32 %r33;\n" |
| - "mov.u32 %r28,0;\n" |
| - "{\n" |
| - ".param .u32 %retval_in;\n" |
| - "{\n" |
| - ".param .u32 %out_arg0;\n" |
| - "st.param.u32 [%out_arg0],%r28;\n" |
| - "call (%retval_in),GOACC_ntid,(%out_arg0);\n" |
| - "}\n" |
| - "ld.param.u32 %r29,[%retval_in];\n" |
| - "}\n" |
| - "mov.u32 %r22,%r29;\n" |
| - "mov.u32 %r30,0;\n" |
| - "{\n" |
| - ".param .u32 %retval_in;\n" |
| - "{\n" |
| - ".param .u32 %out_arg0;\n" |
| - "st.param.u32 [%out_arg0],%r30;\n" |
| - "call (%retval_in),GOACC_ctaid,(%out_arg0);\n" |
| - "}\n" |
| - "ld.param.u32 %r31,[%retval_in];\n" |
| - "}\n" |
| - "mov.u32 %r23,%r31;\n" |
| - "mul.lo.u32 %r24,%r22,%r23;\n" |
| - "mov.u32 %r32,0;\n" |
| - "{\n" |
| - ".param .u32 %retval_in;\n" |
| - "{\n" |
| - ".param .u32 %out_arg0;\n" |
| - "st.param.u32 [%out_arg0],%r32;\n" |
| - "call (%retval_in),GOACC_tid,(%out_arg0);\n" |
| - "}\n" |
| - "ld.param.u32 %r33,[%retval_in];\n" |
| - "}\n" |
| - "mov.u32 %r25,%r33;\n" |
| - "add.u32 %r26,%r24,%r25;\n" |
| - "mov.u32 %r27,%r26;\n" |
| - "mov.u32 %retval,%r27;\n" |
| - "st.param.u32 [%out_retval],%retval;\n" |
| - "ret;\n" |
| - "}\n"); |
| |
| |
| @@ -47,3 +47,21 @@ GOMP_teams (unsigned int num_teams, unsi |
| } |
| gomp_num_teams_var = num_teams - 1; |
| } |
| + |
| +int |
| +omp_pause_resource (omp_pause_resource_t kind, int device_num) |
| +{ |
| + (void) kind; |
| + (void) device_num; |
| + return -1; /* Unconditional; KIND and DEVICE_NUM are ignored. */ |
| +} |
| + |
| +int |
| +omp_pause_resource_all (omp_pause_resource_t kind) |
| +{ |
| + (void) kind; |
| + return -1; /* Unconditional; KIND is ignored. */ |
| +} |
| + |
| +ialias (omp_pause_resource) |
| +ialias (omp_pause_resource_all) |
| |
| |
| @@ -46,20 +46,6 @@ omp_get_num_devices (void) |
| } |
| |
| int |
| -omp_get_num_teams (void) |
| -{ |
| - return gomp_num_teams_var + 1; |
| -} |
| - |
| -int |
| -omp_get_team_num (void) |
| -{ |
| - int ctaid; |
| - asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid)); |
| - return ctaid; |
| -} |
| - |
| -int |
| omp_is_initial_device (void) |
| { |
| /* NVPTX is an accelerator-only target. */ |
| @@ -69,6 +55,4 @@ omp_is_initial_device (void) |
| ialias (omp_set_default_device) |
| ialias (omp_get_default_device) |
| ialias (omp_get_num_devices) |
| -ialias (omp_get_num_teams) |
| -ialias (omp_get_team_num) |
| ialias (omp_is_initial_device) |
| |
| |
| @@ -0,0 +1,51 @@ |
| +/* Copyright (C) 2018-2019 Free Software Foundation, Inc. |
| + |
| + This file is part of the GNU Offloading and Multi Processing Library |
| + (libgomp). |
| + |
| + Libgomp is free software; you can redistribute it and/or modify it |
| + under the terms of the GNU General Public License as published by |
| + the Free Software Foundation; either version 3, or (at your option) |
| + any later version. |
| + |
| + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY |
| + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| + FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
| + more details. |
| + |
| + Under Section 7 of GPL version 3, you are granted additional |
| + permissions described in the GCC Runtime Library Exception, version |
| + 3.1, as published by the Free Software Foundation. |
| + |
| + You should have received a copy of the GNU General Public License and |
| + a copy of the GCC Runtime Library Exception along with this program; |
| + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| + <http://www.gnu.org/licenses/>. */ |
| + |
| +#include "libgomp.h" |
| +#include <string.h> |
| +#include <stdio.h> |
| +#include <stdlib.h> |
| +#ifdef HAVE_UNISTD_H |
| +#include <unistd.h> |
| +#endif |
| +#ifdef HAVE_INTTYPES_H |
| +# include <inttypes.h> /* For PRIx64. */ |
| +#endif |
| +#ifdef HAVE_UNAME |
| +#include <sys/utsname.h> |
| +#endif |
| + |
| +/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for nvptx, |
| + while the nvptx newlib implementation does not support those functions. |
| + Override the configure test results here. */ |
| +#undef HAVE_GETPID |
| +#undef HAVE_GETHOSTNAME |
| + |
| +/* The nvptx newlib implementation does not support fwrite, but it does support |
| + write. Map fwrite to write. */ |
| +#undef fwrite |
| +#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size)) |
| + |
| +#include "../../affinity-fmt.c" |
| + |
| |
| |
| @@ -0,0 +1,68 @@ |
| +/* Copyright (C) 2018-2019 Free Software Foundation, Inc. |
| + Contributed by Jakub Jelinek <jakub@redhat.com>. |
| + |
| + This file is part of the GNU Offloading and Multi Processing Library |
| + (libgomp). |
| + |
| + Libgomp is free software; you can redistribute it and/or modify it |
| + under the terms of the GNU General Public License as published by |
| + the Free Software Foundation; either version 3, or (at your option) |
| + any later version. |
| + |
| + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY |
| + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| + FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
| + more details. |
| + |
| + Under Section 7 of GPL version 3, you are granted additional |
| + permissions described in the GCC Runtime Library Exception, version |
| + 3.1, as published by the Free Software Foundation. |
| + |
| + You should have received a copy of the GNU General Public License and |
| + a copy of the GCC Runtime Library Exception along with this program; |
| + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| + <http://www.gnu.org/licenses/>. */ |
| + |
| +#include "libgomp.h" |
| +#include <string.h> |
| +#include <stdio.h> |
| +#include <stdlib.h> |
| +#ifdef HAVE_UNISTD_H |
| +#include <unistd.h> |
| +#endif |
| +#ifdef HAVE_INTTYPES_H |
| +# include <inttypes.h> /* For PRIx64. */ |
| +#endif |
| +#define WIN32_LEAN_AND_MEAN |
| +#include <windows.h> |
| +#include <errno.h> |
| + |
| +static int |
| +gomp_gethostname (char *name, size_t len) |
| +{ |
| +  /* The scratch buffer has MAX_COMPUTERNAME_LENGTH + 1 bytes because on |
| +     Win9x GetComputerName fails if the input size is less than that.  */ |
| +  char host[MAX_COMPUTERNAME_LENGTH + 1]; |
| +  DWORD nbytes = sizeof (host); |
| +  if (!GetComputerName (host, &nbytes)) |
| +    return -1; |
| + |
| +  nbytes = strlen (host) + 1; |
| +  if (nbytes <= len) |
| +    { |
| +      memcpy (name, host, (size_t) nbytes); |
| +      return 0; |
| +    } |
| + |
| +  /* NAME is too small.  Truncate as per POSIX spec: LEN bytes are copied, |
| +     no NUL terminator is added, and -1 is returned with errno EINVAL.  */ |
| +  errno = EINVAL; |
| +  memcpy (name, host, len); |
| +  return -1; |
| +} |
| + |
| +#undef gethostname |
| +#define gethostname gomp_gethostname |
| +#define HAVE_GETHOSTNAME 1 |
| + |
| +#include "../../affinity-fmt.c" |
| |
| |
| @@ -72,184 +72,5 @@ do_wait (int *addr, int val) |
| futex_wait (addr, val); |
| } |
| |
| -/* Everything below this point should be identical to the Linux |
| - implementation. */ |
| - |
| -void |
| -gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) |
| -{ |
| - if (__builtin_expect (state & BAR_WAS_LAST, 0)) |
| - { |
| - /* Next time we'll be awaiting TOTAL threads again. */ |
| - bar->awaited = bar->total; |
| - __atomic_store_n (&bar->generation, bar->generation + BAR_INCR, |
| - MEMMODEL_RELEASE); |
| - futex_wake ((int *) &bar->generation, INT_MAX); |
| - } |
| - else |
| - { |
| - do |
| - do_wait ((int *) &bar->generation, state); |
| - while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) == state); |
| - } |
| -} |
| - |
| -void |
| -gomp_barrier_wait (gomp_barrier_t *bar) |
| -{ |
| - gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); |
| -} |
| - |
| -/* Like gomp_barrier_wait, except that if the encountering thread |
| - is not the last one to hit the barrier, it returns immediately. |
| - The intended usage is that a thread which intends to gomp_barrier_destroy |
| - this barrier calls gomp_barrier_wait, while all other threads |
| - call gomp_barrier_wait_last. When gomp_barrier_wait returns, |
| - the barrier can be safely destroyed. */ |
| - |
| -void |
| -gomp_barrier_wait_last (gomp_barrier_t *bar) |
| -{ |
| - gomp_barrier_state_t state = gomp_barrier_wait_start (bar); |
| - if (state & BAR_WAS_LAST) |
| - gomp_barrier_wait_end (bar, state); |
| -} |
| - |
| -void |
| -gomp_team_barrier_wake (gomp_barrier_t *bar, int count) |
| -{ |
| - futex_wake ((int *) &bar->generation, count == 0 ? INT_MAX : count); |
| -} |
| - |
| -void |
| -gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) |
| -{ |
| - unsigned int generation, gen; |
| - |
| - if (__builtin_expect (state & BAR_WAS_LAST, 0)) |
| - { |
| - /* Next time we'll be awaiting TOTAL threads again. */ |
| - struct gomp_thread *thr = gomp_thread (); |
| - struct gomp_team *team = thr->ts.team; |
| - |
| - bar->awaited = bar->total; |
| - team->work_share_cancelled = 0; |
| - if (__builtin_expect (team->task_count, 0)) |
| - { |
| - gomp_barrier_handle_tasks (state); |
| - state &= ~BAR_WAS_LAST; |
| - } |
| - else |
| - { |
| - state &= ~BAR_CANCELLED; |
| - state += BAR_INCR - BAR_WAS_LAST; |
| - __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); |
| - futex_wake ((int *) &bar->generation, INT_MAX); |
| - return; |
| - } |
| - } |
| - |
| - generation = state; |
| - state &= ~BAR_CANCELLED; |
| - do |
| - { |
| - do_wait ((int *) &bar->generation, generation); |
| - gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); |
| - if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) |
| - { |
| - gomp_barrier_handle_tasks (state); |
| - gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); |
| - } |
| - generation |= gen & BAR_WAITING_FOR_TASK; |
| - } |
| - while (gen != state + BAR_INCR); |
| -} |
| - |
| -void |
| -gomp_team_barrier_wait (gomp_barrier_t *bar) |
| -{ |
| - gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); |
| -} |
| - |
| -void |
| -gomp_team_barrier_wait_final (gomp_barrier_t *bar) |
| -{ |
| - gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar); |
| - if (__builtin_expect (state & BAR_WAS_LAST, 0)) |
| - bar->awaited_final = bar->total; |
| - gomp_team_barrier_wait_end (bar, state); |
| -} |
| - |
| -bool |
| -gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar, |
| - gomp_barrier_state_t state) |
| -{ |
| - unsigned int generation, gen; |
| - |
| - if (__builtin_expect (state & BAR_WAS_LAST, 0)) |
| - { |
| - /* Next time we'll be awaiting TOTAL threads again. */ |
| - /* BAR_CANCELLED should never be set in state here, because |
| - cancellation means that at least one of the threads has been |
| - cancelled, thus on a cancellable barrier we should never see |
| - all threads to arrive. */ |
| - struct gomp_thread *thr = gomp_thread (); |
| - struct gomp_team *team = thr->ts.team; |
| - |
| - bar->awaited = bar->total; |
| - team->work_share_cancelled = 0; |
| - if (__builtin_expect (team->task_count, 0)) |
| - { |
| - gomp_barrier_handle_tasks (state); |
| - state &= ~BAR_WAS_LAST; |
| - } |
| - else |
| - { |
| - state += BAR_INCR - BAR_WAS_LAST; |
| - __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); |
| - futex_wake ((int *) &bar->generation, INT_MAX); |
| - return false; |
| - } |
| - } |
| - |
| - if (__builtin_expect (state & BAR_CANCELLED, 0)) |
| - return true; |
| - |
| - generation = state; |
| - do |
| - { |
| - do_wait ((int *) &bar->generation, generation); |
| - gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); |
| - if (__builtin_expect (gen & BAR_CANCELLED, 0)) |
| - return true; |
| - if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) |
| - { |
| - gomp_barrier_handle_tasks (state); |
| - gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); |
| - } |
| - generation |= gen & BAR_WAITING_FOR_TASK; |
| - } |
| - while (gen != state + BAR_INCR); |
| - |
| - return false; |
| -} |
| - |
| -bool |
| -gomp_team_barrier_wait_cancel (gomp_barrier_t *bar) |
| -{ |
| - return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar)); |
| -} |
| - |
| -void |
| -gomp_team_barrier_cancel (struct gomp_team *team) |
| -{ |
| - gomp_mutex_lock (&team->task_lock); |
| - if (team->barrier.generation & BAR_CANCELLED) |
| - { |
| - gomp_mutex_unlock (&team->task_lock); |
| - return; |
| - } |
| - team->barrier.generation |= BAR_CANCELLED; |
| - gomp_mutex_unlock (&team->task_lock); |
| - futex_wake ((int *) &team->barrier.generation, INT_MAX); |
| -} |
| +#define GOMP_WAIT_H 1 |
| +#include "../linux/bar.c" |
| |
| |
| @@ -0,0 +1,49 @@ |
| +/* Copyright (C) 2018-2019 Free Software Foundation, Inc. |
| + |
| + This file is part of the GNU Offloading and Multi Processing Library |
| + (libgomp). |
| + |
| + Libgomp is free software; you can redistribute it and/or modify it |
| + under the terms of the GNU General Public License as published by |
| + the Free Software Foundation; either version 3, or (at your option) |
| + any later version. |
| + |
| + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY |
| + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| + FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
| + more details. |
| + |
| + Under Section 7 of GPL version 3, you are granted additional |
| + permissions described in the GCC Runtime Library Exception, version |
| + 3.1, as published by the Free Software Foundation. |
| + |
| + You should have received a copy of the GNU General Public License and |
| + a copy of the GCC Runtime Library Exception along with this program; |
| + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| + <http://www.gnu.org/licenses/>. */ |
| + |
| +#include "libgomp.h" |
| +#include <string.h> |
| +#include <stdio.h> |
| +#include <stdlib.h> |
| +#ifdef HAVE_UNISTD_H |
| +#include <unistd.h> |
| +#endif |
| +#ifdef HAVE_INTTYPES_H |
| +# include <inttypes.h> /* For PRIx64. */ |
| +#endif |
| +#ifdef HAVE_UNAME |
| +#include <sys/utsname.h> |
| +#endif |
| + |
| +/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for RTEMS, |
| + but the extra information they give is of little value to the user. |
| + Override the configure test results here. */ |
| +#undef HAVE_GETPID |
| +#undef HAVE_GETHOSTNAME |
| + |
| +/* Avoid the complex fwrite() in favour of the simple write(). */ |
| +#undef fwrite |
| +#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size)) |
| + |
| +#include "../../affinity-fmt.c" |
| |
| |
| @@ -1,5 +1,8 @@ |
| /* config.h.in. Generated from configure.ac by autoheader. */ |
| |
| +/* Define to 1 if you have the `aligned_alloc' function. */ |
| +#undef HAVE_ALIGNED_ALLOC |
| + |
| /* Define to 1 if the target assembler supports .symver directive. */ |
| #undef HAVE_AS_SYMVER_DIRECTIVE |
| |
| @@ -33,9 +36,15 @@ |
| /* Define to 1 if you have the `getgid' function. */ |
| #undef HAVE_GETGID |
| |
| +/* Define if gethostname is supported. */ |
| +#undef HAVE_GETHOSTNAME |
| + |
| /* Define to 1 if you have the `getloadavg' function. */ |
| #undef HAVE_GETLOADAVG |
| |
| +/* Define if getpid is supported. */ |
| +#undef HAVE_GETPID |
| + |
| /* Define to 1 if you have the `getuid' function. */ |
| #undef HAVE_GETUID |
| |
| @@ -45,9 +54,15 @@ |
| /* Define to 1 if you have the `dl' library (-ldl). */ |
| #undef HAVE_LIBDL |
| |
| +/* Define to 1 if you have the `memalign' function. */ |
| +#undef HAVE_MEMALIGN |
| + |
| /* Define to 1 if you have the <memory.h> header file. */ |
| #undef HAVE_MEMORY_H |
| |
| +/* Define to 1 if you have the `posix_memalign' function. */ |
| +#undef HAVE_POSIX_MEMALIGN |
| + |
| /* Define if pthread_{,attr_}{g,s}etaffinity_np is supported. */ |
| #undef HAVE_PTHREAD_AFFINITY_NP |
| |
| @@ -103,9 +118,15 @@ |
| /* Define to 1 if the target supports thread-local storage. */ |
| #undef HAVE_TLS |
| |
| +/* Define if uname is supported and struct utsname has nodename field. */ |
| +#undef HAVE_UNAME |
| + |
| /* Define to 1 if you have the <unistd.h> header file. */ |
| #undef HAVE_UNISTD_H |
| |
| +/* Define to 1 if you have the `_aligned_malloc' function. */ |
| +#undef HAVE__ALIGNED_MALLOC |
| + |
| /* Define to 1 if you have the `__secure_getenv' function. */ |
| #undef HAVE___SECURE_GETENV |
| |
| @@ -125,8 +146,8 @@ |
| */ |
| #undef LT_OBJDIR |
| |
| -/* Define to offload targets, separated by commas. */ |
| -#undef OFFLOAD_TARGETS |
| +/* Define to offload plugins, separated by commas. */ |
| +#undef OFFLOAD_PLUGINS |
| |
| /* Name of package */ |
| #undef PACKAGE |
| |
| |
| @@ -0,0 +1,74 @@ |
| +/* Copyright (C) 2018-2019 Free Software Foundation, Inc. |
| + Contributed by Jakub Jelinek <jakub@redhat.com>. |
| + |
| + This file is part of the GNU Offloading and Multi Processing Library |
| + (libgomp). |
| + |
| + Libgomp is free software; you can redistribute it and/or modify it |
| + under the terms of the GNU General Public License as published by |
| + the Free Software Foundation; either version 3, or (at your option) |
| + any later version. |
| + |
| + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY |
| + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| + FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
| + more details. |
| + |
| + Under Section 7 of GPL version 3, you are granted additional |
| + permissions described in the GCC Runtime Library Exception, version |
| + 3.1, as published by the Free Software Foundation. |
| + |
| + You should have received a copy of the GNU General Public License and |
| + a copy of the GCC Runtime Library Exception along with this program; |
| + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| + <http://www.gnu.org/licenses/>. */ |
| + |
| +/* This file handles the host TEAMS construct. */ |
| + |
| +#include "libgomp.h" |
| +#include <limits.h> |
| + |
| +static unsigned gomp_num_teams = 1, gomp_team_num = 0; |
| + |
| +void |
| +GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams, |
| +                unsigned int thread_limit, unsigned int flags) |
| +{ |
| +  unsigned saved_limit = 0; |
| +  (void) flags; |
| +  if (thread_limit != 0) |
| +    { |
| +      /* Install the requested limit, remembering the previous one.  */ |
| +      struct gomp_task_icv *icv = gomp_icv (true); |
| +      saved_limit = icv->thread_limit_var; |
| +      icv->thread_limit_var |
| +        = thread_limit > INT_MAX ? UINT_MAX : thread_limit; |
| +    } |
| +  /* A NUM_TEAMS of zero selects three teams.  */ |
| +  num_teams = num_teams ? num_teams : 3; |
| +  gomp_num_teams = num_teams; |
| +  /* FN (DATA) is called once per team, one call after the other, |
| +     while gomp_team_num holds the number of the current team.  */ |
| +  for (gomp_team_num = 0; gomp_team_num < num_teams; gomp_team_num++) |
| +    fn (data); |
| +  /* Put both counters back to their initial values.  */ |
| +  gomp_team_num = 0; |
| +  gomp_num_teams = 1; |
| +  if (thread_limit != 0) |
| +    gomp_icv (true)->thread_limit_var = saved_limit; |
| +} |
| + |
| +int |
| +omp_get_num_teams (void) |
| +{ |
| + return gomp_num_teams; /* 1 except while GOMP_teams_reg is running its team loop. */ |
| +} |
| + |
| +int |
| +omp_get_team_num (void) |
| +{ |
| + return gomp_team_num; /* 0 except while GOMP_teams_reg is running its team loop. */ |
| +} |
| + |
| +ialias (omp_get_num_teams) |
| +ialias (omp_get_team_num) |
| |
| |
| @@ -164,6 +164,22 @@ OMP_4.5 { |
| omp_target_disassociate_ptr; |
| } OMP_4.0; |
| |
| +OMP_5.0 { |
| + global: |
| + omp_capture_affinity; |
| + omp_capture_affinity_; |
| + omp_display_affinity; |
| + omp_display_affinity_; |
| + omp_get_affinity_format; |
| + omp_get_affinity_format_; |
| + omp_set_affinity_format; |
| + omp_set_affinity_format_; |
| + omp_pause_resource; |
| + omp_pause_resource_; |
| + omp_pause_resource_all; |
| + omp_pause_resource_all_; |
| +} OMP_4.5; |
| + |
| GOMP_1.0 { |
| global: |
| GOMP_atomic_end; |
| @@ -298,6 +314,34 @@ GOMP_4.5 { |
| GOMP_parallel_loop_nonmonotonic_guided; |
| } GOMP_4.0.1; |
| |
| +GOMP_5.0 { |
| + global: |
| + GOMP_loop_doacross_start; |
| + GOMP_loop_maybe_nonmonotonic_runtime_next; |
| + GOMP_loop_maybe_nonmonotonic_runtime_start; |
| + GOMP_loop_nonmonotonic_runtime_next; |
| + GOMP_loop_nonmonotonic_runtime_start; |
| + GOMP_loop_ordered_start; |
| + GOMP_loop_start; |
| + GOMP_loop_ull_doacross_start; |
| + GOMP_loop_ull_maybe_nonmonotonic_runtime_next; |
| + GOMP_loop_ull_maybe_nonmonotonic_runtime_start; |
| + GOMP_loop_ull_nonmonotonic_runtime_next; |
| + GOMP_loop_ull_nonmonotonic_runtime_start; |
| + GOMP_loop_ull_ordered_start; |
| + GOMP_loop_ull_start; |
| + GOMP_parallel_loop_maybe_nonmonotonic_runtime; |
| + GOMP_parallel_loop_nonmonotonic_runtime; |
| + GOMP_parallel_reductions; |
| + GOMP_sections2_start; |
| + GOMP_taskgroup_reduction_register; |
| + GOMP_taskgroup_reduction_unregister; |
| + GOMP_task_reduction_remap; |
| + GOMP_taskwait_depend; |
| + GOMP_teams_reg; |
| + GOMP_workshare_task_reduction_unregister; |
| +} GOMP_4.5; |
| + |
| OACC_2.0 { |
| global: |
| acc_get_num_devices; |
| @@ -386,6 +430,52 @@ OACC_2.0.1 { |
| acc_pcreate; |
| } OACC_2.0; |
| |
| +OACC_2.5 { |
| + global: |
| + acc_copyin_async; |
| + acc_copyin_async_32_h_; |
| + acc_copyin_async_64_h_; |
| + acc_copyin_async_array_h_; |
| + acc_copyout_async; |
| + acc_copyout_async_32_h_; |
| + acc_copyout_async_64_h_; |
| + acc_copyout_async_array_h_; |
| + acc_copyout_finalize; |
| + acc_copyout_finalize_32_h_; |
| + acc_copyout_finalize_64_h_; |
| + acc_copyout_finalize_array_h_; |
| + acc_copyout_finalize_async; |
| + acc_copyout_finalize_async_32_h_; |
| + acc_copyout_finalize_async_64_h_; |
| + acc_copyout_finalize_async_array_h_; |
| + acc_create_async; |
| + acc_create_async_32_h_; |
| + acc_create_async_64_h_; |
| + acc_create_async_array_h_; |
| + acc_delete_async; |
| + acc_delete_async_32_h_; |
| + acc_delete_async_64_h_; |
| + acc_delete_async_array_h_; |
| + acc_delete_finalize; |
| + acc_delete_finalize_32_h_; |
| + acc_delete_finalize_64_h_; |
| + acc_delete_finalize_array_h_; |
| + acc_delete_finalize_async; |
| + acc_delete_finalize_async_32_h_; |
| + acc_delete_finalize_async_64_h_; |
| + acc_delete_finalize_async_array_h_; |
| + acc_memcpy_from_device_async; |
| + acc_memcpy_to_device_async; |
| + acc_update_device_async; |
| + acc_update_device_async_32_h_; |
| + acc_update_device_async_64_h_; |
| + acc_update_device_async_array_h_; |
| + acc_update_self_async; |
| + acc_update_self_async_32_h_; |
| + acc_update_self_async_64_h_; |
| + acc_update_self_async_array_h_; |
| +} OACC_2.0.1; |
| + |
| GOACC_2.0 { |
| global: |
| GOACC_data_end; |
| @@ -420,3 +510,8 @@ GOMP_PLUGIN_1.1 { |
| global: |
| GOMP_PLUGIN_target_task_completion; |
| } GOMP_PLUGIN_1.0; |
| + |
| +GOMP_PLUGIN_1.2 { |
| + global: |
| + GOMP_PLUGIN_acc_default_dim; |
| +} GOMP_PLUGIN_1.1; |
| |
| |
| @@ -34,7 +34,7 @@ |
| int |
| acc_async_test (int async) |
| { |
| - if (async < acc_async_sync) |
| + if (!async_valid_p (async)) |
| gomp_fatal ("invalid async argument: %d", async); |
| |
| struct goacc_thread *thr = goacc_thread (); |
| @@ -59,7 +59,7 @@ acc_async_test_all (void) |
| void |
| acc_wait (int async) |
| { |
| - if (async < acc_async_sync) |
| + if (!async_valid_p (async)) |
| gomp_fatal ("invalid async argument: %d", async); |
| |
| struct goacc_thread *thr = goacc_thread (); |
| @@ -117,7 +117,7 @@ acc_async_wait_all (void) |
| void |
| acc_wait_all_async (int async) |
| { |
| - if (async < acc_async_sync) |
| + if (!async_valid_p (async)) |
| gomp_fatal ("invalid async argument: %d", async); |
| |
| struct goacc_thread *thr = goacc_thread (); |
| |
| |
| @@ -27,8 +27,12 @@ |
| |
| #include <limits.h> |
| #include <stdlib.h> |
| +#include <string.h> |
| #include "libgomp.h" |
| |
| +ialias (GOMP_loop_ull_runtime_next) |
| +ialias_redirect (GOMP_taskgroup_reduction_register) |
| + |
| typedef unsigned long long gomp_ull; |
| |
| /* Initialize the given work share construct from the given arguments. */ |
| @@ -104,7 +108,7 @@ gomp_loop_ull_static_start (bool up, gom |
| struct gomp_thread *thr = gomp_thread (); |
| |
| thr->ts.static_trip = 0; |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, |
| GFS_STATIC, chunk_size); |
| @@ -122,7 +126,7 @@ gomp_loop_ull_dynamic_start (bool up, go |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, |
| GFS_DYNAMIC, chunk_size); |
| @@ -148,7 +152,7 @@ gomp_loop_ull_guided_start (bool up, gom |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, |
| GFS_GUIDED, chunk_size); |
| @@ -171,7 +175,7 @@ GOMP_loop_ull_runtime_start (bool up, go |
| gomp_ull incr, gomp_ull *istart, gomp_ull *iend) |
| { |
| struct gomp_task_icv *icv = gomp_icv (false); |
| - switch (icv->run_sched_var) |
| + switch (icv->run_sched_var & ~GFS_MONOTONIC) |
| { |
| case GFS_STATIC: |
| return gomp_loop_ull_static_start (up, start, end, incr, |
| @@ -195,6 +199,99 @@ GOMP_loop_ull_runtime_start (bool up, go |
| } |
| } |
| |
| +static long |
| +gomp_adjust_sched (long sched, gomp_ull *chunk_size) |
| +{ /* Reduce SCHED to GFS_STATIC, GFS_DYNAMIC or GFS_GUIDED; may overwrite *CHUNK_SIZE. */ |
| + sched &= ~GFS_MONOTONIC; /* Drop the monotonic modifier bit before dispatching. */ |
| + switch (sched) |
| + { |
| + case GFS_STATIC: |
| + case GFS_DYNAMIC: |
| + case GFS_GUIDED: |
| + return sched; /* Explicit schedule kinds pass through; *CHUNK_SIZE is left alone. */ |
| + /* GFS_RUNTIME is used for runtime schedule without monotonic |
| + or nonmonotonic modifiers on the clause. |
| + GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic |
| + modifier. */ |
| + case GFS_RUNTIME: |
| + /* GFS_AUTO is used for runtime schedule with nonmonotonic |
| + modifier. */ |
| + case GFS_AUTO: |
| + { |
| + struct gomp_task_icv *icv = gomp_icv (false); |
| + sched = icv->run_sched_var & ~GFS_MONOTONIC; /* Resolve against the run-sched ICV, again without the modifier bit. */ |
| + switch (sched) |
| + { |
| + case GFS_STATIC: |
| + case GFS_DYNAMIC: |
| + case GFS_GUIDED: |
| + *chunk_size = icv->run_sched_chunk_size; /* The ICV chunk size replaces the caller's value. */ |
| + break; |
| + case GFS_AUTO: |
| + sched = GFS_STATIC; /* An ICV of GFS_AUTO is mapped to static with chunk size 0. */ |
| + *chunk_size = 0; |
| + break; |
| + default: |
| + abort (); /* Any other ICV value is treated as a fatal inconsistency. */ |
| + } |
| + return sched; |
| + } |
| + default: |
| + abort (); /* Unknown SCHED argument. */ |
| + } |
| +} |
| + |
| +bool |
| +GOMP_loop_ull_start (bool up, gomp_ull start, gomp_ull end, |
| + gomp_ull incr, long sched, gomp_ull chunk_size, |
| + gomp_ull *istart, gomp_ull *iend, |
| + uintptr_t *reductions, void **mem) |
| +{ /* Loop start taking SCHED and CHUNK_SIZE as arguments; REDUCTIONS and MEM may each be NULL. */ |
| + struct gomp_thread *thr = gomp_thread (); |
| + |
| + thr->ts.static_trip = 0; |
| + if (reductions) |
| + gomp_workshare_taskgroup_start (); |
| + if (gomp_work_share_start (0)) /* This branch initializes the work share; the else branch only reads from it. */ |
| + { |
| + sched = gomp_adjust_sched (sched, &chunk_size); /* May replace both SCHED and CHUNK_SIZE. */ |
| + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, |
| + sched, chunk_size); |
| + if (reductions) |
| + { |
| + GOMP_taskgroup_reduction_register (reductions); |
| + thr->task->taskgroup->workshare = true; |
| + thr->ts.work_share->task_reductions = reductions; /* Read back by the other threads in the else branch. */ |
| + } |
| + if (mem) |
| + { |
| + uintptr_t size = (uintptr_t) *mem; /* On entry *MEM carries a byte count, not an address. */ |
| + if (size > (sizeof (struct gomp_work_share) |
| + - offsetof (struct gomp_work_share, |
| + inline_ordered_team_ids))) /* Larger than the span from inline_ordered_team_ids to the end of the struct. */ |
| + thr->ts.work_share->ordered_team_ids |
| + = gomp_malloc_cleared (size); |
| + else |
| + memset (thr->ts.work_share->ordered_team_ids, '\0', size); /* Otherwise zero SIZE bytes of the existing buffer. */ |
| + *mem = (void *) thr->ts.work_share->ordered_team_ids; /* On return *MEM is the buffer address. */ |
| + } |
| + gomp_work_share_init_done (); |
| + } |
| + else |
| + { |
| + if (reductions) |
| + { |
| + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; |
| + gomp_workshare_task_reduction_register (reductions, |
| + first_reductions); |
| + } |
| + if (mem) |
| + *mem = (void *) thr->ts.work_share->ordered_team_ids; /* Hand out the address stored in the work share. */ |
| + } |
| + |
| + return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend); /* The result comes from the runtime-schedule next routine. */ |
| +} |
| + |
| /* The *_ordered_*_start routines are similar. The only difference is that |
| this work-share construct is initialized to expect an ORDERED section. */ |
| |
| @@ -206,7 +303,7 @@ gomp_loop_ull_ordered_static_start (bool |
| struct gomp_thread *thr = gomp_thread (); |
| |
| thr->ts.static_trip = 0; |
| - if (gomp_work_share_start (true)) |
| + if (gomp_work_share_start (1)) |
| { |
| gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, |
| GFS_STATIC, chunk_size); |
| @@ -225,7 +322,7 @@ gomp_loop_ull_ordered_dynamic_start (boo |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (true)) |
| + if (gomp_work_share_start (1)) |
| { |
| gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, |
| GFS_DYNAMIC, chunk_size); |
| @@ -251,7 +348,7 @@ gomp_loop_ull_ordered_guided_start (bool |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (true)) |
| + if (gomp_work_share_start (1)) |
| { |
| gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, |
| GFS_GUIDED, chunk_size); |
| @@ -275,7 +372,7 @@ GOMP_loop_ull_ordered_runtime_start (boo |
| gomp_ull *iend) |
| { |
| struct gomp_task_icv *icv = gomp_icv (false); |
| - switch (icv->run_sched_var) |
| + switch (icv->run_sched_var & ~GFS_MONOTONIC) |
| { |
| case GFS_STATIC: |
| return gomp_loop_ull_ordered_static_start (up, start, end, incr, |
| @@ -299,6 +396,82 @@ GOMP_loop_ull_ordered_runtime_start (boo |
| } |
| } |
| |
| +bool |
| +GOMP_loop_ull_ordered_start (bool up, gomp_ull start, gomp_ull end, |
| + gomp_ull incr, long sched, gomp_ull chunk_size, |
| + gomp_ull *istart, gomp_ull *iend, |
| + uintptr_t *reductions, void **mem) |
| +{ /* Ordered variant of the loop start taking SCHED as an argument; REDUCTIONS and MEM may each be NULL. */ |
| + struct gomp_thread *thr = gomp_thread (); |
| + size_t ordered = 1; |
| + bool ret; |
| + |
| + thr->ts.static_trip = 0; |
| + if (reductions) |
| + gomp_workshare_taskgroup_start (); |
| + if (mem) |
| + ordered += (uintptr_t) *mem; /* On entry *MEM is a size; it is added on top of the base value 1. */ |
| + if (gomp_work_share_start (ordered)) /* This branch initializes the work share; the else branch only reads from it. */ |
| + { |
| + sched = gomp_adjust_sched (sched, &chunk_size); |
| + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, |
| + sched, chunk_size); |
| + if (reductions) |
| + { |
| + GOMP_taskgroup_reduction_register (reductions); |
| + thr->task->taskgroup->workshare = true; |
| + thr->ts.work_share->task_reductions = reductions; |
| + } |
| + if (sched == GFS_STATIC) |
| + gomp_ordered_static_init (); |
| + else |
| + gomp_mutex_lock (&thr->ts.work_share->lock); /* Released by the unlock at the end of this function. */ |
| + gomp_work_share_init_done (); |
| + } |
| + else |
| + { |
| + if (reductions) |
| + { |
| + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; |
| + gomp_workshare_task_reduction_register (reductions, |
| + first_reductions); |
| + } |
| + sched = thr->ts.work_share->sched; /* Use the schedule recorded in the work share, not the argument. */ |
| + if (sched != GFS_STATIC) |
| + gomp_mutex_lock (&thr->ts.work_share->lock); |
| + } |
| + |
| + if (mem) |
| + { |
| + uintptr_t p |
| + = (uintptr_t) (thr->ts.work_share->ordered_team_ids |
| + + (thr->ts.team ? thr->ts.team->nthreads : 1)); /* Step past one ordered_team_ids element per thread (1 without a team). */ |
| + p += __alignof__ (long long) - 1; /* Round up to long long alignment. */ |
| + p &= ~(__alignof__ (long long) - 1); |
| + *mem = (void *) p; /* On return *MEM is the aligned address after the team ids. */ |
| + } |
| + |
| + switch (sched) |
| + { |
| + case GFS_STATIC: |
| + case GFS_AUTO: /* NOTE(review): GFS_AUTO here would return with the lock held; gomp_adjust_sched never returns it. */ |
| + return !gomp_iter_ull_static_next (istart, iend); /* The static path never took the lock, so it returns directly. */ |
| + case GFS_DYNAMIC: |
| + ret = gomp_iter_ull_dynamic_next_locked (istart, iend); |
| + break; |
| + case GFS_GUIDED: |
| + ret = gomp_iter_ull_guided_next_locked (istart, iend); |
| + break; |
| + default: |
| + abort (); |
| + } |
| + |
| + if (ret) |
| + gomp_ordered_first (); /* Called while the work-share lock is still held. */ |
| + gomp_mutex_unlock (&thr->ts.work_share->lock); |
| + return ret; |
| +} |
| + |
| /* The *_doacross_*_start routines are similar. The only difference is that |
| this work-share construct is initialized to expect an ORDERED(N) - DOACROSS |
| section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 |
| @@ -313,11 +486,11 @@ gomp_loop_ull_doacross_static_start (uns |
| struct gomp_thread *thr = gomp_thread (); |
| |
| thr->ts.static_trip = 0; |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, |
| GFS_STATIC, chunk_size); |
| - gomp_doacross_ull_init (ncounts, counts, chunk_size); |
| + gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); |
| gomp_work_share_init_done (); |
| } |
| |
| @@ -332,11 +505,11 @@ gomp_loop_ull_doacross_dynamic_start (un |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, |
| GFS_DYNAMIC, chunk_size); |
| - gomp_doacross_ull_init (ncounts, counts, chunk_size); |
| + gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); |
| gomp_work_share_init_done (); |
| } |
| |
| @@ -359,11 +532,11 @@ gomp_loop_ull_doacross_guided_start (uns |
| struct gomp_thread *thr = gomp_thread (); |
| bool ret; |
| |
| - if (gomp_work_share_start (false)) |
| + if (gomp_work_share_start (0)) |
| { |
| gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, |
| GFS_GUIDED, chunk_size); |
| - gomp_doacross_ull_init (ncounts, counts, chunk_size); |
| + gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); |
| gomp_work_share_init_done (); |
| } |
| |
| @@ -383,7 +556,7 @@ GOMP_loop_ull_doacross_runtime_start (un |
| gomp_ull *istart, gomp_ull *iend) |
| { |
| struct gomp_task_icv *icv = gomp_icv (false); |
| - switch (icv->run_sched_var) |
| + switch (icv->run_sched_var & ~GFS_MONOTONIC) |
| { |
| case GFS_STATIC: |
| return gomp_loop_ull_doacross_static_start (ncounts, counts, |
| @@ -407,6 +580,51 @@ GOMP_loop_ull_doacross_runtime_start (un |
| } |
| } |
| |
| +bool |
| +GOMP_loop_ull_doacross_start (unsigned ncounts, gomp_ull *counts, |
| + long sched, gomp_ull chunk_size, |
| + gomp_ull *istart, gomp_ull *iend, |
| + uintptr_t *reductions, void **mem) |
| +{ /* Doacross variant of the loop start taking SCHED as an argument; REDUCTIONS and MEM may each be NULL. */ |
| + struct gomp_thread *thr = gomp_thread (); |
| + |
| + thr->ts.static_trip = 0; |
| + if (reductions) |
| + gomp_workshare_taskgroup_start (); |
| + if (gomp_work_share_start (0)) /* This branch initializes the work share; the else branch only reads from it. */ |
| + { |
| + size_t extra = 0; |
| + if (mem) |
| + extra = (uintptr_t) *mem; /* On entry *MEM carries a size that is forwarded to gomp_doacross_ull_init. */ |
| + sched = gomp_adjust_sched (sched, &chunk_size); |
| + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, |
| + sched, chunk_size); /* The loop always runs upward from 0 to COUNTS[0] in steps of 1. */ |
| + gomp_doacross_ull_init (ncounts, counts, chunk_size, extra); |
| + if (reductions) |
| + { |
| + GOMP_taskgroup_reduction_register (reductions); |
| + thr->task->taskgroup->workshare = true; |
| + thr->ts.work_share->task_reductions = reductions; |
| + } |
| + gomp_work_share_init_done (); |
| + } |
| + else |
| + { |
| + if (reductions) |
| + { |
| + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; |
| + gomp_workshare_task_reduction_register (reductions, |
| + first_reductions); |
| + } |
| + sched = thr->ts.work_share->sched; /* NOTE(review): SCHED is not read again in this function. */ |
| + } |
| + |
| + if (mem) |
| + *mem = thr->ts.work_share->doacross->extra; /* On return *MEM is the doacross structure's extra field. */ |
| + |
| + return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend); /* The result comes from the runtime-schedule next routine. */ |
| +} |
| + |
| /* The *_next routines are called when the thread completes processing of |
| the iteration block currently assigned to it. If the work-share |
| construct is bound directly to a parallel construct, then the iteration |
| @@ -570,6 +788,10 @@ extern __typeof(gomp_loop_ull_dynamic_st |
| __attribute__((alias ("gomp_loop_ull_dynamic_start"))); |
| extern __typeof(gomp_loop_ull_guided_start) GOMP_loop_ull_nonmonotonic_guided_start |
| __attribute__((alias ("gomp_loop_ull_guided_start"))); |
| +extern __typeof(GOMP_loop_ull_runtime_start) GOMP_loop_ull_nonmonotonic_runtime_start |
| + __attribute__((alias ("GOMP_loop_ull_runtime_start"))); |
| +extern __typeof(GOMP_loop_ull_runtime_start) GOMP_loop_ull_maybe_nonmonotonic_runtime_start |
| + __attribute__((alias ("GOMP_loop_ull_runtime_start"))); |
| |
| extern __typeof(gomp_loop_ull_ordered_static_start) GOMP_loop_ull_ordered_static_start |
| __attribute__((alias ("gomp_loop_ull_ordered_static_start"))); |
| @@ -595,6 +817,10 @@ extern __typeof(gomp_loop_ull_dynamic_ne |
| __attribute__((alias ("gomp_loop_ull_dynamic_next"))); |
| extern __typeof(gomp_loop_ull_guided_next) GOMP_loop_ull_nonmonotonic_guided_next |
| __attribute__((alias ("gomp_loop_ull_guided_next"))); |
| +extern __typeof(GOMP_loop_ull_runtime_next) GOMP_loop_ull_nonmonotonic_runtime_next |
| + __attribute__((alias ("GOMP_loop_ull_runtime_next"))); |
| +extern __typeof(GOMP_loop_ull_runtime_next) GOMP_loop_ull_maybe_nonmonotonic_runtime_next |
| + __attribute__((alias ("GOMP_loop_ull_runtime_next"))); |
| |
| extern __typeof(gomp_loop_ull_ordered_static_next) GOMP_loop_ull_ordered_static_next |
| __attribute__((alias ("gomp_loop_ull_ordered_static_next"))); |
| @@ -650,6 +876,23 @@ GOMP_loop_ull_nonmonotonic_guided_start |
| } |
| |
| bool |
| +GOMP_loop_ull_nonmonotonic_runtime_start (bool up, gomp_ull start, |
| + gomp_ull end, gomp_ull incr, |
| + gomp_ull *istart, gomp_ull *iend) |
| +{ |
| + return GOMP_loop_ull_runtime_start (up, start, end, incr, istart, iend); /* Plain forwarder: same arguments, same result. */ |
| +} |
| + |
| +bool |
| +GOMP_loop_ull_maybe_nonmonotonic_runtime_start (bool up, gomp_ull start, |
| + gomp_ull end, gomp_ull incr, |
| + gomp_ull *istart, |
| + gomp_ull *iend) |
| +{ |
| + return GOMP_loop_ull_runtime_start (up, start, end, incr, istart, iend); /* Plain forwarder: same arguments, same result. */ |
| +} |
| + |
| +bool |
| GOMP_loop_ull_ordered_static_start (bool up, gomp_ull start, gomp_ull end, |
| gomp_ull incr, gomp_ull chunk_size, |
| gomp_ull *istart, gomp_ull *iend) |
| @@ -734,6 +977,19 @@ GOMP_loop_ull_nonmonotonic_guided_next ( |
| } |
| |
| bool |
| +GOMP_loop_ull_nonmonotonic_runtime_next (gomp_ull *istart, gomp_ull *iend) |
| +{ |
| + return GOMP_loop_ull_runtime_next (istart, iend); /* Plain forwarder: same arguments, same result. */ |
| +} |
| + |
| +bool |
| +GOMP_loop_ull_maybe_nonmonotonic_runtime_next (gomp_ull *istart, |
| + gomp_ull *iend) |
| +{ |
| + return GOMP_loop_ull_runtime_next (istart, iend); /* Plain forwarder: same arguments, same result. */ |
| +} |
| + |
| +bool |
| GOMP_loop_ull_ordered_static_next (gomp_ull *istart, gomp_ull *iend) |
| { |
| return gomp_loop_ull_ordered_static_next (istart, iend); |
| |
| |
| @@ -99,6 +99,28 @@ void goacc_restore_bind (void); |
| void goacc_lazy_initialize (void); |
| void goacc_host_init (void); |
| |
| +static inline bool |
| +async_valid_stream_id_p (int async) |
| +{ |
| + return async >= 0; /* Any non-negative value counts as a stream id. */ |
| +} |
| + |
| +static inline bool |
| +async_valid_p (int async) |
| +{ |
| + return (async == acc_async_noval || async == acc_async_sync /* The two named constants are valid ... */ |
| + || async_valid_stream_id_p (async)); /* ... and so is every non-negative stream id. */ |
| +} |
| + |
| +static inline bool |
| +async_synchronous_p (int async) |
| +{ |
| + if (!async_valid_p (async)) /* An invalid ASYNC is treated as synchronous rather than rejected. */ |
| + return true; |
| + |
| + return async == acc_async_sync; /* Among valid values only acc_async_sync is synchronous. */ |
| +} |
| + |
| #ifdef HAVE_ATTRIBUTE_VISIBILITY |
| # pragma GCC visibility pop |
| #endif |
| |
| |
| @@ -223,6 +223,7 @@ mkdir_p = @mkdir_p@ |
| multi_basedir = @multi_basedir@ |
| offload_additional_lib_paths = @offload_additional_lib_paths@ |
| offload_additional_options = @offload_additional_options@ |
| +offload_plugins = @offload_plugins@ |
| offload_targets = @offload_targets@ |
| oldincludedir = @oldincludedir@ |
| pdfdir = @pdfdir@ |
| |
| |
| @@ -166,21 +166,72 @@ gomp_task_handle_depend (struct gomp_tas |
| void **depend) |
| { |
| size_t ndepend = (uintptr_t) depend[0]; |
| - size_t nout = (uintptr_t) depend[1]; |
| size_t i; |
| hash_entry_type ent; |
| |
| + if (ndepend) |
| + { |
| + /* depend[0] is total # */ |
| + size_t nout = (uintptr_t) depend[1]; /* # of out: and inout: */ |
| + /* ndepend - nout is # of in: */ |
| + for (i = 0; i < ndepend; i++) |
| + { |
| + task->depend[i].addr = depend[2 + i]; |
| + task->depend[i].is_in = i >= nout; |
| + } |
| + } |
| + else |
| + { |
| + ndepend = (uintptr_t) depend[1]; /* total # */ |
| + size_t nout = (uintptr_t) depend[2]; /* # of out: and inout: */ |
| + size_t nmutexinoutset = (uintptr_t) depend[3]; /* # of mutexinoutset: */ |
| + /* For now we treat mutexinoutset like out, which is compliant, but |
| + inefficient. */ |
| + size_t nin = (uintptr_t) depend[4]; /* # of in: */ |
| + /* ndepend - nout - nmutexinoutset - nin is # of depobjs */ |
| + size_t normal = nout + nmutexinoutset + nin; |
| + size_t n = 0; |
| + for (i = normal; i < ndepend; i++) |
| + { |
| + void **d = (void **) (uintptr_t) depend[5 + i]; |
| + switch ((uintptr_t) d[1]) |
| + { |
| + case GOMP_DEPEND_OUT: |
| + case GOMP_DEPEND_INOUT: |
| + case GOMP_DEPEND_MUTEXINOUTSET: |
| + break; |
| + case GOMP_DEPEND_IN: |
| + continue; |
| + default: |
| + gomp_fatal ("unknown omp_depend_t dependence type %d", |
| + (int) (uintptr_t) d[1]); |
| + } |
| + task->depend[n].addr = d[0]; |
| + task->depend[n++].is_in = 0; |
| + } |
| + for (i = 0; i < normal; i++) |
| + { |
| + task->depend[n].addr = depend[5 + i]; |
| + task->depend[n++].is_in = i >= nout + nmutexinoutset; |
| + } |
| + for (i = normal; i < ndepend; i++) |
| + { |
| + void **d = (void **) (uintptr_t) depend[5 + i]; |
| + if ((uintptr_t) d[1] != GOMP_DEPEND_IN) |
| + continue; |
| + task->depend[n].addr = d[0]; |
| + task->depend[n++].is_in = 1; |
| + } |
| + } |
| task->depend_count = ndepend; |
| task->num_dependees = 0; |
| if (parent->depend_hash == NULL) |
| parent->depend_hash = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12); |
| for (i = 0; i < ndepend; i++) |
| { |
| - task->depend[i].addr = depend[2 + i]; |
| task->depend[i].next = NULL; |
| task->depend[i].prev = NULL; |
| task->depend[i].task = task; |
| - task->depend[i].is_in = i >= nout; |
| task->depend[i].redundant = false; |
| task->depend[i].redundant_out = false; |
| |
| @@ -205,7 +256,7 @@ gomp_task_handle_depend (struct gomp_tas |
| last = ent; |
| |
| /* depend(in:...) doesn't depend on earlier depend(in:...). */ |
| - if (i >= nout && ent->is_in) |
| + if (task->depend[i].is_in && ent->is_in) |
| continue; |
| |
| if (!ent->is_in) |
| @@ -280,9 +331,18 @@ gomp_task_handle_depend (struct gomp_tas |
| then the task may be executed by any member of the team. |
| |
| DEPEND is an array containing: |
| + if depend[0] is non-zero, then: |
| depend[0]: number of depend elements. |
| - depend[1]: number of depend elements of type "out". |
| - depend[2..N+1]: address of [1..N]th depend element. */ |
| + depend[1]: number of depend elements of type "out/inout". |
| + depend[2..N+1]: address of [1..N]th depend element. |
| + otherwise, when depend[0] is zero, then: |
| + depend[1]: number of depend elements. |
| + depend[2]: number of depend elements of type "out/inout". |
| + depend[3]: number of depend elements of type "mutexinoutset". |
| + depend[4]: number of depend elements of type "in". |
| + depend[5..4+depend[2]+depend[3]+depend[4]]: address of depend elements |
| + depend[5+depend[2]+depend[3]+depend[4]..4+depend[1]]: address of |
| + omp_depend_t objects. */ |
| |
| void |
| GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), |
| @@ -303,10 +363,20 @@ GOMP_task (void (*fn) (void *), void *da |
| #endif |
| |
| /* If parallel or taskgroup has been cancelled, don't start new tasks. */ |
| - if (team |
| - && (gomp_team_barrier_cancelled (&team->barrier) |
| - || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) |
| - return; |
| + if (__builtin_expect (gomp_cancel_var, 0) && team) |
| + { |
| + if (gomp_team_barrier_cancelled (&team->barrier)) |
| + return; |
| + if (thr->task->taskgroup) |
| + { |
| + if (thr->task->taskgroup->cancelled) |
| + return; |
| + if (thr->task->taskgroup->workshare |
| + && thr->task->taskgroup->prev |
| + && thr->task->taskgroup->prev->cancelled) |
| + return; |
| + } |
| + } |
| |
| if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0) |
| priority = 0; |
| @@ -377,7 +447,7 @@ GOMP_task (void (*fn) (void *), void *da |
| size_t depend_size = 0; |
| |
| if (flags & GOMP_TASK_FLAG_DEPEND) |
| - depend_size = ((uintptr_t) depend[0] |
| + depend_size = ((uintptr_t) (depend[0] ? depend[0] : depend[1]) |
| * sizeof (struct gomp_task_depend_entry)); |
| task = gomp_malloc (sizeof (*task) + depend_size |
| + arg_size + arg_align - 1); |
| @@ -404,14 +474,26 @@ GOMP_task (void (*fn) (void *), void *da |
| gomp_mutex_lock (&team->task_lock); |
| /* If parallel or taskgroup has been cancelled, don't start new |
| tasks. */ |
| - if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier) |
| - || (taskgroup && taskgroup->cancelled)) |
| - && !task->copy_ctors_done, 0)) |
| + if (__builtin_expect (gomp_cancel_var, 0) |
| + && !task->copy_ctors_done) |
| { |
| - gomp_mutex_unlock (&team->task_lock); |
| - gomp_finish_task (task); |
| - free (task); |
| - return; |
| + if (gomp_team_barrier_cancelled (&team->barrier)) |
| + { |
| + do_cancel: |
| + gomp_mutex_unlock (&team->task_lock); |
| + gomp_finish_task (task); |
| + free (task); |
| + return; |
| + } |
| + if (taskgroup) |
| + { |
| + if (taskgroup->cancelled) |
| + goto do_cancel; |
| + if (taskgroup->workshare |
| + && taskgroup->prev |
| + && taskgroup->prev->cancelled) |
| + goto do_cancel; |
| + } |
| } |
| if (taskgroup) |
| taskgroup->num_children++; |
| @@ -463,6 +545,7 @@ GOMP_task (void (*fn) (void *), void *da |
| |
| ialias (GOMP_taskgroup_start) |
| ialias (GOMP_taskgroup_end) |
| +ialias (GOMP_taskgroup_reduction_register) |
| |
| #define TYPE long |
| #define UTYPE unsigned long |
| @@ -601,10 +684,20 @@ gomp_create_target_task (struct gomp_dev |
| struct gomp_team *team = thr->ts.team; |
| |
| /* If parallel or taskgroup has been cancelled, don't start new tasks. */ |
| - if (team |
| - && (gomp_team_barrier_cancelled (&team->barrier) |
| - || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) |
| - return true; |
| + if (__builtin_expect (gomp_cancel_var, 0) && team) |
| + { |
| + if (gomp_team_barrier_cancelled (&team->barrier)) |
| + return true; |
| + if (thr->task->taskgroup) |
| + { |
| + if (thr->task->taskgroup->cancelled) |
| + return true; |
| + if (thr->task->taskgroup->workshare |
| + && thr->task->taskgroup->prev |
| + && thr->task->taskgroup->prev->cancelled) |
| + return true; |
| + } |
| + } |
| |
| struct gomp_target_task *ttask; |
| struct gomp_task *task; |
| @@ -617,7 +710,7 @@ gomp_create_target_task (struct gomp_dev |
| |
| if (depend != NULL) |
| { |
| - depend_cnt = (uintptr_t) depend[0]; |
| + depend_cnt = (uintptr_t) (depend[0] ? depend[0] : depend[1]); |
| depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry); |
| } |
| if (fn) |
| @@ -687,13 +780,25 @@ gomp_create_target_task (struct gomp_dev |
| task->final_task = 0; |
| gomp_mutex_lock (&team->task_lock); |
| /* If parallel or taskgroup has been cancelled, don't start new tasks. */ |
| - if (__builtin_expect (gomp_team_barrier_cancelled (&team->barrier) |
| - || (taskgroup && taskgroup->cancelled), 0)) |
| + if (__builtin_expect (gomp_cancel_var, 0)) |
| { |
| - gomp_mutex_unlock (&team->task_lock); |
| - gomp_finish_task (task); |
| - free (task); |
| - return true; |
| + if (gomp_team_barrier_cancelled (&team->barrier)) |
| + { |
| + do_cancel: |
| + gomp_mutex_unlock (&team->task_lock); |
| + gomp_finish_task (task); |
| + free (task); |
| + return true; |
| + } |
| + if (taskgroup) |
| + { |
| + if (taskgroup->cancelled) |
| + goto do_cancel; |
| + if (taskgroup->workshare |
| + && taskgroup->prev |
| + && taskgroup->prev->cancelled) |
| + goto do_cancel; |
| + } |
| } |
| if (depend_size) |
| { |
| @@ -986,10 +1091,21 @@ gomp_task_run_pre (struct gomp_task *chi |
| |
| if (--team->task_queued_count == 0) |
| gomp_team_barrier_clear_task_pending (&team->barrier); |
| - if ((gomp_team_barrier_cancelled (&team->barrier) |
| - || (taskgroup && taskgroup->cancelled)) |
| + if (__builtin_expect (gomp_cancel_var, 0) |
| && !child_task->copy_ctors_done) |
| - return true; |
| + { |
| + if (gomp_team_barrier_cancelled (&team->barrier)) |
| + return true; |
| + if (taskgroup) |
| + { |
| + if (taskgroup->cancelled) |
| + return true; |
| + if (taskgroup->workshare |
| + && taskgroup->prev |
| + && taskgroup->prev->cancelled) |
| + return true; |
| + } |
| + } |
| return false; |
| } |
| |
| @@ -1456,6 +1572,35 @@ GOMP_taskwait (void) |
| } |
| } |
| |
| +/* Called when encountering a taskwait directive with depend clause(s). |
| + Wait as if it were a mergeable included task construct with an empty body. DEPEND is as in GOMP_task. */ |
| + |
| +void |
| +GOMP_taskwait_depend (void **depend) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + struct gomp_team *team = thr->ts.team; |
| + |
| + /* If parallel or taskgroup has been cancelled, return early. */ |
| + if (__builtin_expect (gomp_cancel_var, 0) && team) /* Cancellation is only examined when gomp_cancel_var is set and a team exists. */ |
| + { |
| + if (gomp_team_barrier_cancelled (&team->barrier)) |
| + return; |
| + if (thr->task->taskgroup) |
| + { |
| + if (thr->task->taskgroup->cancelled) |
| + return; |
| + if (thr->task->taskgroup->workshare /* For a workshare taskgroup the enclosing (prev) taskgroup's flag is checked too. */ |
| + && thr->task->taskgroup->prev |
| + && thr->task->taskgroup->prev->cancelled) |
| + return; |
| + } |
| + } |
| + |
| + if (thr->task && thr->task->depend_hash) /* NOTE(review): thr->task is dereferenced above without this NULL test; presumably non-NULL whenever TEAM is set. */ |
| + gomp_task_maybe_wait_for_dependencies (depend); /* Without a depend hash on the current task there is nothing to wait for. */ |
| +} |
| + |
| /* An undeferred task is about to run. Wait for all tasks that this |
| undeferred task depends on. |
| |
| @@ -1464,7 +1609,7 @@ GOMP_taskwait (void) |
| the scheduling queues. Then we iterate through these imminently |
| ready tasks (and possibly other high priority tasks), and run them. |
| If we run out of ready dependencies to execute, we either wait for |
| - the reamining dependencies to finish, or wait for them to get |
| + the remaining dependencies to finish, or wait for them to get |
| scheduled so we can run them. |
| |
| DEPEND is as in GOMP_task. */ |
| @@ -1477,21 +1622,50 @@ gomp_task_maybe_wait_for_dependencies (v |
| struct gomp_team *team = thr->ts.team; |
| struct gomp_task_depend_entry elem, *ent = NULL; |
| struct gomp_taskwait taskwait; |
| - size_t ndepend = (uintptr_t) depend[0]; |
| + size_t orig_ndepend = (uintptr_t) depend[0]; |
| size_t nout = (uintptr_t) depend[1]; |
| + size_t ndepend = orig_ndepend; |
| + size_t normal = ndepend; |
| + size_t n = 2; |
| size_t i; |
| size_t num_awaited = 0; |
| struct gomp_task *child_task = NULL; |
| struct gomp_task *to_free = NULL; |
| int do_wake = 0; |
| |
| + if (ndepend == 0) |
| + { |
| + ndepend = nout; |
| + nout = (uintptr_t) depend[2] + (uintptr_t) depend[3]; |
| + normal = nout + (uintptr_t) depend[4]; |
| + n = 5; |
| + } |
| gomp_mutex_lock (&team->task_lock); |
| for (i = 0; i < ndepend; i++) |
| { |
| - elem.addr = depend[i + 2]; |
| + elem.addr = depend[i + n]; |
| + elem.is_in = i >= nout; |
| + if (__builtin_expect (i >= normal, 0)) |
| + { |
| + void **d = (void **) elem.addr; |
| + switch ((uintptr_t) d[1]) |
| + { |
| + case GOMP_DEPEND_IN: |
| + break; |
| + case GOMP_DEPEND_OUT: |
| + case GOMP_DEPEND_INOUT: |
| + case GOMP_DEPEND_MUTEXINOUTSET: |
| + elem.is_in = 0; |
| + break; |
| + default: |
| + gomp_fatal ("unknown omp_depend_t dependence type %d", |
| + (int) (uintptr_t) d[1]); |
| + } |
| + elem.addr = d[0]; |
| + } |
| ent = htab_find (task->depend_hash, &elem); |
| for (; ent; ent = ent->next) |
| - if (i >= nout && ent->is_in) |
| + if (elem.is_in && ent->is_in) |
| continue; |
| else |
| { |
| @@ -1654,13 +1828,28 @@ GOMP_taskyield (void) |
| /* Nothing at the moment. */ |
| } |
| |
| +static inline struct gomp_taskgroup * |
| +gomp_taskgroup_init (struct gomp_taskgroup *prev) |
| +{ |
| + struct gomp_taskgroup *taskgroup |
| + = gomp_malloc (sizeof (struct gomp_taskgroup)); |
| + taskgroup->prev = prev; |
| + priority_queue_init (&taskgroup->taskgroup_queue); |
| + taskgroup->reductions = prev ? prev->reductions : NULL; |
| + taskgroup->in_taskgroup_wait = false; |
| + taskgroup->cancelled = false; |
| + taskgroup->workshare = false; |
| + taskgroup->num_children = 0; |
| + gomp_sem_init (&taskgroup->taskgroup_sem, 0); |
| + return taskgroup; |
| +} |
| + |
| void |
| GOMP_taskgroup_start (void) |
| { |
| struct gomp_thread *thr = gomp_thread (); |
| struct gomp_team *team = thr->ts.team; |
| struct gomp_task *task = thr->task; |
| - struct gomp_taskgroup *taskgroup; |
| |
| /* If team is NULL, all tasks are executed as |
| GOMP_TASK_UNDEFERRED tasks and thus all children tasks of |
| @@ -1668,14 +1857,7 @@ GOMP_taskgroup_start (void) |
| by the time GOMP_taskgroup_end is called. */ |
| if (team == NULL) |
| return; |
| - taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup)); |
| - taskgroup->prev = task->taskgroup; |
| - priority_queue_init (&taskgroup->taskgroup_queue); |
| - taskgroup->in_taskgroup_wait = false; |
| - taskgroup->cancelled = false; |
| - taskgroup->num_children = 0; |
| - gomp_sem_init (&taskgroup->taskgroup_sem, 0); |
| - task->taskgroup = taskgroup; |
| + task->taskgroup = gomp_taskgroup_init (task->taskgroup); |
| } |
| |
| void |
| @@ -1840,6 +2022,302 @@ GOMP_taskgroup_end (void) |
| free (taskgroup); |
| } |
| |
| +static inline __attribute__((always_inline)) void |
| +gomp_reduction_register (uintptr_t *data, uintptr_t *old, uintptr_t *orig, |
| + unsigned nthreads) |
| +{ |
| + size_t total_cnt = 0; |
| + uintptr_t *d = data; |
| + struct htab *old_htab = NULL, *new_htab; |
| + do |
| + { |
| + if (__builtin_expect (orig != NULL, 0)) |
| + { |
| + /* For worksharing task reductions, memory has been allocated |
| + already by some other thread that encountered the construct |
| + earlier. */ |
| + d[2] = orig[2]; |
| + d[6] = orig[6]; |
| + orig = (uintptr_t *) orig[4]; |
| + } |
| + else |
| + { |
| + size_t sz = d[1] * nthreads; |
| + /* Should use omp_alloc if d[3] is not -1. */ |
| + void *ptr = gomp_aligned_alloc (d[2], sz); |
| + memset (ptr, '\0', sz); |
| + d[2] = (uintptr_t) ptr; |
| + d[6] = d[2] + sz; |
| + } |
| + d[5] = 0; |
| + total_cnt += d[0]; |
| + if (d[4] == 0) |
| + { |
| + d[4] = (uintptr_t) old; |
| + break; |
| + } |
| + else |
| + d = (uintptr_t *) d[4]; |
| + } |
| + while (1); |
| + if (old && old[5]) |
| + { |
| + old_htab = (struct htab *) old[5]; |
| + total_cnt += htab_elements (old_htab); |
| + } |
| + new_htab = htab_create (total_cnt); |
| + if (old_htab) |
| + { |
| + /* Copy old hash table, like in htab_expand. */ |
| + hash_entry_type *p, *olimit; |
| + new_htab->n_elements = htab_elements (old_htab); |
| + olimit = old_htab->entries + old_htab->size; |
| + p = old_htab->entries; |
| + do |
| + { |
| + hash_entry_type x = *p; |
| + if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY) |
| + *find_empty_slot_for_expand (new_htab, htab_hash (x)) = x; |
| + p++; |
| + } |
| + while (p < olimit); |
| + } |
| + d = data; |
| + do |
| + { |
| + size_t j; |
| + for (j = 0; j < d[0]; ++j) |
| + { |
| + uintptr_t *p = d + 7 + j * 3; |
| + p[2] = (uintptr_t) d; |
| + /* Ugly hack, hash_entry_type is defined for the task dependencies, |
| + which hash on the first element which is a pointer. We need |
| + to hash also on the first sizeof (uintptr_t) bytes which contain |
| + a pointer. Hide the cast from the compiler. */ |
| + hash_entry_type n; |
| + __asm ("" : "=g" (n) : "0" (p)); |
| + *htab_find_slot (&new_htab, n, INSERT) = n; |
| + } |
| + if (d[4] == (uintptr_t) old) |
| + break; |
| + else |
| + d = (uintptr_t *) d[4]; |
| + } |
| + while (1); |
| + d[5] = (uintptr_t) new_htab; |
| +} |
| + |
| +static void |
| +gomp_create_artificial_team (void) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + struct gomp_task_icv *icv; |
| + struct gomp_team *team = gomp_new_team (1); |
| + struct gomp_task *task = thr->task; |
| + icv = task ? &task->icv : &gomp_global_icv; |
| + team->prev_ts = thr->ts; |
| + thr->ts.team = team; |
| + thr->ts.team_id = 0; |
| + thr->ts.work_share = &team->work_shares[0]; |
| + thr->ts.last_work_share = NULL; |
| +#ifdef HAVE_SYNC_BUILTINS |
| + thr->ts.single_count = 0; |
| +#endif |
| + thr->ts.static_trip = 0; |
| + thr->task = &team->implicit_task[0]; |
| + gomp_init_task (thr->task, NULL, icv); |
| + if (task) |
| + { |
| + thr->task = task; |
| + gomp_end_task (); |
| + free (task); |
| + thr->task = &team->implicit_task[0]; |
| + } |
| +#ifdef LIBGOMP_USE_PTHREADS |
| + else |
| + pthread_setspecific (gomp_thread_destructor, thr); |
| +#endif |
| +} |
| + |
| +/* The format of data is: |
| + data[0] cnt |
| + data[1] size |
| + data[2] alignment (on output array pointer) |
| + data[3] allocator (-1 if malloc allocator) |
| + data[4] next pointer |
| + data[5] used internally (htab pointer) |
| + data[6] used internally (end of array) |
| + cnt times |
| + ent[0] address |
| + ent[1] offset |
| + ent[2] used internally (pointer to data[0]) |
| + The entries are sorted by increasing offset, so that a binary |
| + search can be performed. Normally, data[8] is 0; the exception is
| + for worksharing construct task reductions in a cancellable parallel,
| + where at offset 0 there should be space for a pointer and an integer
| + which are used internally. */
| + |
| +void |
| +GOMP_taskgroup_reduction_register (uintptr_t *data) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + struct gomp_team *team = thr->ts.team; |
| + struct gomp_task *task; |
| + unsigned nthreads; |
| + if (__builtin_expect (team == NULL, 0)) |
| + { |
| + /* The task reduction code needs a team and task, so for |
| + orphaned taskgroups just create the implicit team. */ |
| + gomp_create_artificial_team (); |
| + ialias_call (GOMP_taskgroup_start) (); |
| + team = thr->ts.team; |
| + } |
| + nthreads = team->nthreads; |
| + task = thr->task; |
| + gomp_reduction_register (data, task->taskgroup->reductions, NULL, nthreads); |
| + task->taskgroup->reductions = data; |
| +} |
| + |
| +void |
| +GOMP_taskgroup_reduction_unregister (uintptr_t *data) |
| +{ |
| + uintptr_t *d = data; |
| + htab_free ((struct htab *) data[5]); |
| + do |
| + { |
| + gomp_aligned_free ((void *) d[2]); |
| + d = (uintptr_t *) d[4]; |
| + } |
| + while (d && !d[5]); |
| +} |
| +ialias (GOMP_taskgroup_reduction_unregister) |
| + |
| +/* For i = 0 to cnt-1, remap ptrs[i] which is either address of the |
| + original list item or address of previously remapped original list |
| + item to address of the private copy, store that to ptrs[i]. |
| + For i < cntorig, additionally set ptrs[cnt+i] to the address of |
| + the original list item. */ |
| + |
| +void
| +GOMP_task_reduction_remap (size_t cnt, size_t cntorig, void **ptrs)
| +{
| + struct gomp_thread *thr = gomp_thread ();
| + struct gomp_task *task = thr->task;
| + unsigned id = thr->ts.team_id;
| + uintptr_t *data = task->taskgroup->reductions;
| + uintptr_t *d;
| + struct htab *reduction_htab = (struct htab *) data[5];
| + size_t i;
| + for (i = 0; i < cnt; ++i)
| + {
| + hash_entry_type ent, n;
| + __asm ("" : "=g" (ent) : "0" (ptrs + i));
| + n = htab_find (reduction_htab, ent);
| + if (n)
| + {
| + uintptr_t *p;
| + __asm ("" : "=g" (p) : "0" (n));
| + /* At this point, p[0] should be equal to (uintptr_t) ptrs[i],
| + p[1] is the offset within the allocated chunk for each
| + thread, p[2] is the array registered with
| + GOMP_taskgroup_reduction_register, d[2] is the base of the
| + allocated memory and d[1] is the size of the allocated chunk
| + for one thread. */
| + d = (uintptr_t *) p[2];
| + ptrs[i] = (void *) (d[2] + id * d[1] + p[1]);
| + if (__builtin_expect (i < cntorig, 0))
| + ptrs[cnt + i] = (void *) p[0];
| + continue;
| + }
| + d = data;
| + while (d != NULL)
| + {
| + if ((uintptr_t) ptrs[i] >= d[2] && (uintptr_t) ptrs[i] < d[6])
| + break;
| + d = (uintptr_t *) d[4];
| + }
| + if (d == NULL)
| + gomp_fatal ("couldn't find matching task_reduction or reduction with "
| + "task modifier for %p", ptrs[i]);
| + uintptr_t off = ((uintptr_t) ptrs[i] - d[2]) % d[1];
| + ptrs[i] = (void *) (d[2] + id * d[1] + off);
| + if (__builtin_expect (i < cntorig, 0))
| + {
| + size_t lo = 0, hi = d[0]; /* Search the half-open range [lo, hi). */
| + while (lo < hi)
| + {
| + size_t m = lo + (hi - lo) / 2;
| + if (d[7 + 3 * m + 1] < off)
| + lo = m + 1;
| + else if (d[7 + 3 * m + 1] == off)
| + {
| + ptrs[cnt + i] = (void *) d[7 + 3 * m];
| + break;
| + }
| + else
| + hi = m; /* Not m - 1: that would wrap around when m == 0. */
| + }
| + if (lo >= hi)
| + gomp_fatal ("couldn't find matching task_reduction or reduction "
| + "with task modifier for %p", ptrs[i]);
| + }
| + }
| +}
| + |
| +struct gomp_taskgroup * |
| +gomp_parallel_reduction_register (uintptr_t *data, unsigned nthreads) |
| +{ |
| + struct gomp_taskgroup *taskgroup = gomp_taskgroup_init (NULL); |
| + gomp_reduction_register (data, NULL, NULL, nthreads); |
| + taskgroup->reductions = data; |
| + return taskgroup; |
| +} |
| + |
| +void |
| +gomp_workshare_task_reduction_register (uintptr_t *data, uintptr_t *orig) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + struct gomp_team *team = thr->ts.team; |
| + struct gomp_task *task = thr->task; |
| + unsigned nthreads = team->nthreads; |
| + gomp_reduction_register (data, task->taskgroup->reductions, orig, nthreads); |
| + task->taskgroup->reductions = data; |
| +} |
| + |
| +void |
| +gomp_workshare_taskgroup_start (void) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + struct gomp_team *team = thr->ts.team; |
| + struct gomp_task *task; |
| + |
| + if (team == NULL) |
| + { |
| + gomp_create_artificial_team (); |
| + team = thr->ts.team; |
| + } |
| + task = thr->task; |
| + task->taskgroup = gomp_taskgroup_init (task->taskgroup); |
| + task->taskgroup->workshare = true; |
| +} |
| + |
| +void |
| +GOMP_workshare_task_reduction_unregister (bool cancelled) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + struct gomp_task *task = thr->task; |
| + struct gomp_team *team = thr->ts.team; |
| + uintptr_t *data = task->taskgroup->reductions; |
| + ialias_call (GOMP_taskgroup_end) (); |
| + if (thr->ts.team_id == 0) |
| + ialias_call (GOMP_taskgroup_reduction_unregister) (data); |
| + else |
| + htab_free ((struct htab *) data[5]); |
| + |
| + if (!cancelled) |
| + gomp_team_barrier_wait (&team->barrier); |
| +} |
| + |
| int |
| omp_in_final (void) |
| { |
| |
| |
| @@ -32,7 +32,6 @@ |
| #include <string.h> |
| |
| #ifdef LIBGOMP_USE_PTHREADS |
| -/* This attribute contains PTHREAD_CREATE_DETACHED. */ |
| pthread_attr_t gomp_thread_attr; |
| |
| /* This key is for the thread destructor. */ |
| @@ -58,6 +57,7 @@ struct gomp_thread_start_data |
| struct gomp_thread_pool *thread_pool; |
| unsigned int place; |
| bool nested; |
| + pthread_t handle; |
| }; |
| |
| |
| @@ -89,6 +89,9 @@ gomp_thread_start (void *xdata) |
| thr->ts = data->ts; |
| thr->task = data->task; |
| thr->place = data->place; |
| +#ifdef GOMP_NEEDS_THREAD_HANDLE |
| + thr->handle = data->handle; |
| +#endif |
| |
| thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release; |
| |
| @@ -131,6 +134,7 @@ gomp_thread_start (void *xdata) |
| } |
| |
| gomp_sem_destroy (&thr->release); |
| + pthread_detach (pthread_self ()); |
| thr->thread_pool = NULL; |
| thr->task = NULL; |
| return NULL; |
| @@ -183,7 +187,7 @@ gomp_new_team (unsigned nthreads) |
| team->single_count = 0; |
| #endif |
| team->work_shares_to_free = &team->work_shares[0]; |
| - gomp_init_work_share (&team->work_shares[0], false, nthreads); |
| + gomp_init_work_share (&team->work_shares[0], 0, nthreads); |
| team->work_shares[0].next_alloc = NULL; |
| team->work_share_list_free = NULL; |
| team->work_share_list_alloc = &team->work_shares[1]; |
| @@ -231,6 +235,7 @@ gomp_free_pool_helper (void *thread_pool |
| thr->thread_pool = NULL; |
| thr->task = NULL; |
| #ifdef LIBGOMP_USE_PTHREADS |
| + pthread_detach (pthread_self ()); |
| pthread_exit (NULL); |
| #elif defined(__nvptx__) |
| asm ("exit;"); |
| @@ -297,7 +302,8 @@ gomp_free_thread (void *arg __attribute_ |
| #ifdef LIBGOMP_USE_PTHREADS |
| void |
| gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, |
| - unsigned flags, struct gomp_team *team) |
| + unsigned flags, struct gomp_team *team, |
| + struct gomp_taskgroup *taskgroup) |
| { |
| struct gomp_thread_start_data *start_data; |
| struct gomp_thread *thr, *nthr; |
| @@ -312,6 +318,7 @@ gomp_team_start (void (*fn) (void *), vo |
| unsigned int s = 0, rest = 0, p = 0, k = 0; |
| unsigned int affinity_count = 0; |
| struct gomp_thread **affinity_thr = NULL; |
| + bool force_display = false; |
| |
| thr = gomp_thread (); |
| nested = thr->ts.level; |
| @@ -319,7 +326,12 @@ gomp_team_start (void (*fn) (void *), vo |
| task = thr->task; |
| icv = task ? &task->icv : &gomp_global_icv; |
| if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0) |
| - gomp_init_affinity (); |
| + { |
| + gomp_init_affinity (); |
| + if (__builtin_expect (gomp_display_affinity_var, 0) && nthreads == 1) |
| + gomp_display_affinity_thread (gomp_thread_self (), &thr->ts, |
| + thr->place); |
| + } |
| |
| /* Always save the previous state, even if this isn't a nested team. |
| In particular, we should save any work share state from an outer |
| @@ -338,6 +350,9 @@ gomp_team_start (void (*fn) (void *), vo |
| #endif |
| thr->ts.static_trip = 0; |
| thr->task = &team->implicit_task[0]; |
| +#ifdef GOMP_NEEDS_THREAD_HANDLE |
| + thr->handle = pthread_self (); |
| +#endif |
| nthreads_var = icv->nthreads_var; |
| if (__builtin_expect (gomp_nthreads_var_list != NULL, 0) |
| && thr->ts.level < gomp_nthreads_var_list_len) |
| @@ -350,6 +365,7 @@ gomp_team_start (void (*fn) (void *), vo |
| && thr->ts.level < gomp_bind_var_list_len) |
| bind_var = gomp_bind_var_list[thr->ts.level]; |
| gomp_init_task (thr->task, task, icv); |
| + thr->task->taskgroup = taskgroup; |
| team->implicit_task[0].icv.nthreads_var = nthreads_var; |
| team->implicit_task[0].icv.bind_var = bind_var; |
| |
| @@ -465,7 +481,9 @@ gomp_team_start (void (*fn) (void *), vo |
| pool->threads |
| = gomp_realloc (pool->threads, |
| pool->threads_size |
| - * sizeof (struct gomp_thread_data *)); |
| + * sizeof (struct gomp_thread *)); |
| + /* Add current (master) thread to threads[]. */ |
| + pool->threads[0] = thr; |
| } |
| |
| /* Release existing idle threads. */ |
| @@ -540,6 +558,7 @@ gomp_team_start (void (*fn) (void *), vo |
| + place_partition_len)) |
| { |
| unsigned int l; |
| + force_display = true; |
| if (affinity_thr == NULL) |
| { |
| unsigned int j; |
| @@ -623,6 +642,7 @@ gomp_team_start (void (*fn) (void *), vo |
| gomp_init_task (nthr->task, task, icv); |
| team->implicit_task[i].icv.nthreads_var = nthreads_var; |
| team->implicit_task[i].icv.bind_var = bind_var; |
| + nthr->task->taskgroup = taskgroup; |
| nthr->fn = fn; |
| nthr->data = data; |
| team->ordered_release[i] = &nthr->release; |
| @@ -712,19 +732,17 @@ gomp_team_start (void (*fn) (void *), vo |
| { |
| size_t stacksize; |
| pthread_attr_init (&thread_attr); |
| - pthread_attr_setdetachstate (&thread_attr, PTHREAD_CREATE_DETACHED); |
| if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize)) |
| pthread_attr_setstacksize (&thread_attr, stacksize); |
| attr = &thread_attr; |
| } |
| |
| start_data = gomp_alloca (sizeof (struct gomp_thread_start_data) |
| - * (nthreads-i)); |
| + * (nthreads - i)); |
| |
| /* Launch new threads. */ |
| for (; i < nthreads; ++i) |
| { |
| - pthread_t pt; |
| int err; |
| |
| start_data->ts.place_partition_off = thr->ts.place_partition_off; |
| @@ -810,11 +828,14 @@ gomp_team_start (void (*fn) (void *), vo |
| gomp_init_task (start_data->task, task, icv); |
| team->implicit_task[i].icv.nthreads_var = nthreads_var; |
| team->implicit_task[i].icv.bind_var = bind_var; |
| + start_data->task->taskgroup = taskgroup; |
| start_data->thread_pool = pool; |
| start_data->nested = nested; |
| |
| attr = gomp_adjust_thread_attr (attr, &thread_attr); |
| - err = pthread_create (&pt, attr, gomp_thread_start, start_data++); |
| + err = pthread_create (&start_data->handle, attr, gomp_thread_start, |
| + start_data); |
| + start_data++; |
| if (err != 0) |
| gomp_fatal ("Thread creation failed: %s", strerror (err)); |
| } |
| @@ -854,6 +875,42 @@ gomp_team_start (void (*fn) (void *), vo |
| gomp_mutex_unlock (&gomp_managed_threads_lock); |
| #endif |
| } |
| + if (__builtin_expect (gomp_display_affinity_var, 0)) |
| + { |
| + if (nested |
| + || nthreads != old_threads_used |
| + || force_display) |
| + { |
| + gomp_display_affinity_thread (gomp_thread_self (), &thr->ts, |
| + thr->place); |
| + if (nested) |
| + { |
| + start_data -= nthreads - 1; |
| + for (i = 1; i < nthreads; ++i) |
| + { |
| + gomp_display_affinity_thread ( |
| +#ifdef LIBGOMP_USE_PTHREADS |
| + start_data->handle, |
| +#else |
| + gomp_thread_self (), |
| +#endif |
| + &start_data->ts, |
| + start_data->place); |
| + start_data++; |
| + } |
| + } |
| + else |
| + { |
| + for (i = 1; i < nthreads; ++i) |
| + { |
| + gomp_thread_handle handle |
| + = gomp_thread_to_pthread_t (pool->threads[i]); |
| + gomp_display_affinity_thread (handle, &pool->threads[i]->ts, |
| + pool->threads[i]->place); |
| + } |
| + } |
| + } |
| + } |
| if (__builtin_expect (affinity_thr != NULL, 0) |
| && team->prev_ts.place_partition_len > 64) |
| free (affinity_thr); |
| @@ -894,7 +951,7 @@ gomp_team_end (void) |
| gomp_end_task (); |
| thr->ts = team->prev_ts; |
| |
| - if (__builtin_expect (thr->ts.team != NULL, 0)) |
| + if (__builtin_expect (thr->ts.level != 0, 0)) |
| { |
| #ifdef HAVE_SYNC_BUILTINS |
| __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads); |
| @@ -959,6 +1016,76 @@ team_destructor (void) |
| crashes. */ |
| pthread_key_delete (gomp_thread_destructor); |
| } |
| + |
| +/* Similar to gomp_free_pool_helper, but doesn't detach itself,
| + because gomp_pause_host will pthread_join those threads. */
| + |
| +static void |
| +gomp_pause_pool_helper (void *thread_pool) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + struct gomp_thread_pool *pool |
| + = (struct gomp_thread_pool *) thread_pool; |
| + gomp_simple_barrier_wait_last (&pool->threads_dock); |
| + gomp_sem_destroy (&thr->release); |
| + thr->thread_pool = NULL; |
| + thr->task = NULL; |
| + pthread_exit (NULL); |
| +} |
| + |
| +/* Free a thread pool and release its threads. Return non-zero on |
| + failure. */ |
| + |
| +int |
| +gomp_pause_host (void) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + struct gomp_thread_pool *pool = thr->thread_pool; |
| + if (thr->ts.level) |
| + return -1; |
| + if (pool) |
| + { |
| + if (pool->threads_used > 0) |
| + { |
| + int i; |
| + pthread_t *thrs |
| + = gomp_alloca (sizeof (pthread_t) * pool->threads_used); |
| + for (i = 1; i < pool->threads_used; i++) |
| + { |
| + struct gomp_thread *nthr = pool->threads[i]; |
| + nthr->fn = gomp_pause_pool_helper; |
| + nthr->data = pool; |
| + thrs[i] = gomp_thread_to_pthread_t (nthr); |
| + } |
| + /* This barrier undocks threads docked on pool->threads_dock. */ |
| + gomp_simple_barrier_wait (&pool->threads_dock); |
| + /* And this waits till all threads have called
| + gomp_simple_barrier_wait_last in gomp_pause_pool_helper. */
| + gomp_simple_barrier_wait (&pool->threads_dock); |
| + /* Now it is safe to destroy the barrier and free the pool. */ |
| + gomp_simple_barrier_destroy (&pool->threads_dock); |
| + |
| +#ifdef HAVE_SYNC_BUILTINS |
| + __sync_fetch_and_add (&gomp_managed_threads, |
| + 1L - pool->threads_used); |
| +#else |
| + gomp_mutex_lock (&gomp_managed_threads_lock); |
| + gomp_managed_threads -= pool->threads_used - 1L; |
| + gomp_mutex_unlock (&gomp_managed_threads_lock); |
| +#endif |
| + for (i = 1; i < pool->threads_used; i++) |
| + pthread_join (thrs[i], NULL); |
| + } |
| + if (pool->last_team) |
| + free_team (pool->last_team); |
| +#ifndef __nvptx__ |
| + free (pool->threads); |
| + free (pool); |
| +#endif |
| + thr->thread_pool = NULL; |
| + } |
| + return 0; |
| +} |
| #endif |
| |
| struct gomp_task_icv * |
| |
| |
| @@ -44,6 +44,7 @@ |
| #include "config.h" |
| #include "gstdint.h" |
| #include "libgomp-plugin.h" |
| +#include "gomp-constants.h" |
| |
| #ifdef HAVE_PTHREAD_H |
| #include <pthread.h> |
| @@ -85,9 +86,21 @@ enum memmodel |
| |
| /* alloc.c */ |
| |
| +#if defined(HAVE_ALIGNED_ALLOC) \ |
| + || defined(HAVE__ALIGNED_MALLOC) \ |
| + || defined(HAVE_POSIX_MEMALIGN) \ |
| + || defined(HAVE_MEMALIGN) |
| +/* Defined if gomp_aligned_alloc doesn't use fallback version |
| + and free can be used instead of gomp_aligned_free. */ |
| +#define GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC 1 |
| +#endif |
| + |
| extern void *gomp_malloc (size_t) __attribute__((malloc)); |
| extern void *gomp_malloc_cleared (size_t) __attribute__((malloc)); |
| extern void *gomp_realloc (void *, size_t); |
| +extern void *gomp_aligned_alloc (size_t, size_t) |
| + __attribute__((malloc, alloc_size (2))); |
| +extern void gomp_aligned_free (void *); |
| |
| /* Avoid conflicting prototypes of alloca() in system headers by using |
| GCC's builtin alloca(). */ |
| @@ -137,7 +150,8 @@ enum gomp_schedule_type |
| GFS_STATIC, |
| GFS_DYNAMIC, |
| GFS_GUIDED, |
| - GFS_AUTO |
| + GFS_AUTO, |
| + GFS_MONOTONIC = 0x80000000U |
| }; |
| |
| struct gomp_doacross_work_share |
| @@ -174,6 +188,8 @@ struct gomp_doacross_work_share |
| /* Likewise, but for the ull implementation. */ |
| unsigned long long boundary_ull; |
| }; |
| + /* Pointer to extra memory if needed for lastprivate(conditional). */ |
| + void *extra; |
| /* Array of shift counts for each dimension if they can be flattened. */ |
| unsigned int shift_counts[]; |
| }; |
| @@ -275,6 +291,9 @@ struct gomp_work_share |
| struct gomp_work_share *next_free; |
| }; |
| |
| + /* Task reductions for this work-sharing construct. */ |
| + uintptr_t *task_reductions; |
| + |
| /* If only few threads are in the team, ordered_team_ids can point |
| to this array which fills the padding at the end of this struct. */ |
| unsigned inline_ordered_team_ids[0]; |
| @@ -365,8 +384,12 @@ extern void **gomp_places_list; |
| extern unsigned long gomp_places_list_len; |
| extern unsigned int gomp_num_teams_var; |
| extern int gomp_debug_var; |
| +extern bool gomp_display_affinity_var; |
| +extern char *gomp_affinity_format_var; |
| +extern size_t gomp_affinity_format_len; |
| extern int goacc_device_num; |
| extern char *goacc_device_type; |
| +extern int goacc_default_dims[GOMP_DIM_MAX]; |
| |
| enum gomp_task_kind |
| { |
| @@ -469,8 +492,10 @@ struct gomp_taskgroup |
| struct gomp_taskgroup *prev; |
| /* Queue of tasks that belong in this taskgroup. */ |
| struct priority_queue taskgroup_queue; |
| + uintptr_t *reductions; |
| bool in_taskgroup_wait; |
| bool cancelled; |
| + bool workshare; |
| gomp_sem_t taskgroup_sem; |
| size_t num_children; |
| }; |
| @@ -613,6 +638,19 @@ struct gomp_thread |
| |
| /* User pthread thread pool */ |
| struct gomp_thread_pool *thread_pool; |
| + |
| +#if defined(LIBGOMP_USE_PTHREADS) \ |
| + && (!defined(HAVE_TLS) \ |
| + || !defined(__GLIBC__) \ |
| + || !defined(USING_INITIAL_EXEC_TLS)) |
| + /* pthread_t of the thread containing this gomp_thread. |
| + On Linux when using initial-exec TLS, |
| + (typeof (pthread_t)) gomp_thread () - pthread_self () |
| + is constant in all threads, so we can optimize and not |
| + store it. */ |
| +#define GOMP_NEEDS_THREAD_HANDLE 1 |
| + pthread_t handle; |
| +#endif |
| }; |
| |
| |
| @@ -709,6 +747,25 @@ extern bool gomp_affinity_finalize_place |
| extern bool gomp_affinity_init_level (int, unsigned long, bool); |
| extern void gomp_affinity_print_place (void *); |
| extern void gomp_get_place_proc_ids_8 (int, int64_t *); |
| +extern void gomp_display_affinity_place (char *, size_t, size_t *, int); |
| + |
| +/* affinity-fmt.c */ |
| + |
| +extern void gomp_print_string (const char *str, size_t len); |
| +extern void gomp_set_affinity_format (const char *, size_t); |
| +extern void gomp_display_string (char *, size_t, size_t *, const char *, |
| + size_t); |
| +#ifdef LIBGOMP_USE_PTHREADS |
| +typedef pthread_t gomp_thread_handle; |
| +#else |
| +typedef struct {} gomp_thread_handle; |
| +#endif |
| +extern size_t gomp_display_affinity (char *, size_t, const char *, |
| + gomp_thread_handle, |
| + struct gomp_team_state *, unsigned int); |
| +extern void gomp_display_affinity_thread (gomp_thread_handle, |
| + struct gomp_team_state *, |
| + unsigned int) __attribute__((cold)); |
| |
| /* iter.c */ |
| |
| @@ -745,9 +802,9 @@ extern void gomp_ordered_next (void); |
| extern void gomp_ordered_static_init (void); |
| extern void gomp_ordered_static_next (void); |
| extern void gomp_ordered_sync (void); |
| -extern void gomp_doacross_init (unsigned, long *, long); |
| +extern void gomp_doacross_init (unsigned, long *, long, size_t); |
| extern void gomp_doacross_ull_init (unsigned, unsigned long long *, |
| - unsigned long long); |
| + unsigned long long, size_t); |
| |
| /* parallel.c */ |
| |
| @@ -770,6 +827,10 @@ extern bool gomp_create_target_task (str |
| size_t *, unsigned short *, unsigned int, |
| void **, void **, |
| enum gomp_target_task_state); |
| +extern struct gomp_taskgroup *gomp_parallel_reduction_register (uintptr_t *, |
| + unsigned); |
| +extern void gomp_workshare_taskgroup_start (void); |
| +extern void gomp_workshare_task_reduction_register (uintptr_t *, uintptr_t *); |
| |
| static void inline |
| gomp_finish_task (struct gomp_task *task) |
| @@ -782,9 +843,11 @@ gomp_finish_task (struct gomp_task *task |
| |
| extern struct gomp_team *gomp_new_team (unsigned); |
| extern void gomp_team_start (void (*) (void *), void *, unsigned, |
| - unsigned, struct gomp_team *); |
| + unsigned, struct gomp_team *, |
| + struct gomp_taskgroup *); |
| extern void gomp_team_end (void); |
| extern void gomp_free_thread (void *); |
| +extern int gomp_pause_host (void); |
| |
| /* target.c */ |
| |
| @@ -851,6 +914,8 @@ struct splay_tree_key_s { |
| uintptr_t tgt_offset; |
| /* Reference count. */ |
| uintptr_t refcount; |
| + /* Dynamic reference count. */ |
| + uintptr_t dynamic_refcount; |
| /* Pointer to the original mapping of "omp declare target link" object. */ |
| splay_tree_key link_key; |
| }; |
| @@ -989,7 +1054,9 @@ enum gomp_map_vars_kind |
| }; |
| |
| extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *); |
| -extern void gomp_acc_remove_pointer (void *, bool, int, int); |
| +extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int); |
| +extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *, |
| + unsigned short *); |
| |
| extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *, |
| size_t, void **, void **, |
| @@ -999,12 +1066,13 @@ extern void gomp_unmap_vars (struct targ |
| extern void gomp_init_device (struct gomp_device_descr *); |
| extern void gomp_free_memmap (struct splay_tree_s *); |
| extern void gomp_unload_device (struct gomp_device_descr *); |
| +extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key); |
| |
| /* work.c */ |
| |
| -extern void gomp_init_work_share (struct gomp_work_share *, bool, unsigned); |
| +extern void gomp_init_work_share (struct gomp_work_share *, size_t, unsigned); |
| extern void gomp_fini_work_share (struct gomp_work_share *); |
| -extern bool gomp_work_share_start (bool); |
| +extern bool gomp_work_share_start (size_t); |
| extern void gomp_work_share_end (void); |
| extern bool gomp_work_share_end_cancel (void); |
| extern void gomp_work_share_end_nowait (void); |
| @@ -1028,6 +1096,14 @@ gomp_work_share_init_done (void) |
| #include "omp-lock.h" |
| #define _LIBGOMP_OMP_LOCK_DEFINED 1 |
| #include "omp.h.in" |
| +#define omp_sched_monotonic 0x80000000U |
| +typedef enum omp_pause_resource_t |
| +{ |
| + omp_pause_soft = 1, |
| + omp_pause_hard = 2 |
| +} omp_pause_resource_t; |
| +extern int omp_pause_resource (omp_pause_resource_t, int) __GOMP_NOTHROW; |
| +extern int omp_pause_resource_all (omp_pause_resource_t) __GOMP_NOTHROW; |
| |
| #if !defined (HAVE_ATTRIBUTE_VISIBILITY) \ |
| || !defined (HAVE_ATTRIBUTE_ALIAS) \ |
| @@ -1082,16 +1158,26 @@ extern int gomp_test_nest_lock_25 (omp_n |
| # define attribute_hidden |
| #endif |
| |
| +#if __GNUC__ >= 9 |
| +# define HAVE_ATTRIBUTE_COPY |
| +#endif |
| + |
| +#ifdef HAVE_ATTRIBUTE_COPY |
| +# define attribute_copy(arg) __attribute__ ((copy (arg))) |
| +#else |
| +# define attribute_copy(arg) |
| +#endif |
| + |
| #ifdef HAVE_ATTRIBUTE_ALIAS |
| # define strong_alias(fn, al) \ |
| - extern __typeof (fn) al __attribute__ ((alias (#fn))); |
| + extern __typeof (fn) al __attribute__ ((alias (#fn))) attribute_copy (fn); |
| |
| # define ialias_ulp ialias_str1(__USER_LABEL_PREFIX__) |
| # define ialias_str1(x) ialias_str2(x) |
| # define ialias_str2(x) #x |
| # define ialias(fn) \ |
| extern __typeof (fn) gomp_ialias_##fn \ |
| - __attribute__ ((alias (#fn))) attribute_hidden; |
| + __attribute__ ((alias (#fn))) attribute_hidden attribute_copy (fn); |
| # define ialias_redirect(fn) \ |
| extern __typeof (fn) fn __asm__ (ialias_ulp "gomp_ialias_" #fn) attribute_hidden; |
| # define ialias_call(fn) gomp_ialias_ ## fn |
| @@ -1131,4 +1217,42 @@ task_to_priority_node (enum priority_que |
| return (struct priority_node *) ((char *) task |
| + priority_queue_offset (type)); |
| } |
| + |
| +#ifdef LIBGOMP_USE_PTHREADS |
| +static inline gomp_thread_handle |
| +gomp_thread_self (void) |
| +{ |
| + return pthread_self (); |
| +} |
| + |
| +static inline gomp_thread_handle |
| +gomp_thread_to_pthread_t (struct gomp_thread *thr) |
| +{ |
| + struct gomp_thread *this_thr = gomp_thread (); |
| + if (thr == this_thr) |
| + return pthread_self (); |
| +#ifdef GOMP_NEEDS_THREAD_HANDLE |
| + return thr->handle; |
| +#else |
| + /* On Linux with initial-exec TLS, the pthread_t of the thread containing |
| + thr can be computed from thr, this_thr and pthread_self (), |
| + as the distance between this_thr and pthread_self () is constant. */ |
| + return pthread_self () + ((uintptr_t) thr - (uintptr_t) this_thr); |
| +#endif |
| +} |
| +#else |
| +static inline gomp_thread_handle |
| +gomp_thread_self (void) |
| +{ |
| + return (gomp_thread_handle) {}; |
| +} |
| + |
| +static inline gomp_thread_handle |
| +gomp_thread_to_pthread_t (struct gomp_thread *thr) |
| +{ |
| + (void) thr; |
| + return gomp_thread_self (); |
| +} |
| +#endif |
| + |
| #endif /* LIBGOMP_H */ |
| |
| |
| @@ -27,6 +27,8 @@ |
| /* This file handles OpenACC constructs. */ |
| |
| #include "openacc.h" |
| +void acc_copyout_finalize (void *, size_t) __GOACC_NOTHROW; |
| +void acc_delete_finalize (void *, size_t) __GOACC_NOTHROW; |
| #include "libgomp.h" |
| #include "libgomp_g.h" |
| #include "gomp-constants.h" |
| @@ -38,31 +40,95 @@ |
| #include <stdarg.h> |
| #include <assert.h> |
| |
| + |
| +/* In the ABI, the GOACC_FLAGs are encoded as an inverted bitmask, so that we |
| + continue to support the following two legacy values. */ |
| +_Static_assert (GOACC_FLAGS_UNMARSHAL (GOMP_DEVICE_ICV) == 0, |
| + "legacy GOMP_DEVICE_ICV broken"); |
| +_Static_assert (GOACC_FLAGS_UNMARSHAL (GOMP_DEVICE_HOST_FALLBACK) |
| + == GOACC_FLAG_HOST_FALLBACK, |
| + "legacy GOMP_DEVICE_HOST_FALLBACK broken"); |
| + |
| + |
| +/* Returns the number of mappings associated with the pointer or pset. A PSET
| + has three mappings, whereas a pointer has two. Returns 0 if the mapping after POS is neither. */
| + |
| static int |
| -find_pset (int pos, size_t mapnum, unsigned short *kinds) |
| +find_pointer (int pos, size_t mapnum, unsigned short *kinds) |
| { |
| if (pos + 1 >= mapnum) |
| return 0; |
| |
| unsigned char kind = kinds[pos+1] & 0xff; |
| |
| - return kind == GOMP_MAP_TO_PSET; |
| + if (kind == GOMP_MAP_TO_PSET) |
| + return 3; |
| + else if (kind == GOMP_MAP_POINTER) |
| + return 2; |
| + |
| + return 0; |
| +} |
| + |
| +/* Handle the mapping pairs that are presented when a |
| + deviceptr clause is used with Fortran. */ |
| + |
| +static void |
| +handle_ftn_pointers (size_t mapnum, void **hostaddrs, size_t *sizes, |
| + unsigned short *kinds) |
| +{ |
| + int i; |
| + |
| + for (i = 0; i < mapnum; i++) |
| + { |
| + unsigned short kind1 = kinds[i] & 0xff; |
| + |
| + /* Handle Fortran deviceptr clause. */ |
| + if (kind1 == GOMP_MAP_FORCE_DEVICEPTR) |
| + { |
| + unsigned short kind2; |
| + |
| + if (i < (signed)mapnum - 1) |
| + kind2 = kinds[i + 1] & 0xff; |
| + else |
| + kind2 = 0xffff; |
| + |
| + if (sizes[i] == sizeof (void *)) |
| + continue; |
| + |
| + /* At this point, we're dealing with a Fortran deviceptr. |
| + If the next element is not what we're expecting, then |
| + this is an instance of where the deviceptr variable was |
| + not used within the region and the pointer was removed |
| + by the gimplifier. */ |
| + if (kind2 == GOMP_MAP_POINTER |
| + && sizes[i + 1] == 0 |
| + && hostaddrs[i] == *(void **)hostaddrs[i + 1]) |
| + { |
| + kinds[i+1] = kinds[i]; |
| + sizes[i+1] = sizeof (void *); |
| + } |
| + |
| + /* Invalidate the entry. */ |
| + hostaddrs[i] = NULL; |
| + } |
| + } |
| } |
| |
| static void goacc_wait (int async, int num_waits, va_list *ap); |
| |
| |
| -/* Launch a possibly offloaded function on DEVICE. FN is the host fn |
| +/* Launch a possibly offloaded function with FLAGS. FN is the host fn |
| address. MAPNUM, HOSTADDRS, SIZES & KINDS describe the memory |
| blocks to be copied to/from the device. Varadic arguments are |
| keyed optional parameters terminated with a zero. */ |
| |
| void |
| -GOACC_parallel_keyed (int device, void (*fn) (void *), |
| +GOACC_parallel_keyed (int flags_m, void (*fn) (void *), |
| size_t mapnum, void **hostaddrs, size_t *sizes, |
| unsigned short *kinds, ...) |
| { |
| - bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; |
| + int flags = GOACC_FLAGS_UNMARSHAL (flags_m); |
| + |
| va_list ap; |
| struct goacc_thread *thr; |
| struct gomp_device_descr *acc_dev; |
| @@ -88,9 +154,11 @@ GOACC_parallel_keyed (int device, void ( |
| thr = goacc_thread (); |
| acc_dev = thr->dev; |
| |
| + handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds); |
| + |
| /* Host fallback if "if" clause is false or if the current device is set to |
| the host. */ |
| - if (host_fallback) |
| + if (flags & GOACC_FLAG_HOST_FALLBACK) |
| { |
| goacc_save_and_set_bind (acc_device_host); |
| fn (hostaddrs); |
| @@ -140,9 +208,7 @@ GOACC_parallel_keyed (int device, void ( |
| case GOMP_LAUNCH_WAIT: |
| { |
| unsigned num_waits = GOMP_LAUNCH_OP (tag); |
| - |
| - if (num_waits) |
| - goacc_wait (async, num_waits, &ap); |
| + goacc_wait (async, num_waits, &ap); |
| break; |
| } |
| |
| @@ -177,16 +243,36 @@ GOACC_parallel_keyed (int device, void ( |
| devaddrs = gomp_alloca (sizeof (void *) * mapnum); |
| for (i = 0; i < mapnum; i++) |
| devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start |
| - + tgt->list[i].key->tgt_offset); |
| + + tgt->list[i].key->tgt_offset |
| + + tgt->list[i].offset); |
| |
| acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, |
| async, dims, tgt); |
| |
| /* If running synchronously, unmap immediately. */ |
| - if (async < acc_async_noval) |
| + bool copyfrom = true; |
| + if (async_synchronous_p (async)) |
| gomp_unmap_vars (tgt, true); |
| else |
| - tgt->device_descr->openacc.register_async_cleanup_func (tgt, async); |
| + { |
| + bool async_unmap = false; |
| + for (size_t i = 0; i < tgt->list_count; i++) |
| + { |
| + splay_tree_key k = tgt->list[i].key; |
| + if (k && k->refcount == 1) |
| + { |
| + async_unmap = true; |
| + break; |
| + } |
| + } |
| + if (async_unmap) |
| + tgt->device_descr->openacc.register_async_cleanup_func (tgt, async); |
| + else |
| + { |
| + copyfrom = false; |
| + gomp_unmap_vars (tgt, copyfrom); |
| + } |
| + } |
| |
| acc_dev->openacc.async_set_async_func (acc_async_sync); |
| } |
| @@ -194,7 +280,7 @@ GOACC_parallel_keyed (int device, void ( |
| /* Legacy entry point, only provide host execution. */ |
| |
| void |
| -GOACC_parallel (int device, void (*fn) (void *), |
| +GOACC_parallel (int flags_m, void (*fn) (void *), |
| size_t mapnum, void **hostaddrs, size_t *sizes, |
| unsigned short *kinds, |
| int num_gangs, int num_workers, int vector_length, |
| @@ -206,10 +292,11 @@ GOACC_parallel (int device, void (*fn) ( |
| } |
| |
| void |
| -GOACC_data_start (int device, size_t mapnum, |
| +GOACC_data_start (int flags_m, size_t mapnum, |
| void **hostaddrs, size_t *sizes, unsigned short *kinds) |
| { |
| - bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; |
| + int flags = GOACC_FLAGS_UNMARSHAL (flags_m); |
| + |
| struct target_mem_desc *tgt; |
| |
| #ifdef HAVE_INTTYPES_H |
| @@ -227,7 +314,7 @@ GOACC_data_start (int device, size_t map |
| |
| /* Host fallback or 'do nothing'. */ |
| if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) |
| - || host_fallback) |
| + || (flags & GOACC_FLAG_HOST_FALLBACK)) |
| { |
| tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true, |
| GOMP_MAP_VARS_OPENACC); |
| @@ -258,13 +345,14 @@ GOACC_data_end (void) |
| } |
| |
| void |
| -GOACC_enter_exit_data (int device, size_t mapnum, |
| +GOACC_enter_exit_data (int flags_m, size_t mapnum, |
| void **hostaddrs, size_t *sizes, unsigned short *kinds, |
| int async, int num_waits, ...) |
| { |
| + int flags = GOACC_FLAGS_UNMARSHAL (flags_m); |
| + |
| struct goacc_thread *thr; |
| struct gomp_device_descr *acc_dev; |
| - bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; |
| bool data_enter = false; |
| size_t i; |
| |
| @@ -274,7 +362,7 @@ GOACC_enter_exit_data (int device, size_ |
| acc_dev = thr->dev; |
| |
| if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) |
| - || host_fallback) |
| + || (flags & GOACC_FLAG_HOST_FALLBACK)) |
| return; |
| |
| if (num_waits) |
| @@ -286,6 +374,17 @@ GOACC_enter_exit_data (int device, size_ |
| va_end (ap); |
| } |
| |
| + /* Determine whether "finalize" semantics apply to all mappings of this |
| + OpenACC directive. */ |
| + bool finalize = false; |
| + if (mapnum > 0) |
| + { |
| + unsigned char kind = kinds[0] & 0xff; |
| + if (kind == GOMP_MAP_DELETE |
| + || kind == GOMP_MAP_FORCE_FROM) |
| + finalize = true; |
| + } |
| + |
| acc_dev->openacc.async_set_async_func (async); |
| |
| /* Determine if this is an "acc enter data". */ |
| @@ -298,13 +397,17 @@ GOACC_enter_exit_data (int device, size_ |
| |
| if (kind == GOMP_MAP_FORCE_ALLOC |
| || kind == GOMP_MAP_FORCE_PRESENT |
| - || kind == GOMP_MAP_FORCE_TO) |
| + || kind == GOMP_MAP_FORCE_TO |
| + || kind == GOMP_MAP_TO |
| + || kind == GOMP_MAP_ALLOC) |
| { |
| data_enter = true; |
| break; |
| } |
| |
| - if (kind == GOMP_MAP_DELETE |
| + if (kind == GOMP_MAP_RELEASE |
| + || kind == GOMP_MAP_DELETE |
| + || kind == GOMP_MAP_FROM |
| || kind == GOMP_MAP_FORCE_FROM) |
| break; |
| |
| @@ -312,31 +415,35 @@ GOACC_enter_exit_data (int device, size_ |
| kind); |
| } |
| |
| + /* In C, non-pointers and arrays are represented by a single data clause. |
| + Dynamically allocated arrays and subarrays are represented by a data |
| + clause followed by an internal GOMP_MAP_POINTER. |
| + |
| + In Fortran, scalars and non-allocated arrays are represented by a |
| + single data clause. Allocated arrays and subarrays have three mappings: |
| + 1) the original data clause, 2) a PSET, and 3) a pointer to the array data. |
| + */ |
| + |
| if (data_enter) |
| { |
| for (i = 0; i < mapnum; i++) |
| { |
| unsigned char kind = kinds[i] & 0xff; |
| |
| - /* Scan for PSETs. */ |
| - int psets = find_pset (i, mapnum, kinds); |
| + /* Scan for pointers and PSETs. */ |
| + int pointer = find_pointer (i, mapnum, kinds); |
| |
| - if (!psets) |
| + if (!pointer) |
| { |
| switch (kind) |
| { |
| - case GOMP_MAP_POINTER: |
| - gomp_acc_insert_pointer (1, &hostaddrs[i], &sizes[i], |
| - &kinds[i]); |
| - break; |
| + case GOMP_MAP_ALLOC: |
| case GOMP_MAP_FORCE_ALLOC: |
| acc_create (hostaddrs[i], sizes[i]); |
| break; |
| - case GOMP_MAP_FORCE_PRESENT: |
| - acc_present_or_copyin (hostaddrs[i], sizes[i]); |
| - break; |
| + case GOMP_MAP_TO: |
| case GOMP_MAP_FORCE_TO: |
| - acc_present_or_copyin (hostaddrs[i], sizes[i]); |
| + acc_copyin (hostaddrs[i], sizes[i]); |
| break; |
| default: |
| gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", |
| @@ -346,12 +453,13 @@ GOACC_enter_exit_data (int device, size_ |
| } |
| else |
| { |
| - gomp_acc_insert_pointer (3, &hostaddrs[i], &sizes[i], &kinds[i]); |
| + gomp_acc_insert_pointer (pointer, &hostaddrs[i], |
| + &sizes[i], &kinds[i]); |
| /* Increment 'i' by two because OpenACC requires fortran |
| arrays to be contiguous, so each PSET is associated with |
| one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and |
| one MAP_POINTER. */ |
| - i += 2; |
| + i += pointer - 1; |
| } |
| } |
| } |
| @@ -360,22 +468,28 @@ GOACC_enter_exit_data (int device, size_ |
| { |
| unsigned char kind = kinds[i] & 0xff; |
| |
| - int psets = find_pset (i, mapnum, kinds); |
| + int pointer = find_pointer (i, mapnum, kinds); |
| |
| - if (!psets) |
| + if (!pointer) |
| { |
| switch (kind) |
| { |
| - case GOMP_MAP_POINTER: |
| - gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff) |
| - == GOMP_MAP_FORCE_FROM, |
| - async, 1); |
| - break; |
| + case GOMP_MAP_RELEASE: |
| case GOMP_MAP_DELETE: |
| - acc_delete (hostaddrs[i], sizes[i]); |
| + if (acc_is_present (hostaddrs[i], sizes[i])) |
| + { |
| + if (finalize) |
| + acc_delete_finalize (hostaddrs[i], sizes[i]); |
| + else |
| + acc_delete (hostaddrs[i], sizes[i]); |
| + } |
| break; |
| + case GOMP_MAP_FROM: |
| case GOMP_MAP_FORCE_FROM: |
| - acc_copyout (hostaddrs[i], sizes[i]); |
| + if (finalize) |
| + acc_copyout_finalize (hostaddrs[i], sizes[i]); |
| + else |
| + acc_copyout (hostaddrs[i], sizes[i]); |
| break; |
| default: |
| gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", |
| @@ -385,10 +499,12 @@ GOACC_enter_exit_data (int device, size_ |
| } |
| else |
| { |
| - gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff) |
| - == GOMP_MAP_FORCE_FROM, async, 3); |
| + bool copyfrom = (kind == GOMP_MAP_FORCE_FROM |
| + || kind == GOMP_MAP_FROM); |
| + gomp_acc_remove_pointer (hostaddrs[i], sizes[i], copyfrom, async, |
| + finalize, pointer); |
| /* See the above comment. */ |
| - i += 2; |
| + i += pointer - 1; |
| } |
| } |
| |
| @@ -398,13 +514,20 @@ GOACC_enter_exit_data (int device, size_ |
| static void |
| goacc_wait (int async, int num_waits, va_list *ap) |
| { |
| - struct goacc_thread *thr = goacc_thread (); |
| - struct gomp_device_descr *acc_dev = thr->dev; |
| - |
| while (num_waits--) |
| { |
| int qid = va_arg (*ap, int); |
| - |
| + |
| + /* Waiting on ACC_ASYNC_NOVAL maps to 'wait all'. */ |
| + if (qid == acc_async_noval) |
| + { |
| + if (async == acc_async_sync) |
| + acc_wait_all (); |
| + else |
| + acc_wait_all_async (async); |
| + break; |
| + } |
| + |
| if (acc_async_test (qid)) |
| continue; |
| |
| @@ -415,16 +538,17 @@ goacc_wait (int async, int num_waits, va |
| launching on, the queue itself will order work as |
| required, so there's no need to wait explicitly. */ |
| else |
| - acc_dev->openacc.async_wait_async_func (qid, async); |
| + acc_wait_async (qid, async); |
| } |
| } |
| |
| void |
| -GOACC_update (int device, size_t mapnum, |
| +GOACC_update (int flags_m, size_t mapnum, |
| void **hostaddrs, size_t *sizes, unsigned short *kinds, |
| int async, int num_waits, ...) |
| { |
| - bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; |
| + int flags = GOACC_FLAGS_UNMARSHAL (flags_m); |
| + |
| size_t i; |
| |
| goacc_lazy_initialize (); |
| @@ -433,7 +557,7 @@ GOACC_update (int device, size_t mapnum, |
| struct gomp_device_descr *acc_dev = thr->dev; |
| |
| if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) |
| - || host_fallback) |
| + || (flags & GOACC_FLAG_HOST_FALLBACK)) |
| return; |
| |
| if (num_waits) |
| @@ -447,6 +571,7 @@ GOACC_update (int device, size_t mapnum, |
| |
| acc_dev->openacc.async_set_async_func (async); |
| |
| + bool update_device = false; |
| for (i = 0; i < mapnum; ++i) |
| { |
| unsigned char kind = kinds[i] & 0xff; |
| @@ -457,11 +582,46 @@ GOACC_update (int device, size_t mapnum, |
| case GOMP_MAP_TO_PSET: |
| break; |
| |
| + case GOMP_MAP_ALWAYS_POINTER: |
| + if (update_device) |
| + { |
| + /* Save the contents of the host pointer. */ |
| + void *dptr = acc_deviceptr (hostaddrs[i-1]); |
| + uintptr_t t = *(uintptr_t *) hostaddrs[i]; |
| + |
| + /* Update the contents of the host pointer to reflect |
| + the value of the allocated device memory in the |
| + previous pointer. */ |
| + *(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr; |
| + acc_update_device (hostaddrs[i], sizeof (uintptr_t)); |
| + |
| + /* Restore the host pointer. */ |
| + *(uintptr_t *) hostaddrs[i] = t; |
| + update_device = false; |
| + } |
| + break; |
| + |
| + case GOMP_MAP_TO: |
| + if (!acc_is_present (hostaddrs[i], sizes[i])) |
| + { |
| + update_device = false; |
| + break; |
| + } |
| + /* Fallthru */ |
| case GOMP_MAP_FORCE_TO: |
| + update_device = true; |
| acc_update_device (hostaddrs[i], sizes[i]); |
| break; |
| |
| + case GOMP_MAP_FROM: |
| + if (!acc_is_present (hostaddrs[i], sizes[i])) |
| + { |
| + update_device = false; |
| + break; |
| + } |
| + /* Fallthru */ |
| case GOMP_MAP_FORCE_FROM: |
| + update_device = false; |
| acc_update_self (hostaddrs[i], sizes[i]); |
| break; |
| |
| @@ -487,8 +647,8 @@ GOACC_wait (int async, int num_waits, .. |
| } |
| else if (async == acc_async_sync) |
| acc_wait_all (); |
| - else if (async == acc_async_noval) |
| - goacc_thread ()->dev->openacc.async_wait_all_async_func (acc_async_noval); |
| + else |
| + acc_wait_all_async (async); |
| } |
| |
| int |
| @@ -504,7 +664,7 @@ GOACC_get_thread_num (void) |
| } |
| |
| void |
| -GOACC_declare (int device, size_t mapnum, |
| +GOACC_declare (int flags_m, size_t mapnum, |
| void **hostaddrs, size_t *sizes, unsigned short *kinds) |
| { |
| int i; |
| @@ -522,9 +682,10 @@ GOACC_declare (int device, size_t mapnum |
| case GOMP_MAP_FORCE_FROM: |
| case GOMP_MAP_FORCE_TO: |
| case GOMP_MAP_POINTER: |
| + case GOMP_MAP_RELEASE: |
| case GOMP_MAP_DELETE: |
| - GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], |
| - &kinds[i], 0, 0); |
| + GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], |
| + &kinds[i], GOMP_ASYNC_SYNC, 0); |
| break; |
| |
| case GOMP_MAP_FORCE_DEVICEPTR: |
| @@ -532,20 +693,19 @@ GOACC_declare (int device, size_t mapnum |
| |
| case GOMP_MAP_ALLOC: |
| if (!acc_is_present (hostaddrs[i], sizes[i])) |
| - GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], |
| - &kinds[i], 0, 0); |
| + GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], |
| + &kinds[i], GOMP_ASYNC_SYNC, 0); |
| break; |
| |
| case GOMP_MAP_TO: |
| - GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], |
| - &kinds[i], 0, 0); |
| + GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], |
| + &kinds[i], GOMP_ASYNC_SYNC, 0); |
| |
| break; |
| |
| case GOMP_MAP_FROM: |
| - kinds[i] = GOMP_MAP_FORCE_FROM; |
| - GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], |
| - &kinds[i], 0, 0); |
| + GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], |
| + &kinds[i], GOMP_ASYNC_SYNC, 0); |
| break; |
| |
| case GOMP_MAP_FORCE_PRESENT: |
| |
| |
| @@ -0,0 +1,1502 @@ |
| +! OpenACC Runtime Library Definitions. |
| + |
| +! Copyright (C) 2014-2019 Free Software Foundation, Inc. |
| + |
| +! Contributed by Tobias Burnus <burnus@net-b.de> |
| +! and Mentor Embedded. |
| + |
| +! This file is part of the GNU Offloading and Multi Processing Library |
| +! (libgomp). |
| + |
| +! Libgomp is free software; you can redistribute it and/or modify it |
| +! under the terms of the GNU General Public License as published by |
| +! the Free Software Foundation; either version 3, or (at your option) |
| +! any later version. |
| + |
| +! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY |
| +! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| +! FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
| +! more details. |
| + |
| +! Under Section 7 of GPL version 3, you are granted additional |
| +! permissions described in the GCC Runtime Library Exception, version |
| +! 3.1, as published by the Free Software Foundation. |
| + |
| +! You should have received a copy of the GNU General Public License and |
| +! a copy of the GCC Runtime Library Exception along with this program; |
| +! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| +! <http://www.gnu.org/licenses/>. |
| + |
| +module openacc_kinds2 |
| + use iso_fortran_env, only: int32 |
| + implicit none |
| + |
| + private :: int32 |
| + public :: acc_device_kind |
| + |
| + integer, parameter :: acc_device_kind = int32 |
| + |
| + public :: acc_device_none, acc_device_default, acc_device_host |
| + public :: acc_device_not_host, acc_device_nvidia |
| + |
| + ! Keep in sync with include/gomp-constants.h. |
| + integer (acc_device_kind), parameter :: acc_device_none = 0 |
| + integer (acc_device_kind), parameter :: acc_device_default = 1 |
| + integer (acc_device_kind), parameter :: acc_device_host = 2 |
| + ! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed. |
| + integer (acc_device_kind), parameter :: acc_device_not_host = 4 |
| + integer (acc_device_kind), parameter :: acc_device_nvidia = 5 |
| + |
| + public :: acc_handle_kind |
| + |
| + integer, parameter :: acc_handle_kind = int32 |
| + |
| + public :: acc_async_noval, acc_async_sync |
| + |
| + ! Keep in sync with include/gomp-constants.h. |
| + integer (acc_handle_kind), parameter :: acc_async_noval = -1 |
| + integer (acc_handle_kind), parameter :: acc_async_sync = -2 |
| + |
| +end module |
| + |
| +module openacc_internal2 |
| + use openacc_kinds2 |
| + implicit none |
| + |
| + interface |
| + function acc_get_num_devices_h (d) |
| + import |
| + integer acc_get_num_devices_h |
| + integer (acc_device_kind) d |
| + end function |
| + |
| + subroutine acc_set_device_type_h (d) |
| + import |
| + integer (acc_device_kind) d |
| + end subroutine |
| + |
| + function acc_get_device_type_h () |
| + import |
| + integer (acc_device_kind) acc_get_device_type_h |
| + end function |
| + |
| + subroutine acc_set_device_num_h (n, d) |
| + import |
| + integer n |
| + integer (acc_device_kind) d |
| + end subroutine |
| + |
| + function acc_get_device_num_h (d) |
| + import |
| + integer acc_get_device_num_h |
| + integer (acc_device_kind) d |
| + end function |
| + |
| + function acc_async_test_h (a) |
| + logical acc_async_test_h |
| + integer a |
| + end function |
| + |
| + function acc_async_test_all_h () |
| + logical acc_async_test_all_h |
| + end function |
| + |
| + subroutine acc_wait_h (a) |
| + integer a |
| + end subroutine |
| + |
| + subroutine acc_wait_async_h (a1, a2) |
| + integer a1, a2 |
| + end subroutine |
| + |
| + subroutine acc_wait_all_h () |
| + end subroutine |
| + |
| + subroutine acc_wait_all_async_h (a) |
| + integer a |
| + end subroutine |
| + |
| + subroutine acc_init_h (d) |
| + import |
| + integer (acc_device_kind) d |
| + end subroutine |
| + |
| + subroutine acc_shutdown_h (d) |
| + import |
| + integer (acc_device_kind) d |
| + end subroutine |
| + |
| + function acc_on_device_h (d) |
| + import |
| + integer (acc_device_kind) d |
| + logical acc_on_device_h |
| + end function |
| + |
| + subroutine acc_copyin_32_h (a, len) |
| + use iso_c_binding, only: c_int32_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + end subroutine |
| + |
| + subroutine acc_copyin_64_h (a, len) |
| + use iso_c_binding, only: c_int64_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + end subroutine |
| + |
| + subroutine acc_copyin_array_h (a) |
| + type (*), dimension (..), contiguous :: a |
| + end subroutine |
| + |
| + subroutine acc_present_or_copyin_32_h (a, len) |
| + use iso_c_binding, only: c_int32_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + end subroutine |
| + |
| + subroutine acc_present_or_copyin_64_h (a, len) |
| + use iso_c_binding, only: c_int64_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + end subroutine |
| + |
| + subroutine acc_present_or_copyin_array_h (a) |
| + type (*), dimension (..), contiguous :: a |
| + end subroutine |
| + |
| + subroutine acc_create_32_h (a, len) |
| + use iso_c_binding, only: c_int32_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + end subroutine |
| + |
| + subroutine acc_create_64_h (a, len) |
| + use iso_c_binding, only: c_int64_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + end subroutine |
| + |
| + subroutine acc_create_array_h (a) |
| + type (*), dimension (..), contiguous :: a |
| + end subroutine |
| + |
| + subroutine acc_present_or_create_32_h (a, len) |
| + use iso_c_binding, only: c_int32_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + end subroutine |
| + |
| + subroutine acc_present_or_create_64_h (a, len) |
| + use iso_c_binding, only: c_int64_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + end subroutine |
| + |
| + subroutine acc_present_or_create_array_h (a) |
| + type (*), dimension (..), contiguous :: a |
| + end subroutine |
| + |
| + subroutine acc_copyout_32_h (a, len) |
| + use iso_c_binding, only: c_int32_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + end subroutine |
| + |
| + subroutine acc_copyout_64_h (a, len) |
| + use iso_c_binding, only: c_int64_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + end subroutine |
| + |
| + subroutine acc_copyout_array_h (a) |
| + type (*), dimension (..), contiguous :: a |
| + end subroutine |
| + |
| + subroutine acc_copyout_finalize_32_h (a, len) |
| + use iso_c_binding, only: c_int32_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + end subroutine |
| + |
| + subroutine acc_copyout_finalize_64_h (a, len) |
| + use iso_c_binding, only: c_int64_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + end subroutine |
| + |
| + subroutine acc_copyout_finalize_array_h (a) |
| + type (*), dimension (..), contiguous :: a |
| + end subroutine |
| + |
| + subroutine acc_delete_32_h (a, len) |
| + use iso_c_binding, only: c_int32_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + end subroutine |
| + |
| + subroutine acc_delete_64_h (a, len) |
| + use iso_c_binding, only: c_int64_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + end subroutine |
| + |
| + subroutine acc_delete_array_h (a) |
| + type (*), dimension (..), contiguous :: a |
| + end subroutine |
| + |
| + subroutine acc_delete_finalize_32_h (a, len) |
| + use iso_c_binding, only: c_int32_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + end subroutine |
| + |
| + subroutine acc_delete_finalize_64_h (a, len) |
| + use iso_c_binding, only: c_int64_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + end subroutine |
| + |
| + subroutine acc_delete_finalize_array_h (a) |
| + type (*), dimension (..), contiguous :: a |
| + end subroutine |
| + |
| + subroutine acc_update_device_32_h (a, len) |
| + use iso_c_binding, only: c_int32_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + end subroutine |
| + |
| + subroutine acc_update_device_64_h (a, len) |
| + use iso_c_binding, only: c_int64_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + end subroutine |
| + |
| + subroutine acc_update_device_array_h (a) |
| + type (*), dimension (..), contiguous :: a |
| + end subroutine |
| + |
| + subroutine acc_update_self_32_h (a, len) |
| + use iso_c_binding, only: c_int32_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + end subroutine |
| + |
| + subroutine acc_update_self_64_h (a, len) |
| + use iso_c_binding, only: c_int64_t |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + end subroutine |
| + |
| + subroutine acc_update_self_array_h (a) |
| + type (*), dimension (..), contiguous :: a |
| + end subroutine |
| + |
| + function acc_is_present_32_h (a, len) |
| + use iso_c_binding, only: c_int32_t |
| + logical acc_is_present_32_h |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + end function |
| + |
| + function acc_is_present_64_h (a, len) |
| + use iso_c_binding, only: c_int64_t |
| + logical acc_is_present_64_h |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + end function |
| + |
| + function acc_is_present_array_h (a) |
| + logical acc_is_present_array_h |
| + type (*), dimension (..), contiguous :: a |
| + end function |
| + |
| + subroutine acc_copyin_async_32_h (a, len, async) |
| + use iso_c_binding, only: c_int32_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_copyin_async_64_h (a, len, async) |
| + use iso_c_binding, only: c_int64_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_copyin_async_array_h (a, async) |
| + use openacc_kinds2, only: acc_handle_kind |
| + type (*), dimension (..), contiguous :: a |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_create_async_32_h (a, len, async) |
| + use iso_c_binding, only: c_int32_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_create_async_64_h (a, len, async) |
| + use iso_c_binding, only: c_int64_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_create_async_array_h (a, async) |
| + use openacc_kinds2, only: acc_handle_kind |
| + type (*), dimension (..), contiguous :: a |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_copyout_async_32_h (a, len, async) |
| + use iso_c_binding, only: c_int32_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_copyout_async_64_h (a, len, async) |
| + use iso_c_binding, only: c_int64_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_copyout_async_array_h (a, async) |
| + use openacc_kinds2, only: acc_handle_kind |
| + type (*), dimension (..), contiguous :: a |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_delete_async_32_h (a, len, async) |
| + use iso_c_binding, only: c_int32_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_delete_async_64_h (a, len, async) |
| + use iso_c_binding, only: c_int64_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_delete_async_array_h (a, async) |
| + use openacc_kinds2, only: acc_handle_kind |
| + type (*), dimension (..), contiguous :: a |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_update_device_async_32_h (a, len, async) |
| + use iso_c_binding, only: c_int32_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_update_device_async_64_h (a, len, async) |
| + use iso_c_binding, only: c_int64_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_update_device_async_array_h (a, async) |
| + use openacc_kinds2, only: acc_handle_kind |
| + type (*), dimension (..), contiguous :: a |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_update_self_async_32_h (a, len, async) |
| + use iso_c_binding, only: c_int32_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_update_self_async_64_h (a, len, async) |
| + use iso_c_binding, only: c_int64_t |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + |
| + subroutine acc_update_self_async_array_h (a, async) |
| + use openacc_kinds2, only: acc_handle_kind |
| + type (*), dimension (..), contiguous :: a |
| + integer (acc_handle_kind) async |
| + end subroutine |
| + end interface |
| + |
| + ! Explicit interfaces for the bind(C) entry points (the C symbol is given |
| + ! by each "name =") that the "_h" wrapper procedures call.  Scalar |
| + ! arguments are passed by value. |
| + interface |
| +   function acc_get_num_devices_l (d) & |
| +       bind (C, name = "acc_get_num_devices") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int) :: acc_get_num_devices_l |
| +     integer (c_int), value :: d |
| +   end function |
| + |
| +   subroutine acc_set_device_type_l (d) & |
| +       bind (C, name = "acc_set_device_type") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int), value :: d |
| +   end subroutine |
| + |
| +   function acc_get_device_type_l () & |
| +       bind (C, name = "acc_get_device_type") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int) :: acc_get_device_type_l |
| +   end function |
| + |
| +   subroutine acc_set_device_num_l (n, d) & |
| +       bind (C, name = "acc_set_device_num") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int), value :: n, d |
| +   end subroutine |
| + |
| +   function acc_get_device_num_l (d) & |
| +       bind (C, name = "acc_get_device_num") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int) :: acc_get_device_num_l |
| +     integer (c_int), value :: d |
| +   end function |
| + |
| +   function acc_async_test_l (a) & |
| +       bind (C, name = "acc_async_test") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int) :: acc_async_test_l |
| +     integer (c_int), value :: a |
| +   end function |
| + |
| +   function acc_async_test_all_l () & |
| +       bind (C, name = "acc_async_test_all") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int) :: acc_async_test_all_l |
| +   end function |
| + |
| +   subroutine acc_wait_l (a) & |
| +       bind (C, name = "acc_wait") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int), value :: a |
| +   end subroutine |
| + |
| +   subroutine acc_wait_async_l (a1, a2) & |
| +       bind (C, name = "acc_wait_async") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int), value :: a1, a2 |
| +   end subroutine |
| + |
| +   ! Takes no arguments, so nothing from iso_c_binding is needed here. |
| +   subroutine acc_wait_all_l () & |
| +       bind (C, name = "acc_wait_all") |
| +   end subroutine |
| + |
| +   subroutine acc_wait_all_async_l (a) & |
| +       bind (C, name = "acc_wait_all_async") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int), value :: a |
| +   end subroutine |
| + |
| +   subroutine acc_init_l (d) & |
| +       bind (C, name = "acc_init") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int), value :: d |
| +   end subroutine |
| + |
| +   subroutine acc_shutdown_l (d) & |
| +       bind (C, name = "acc_shutdown") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int), value :: d |
| +   end subroutine |
| + |
| +   function acc_on_device_l (d) & |
| +       bind (C, name = "acc_on_device") |
| +     use iso_c_binding, only: c_int |
| +     integer (c_int) :: acc_on_device_l |
| +     integer (c_int), value :: d |
| +   end function |
| + |
| +   subroutine acc_copyin_l (a, len) & |
| +       bind (C, name = "acc_copyin") |
| +     use iso_c_binding, only: c_size_t |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +   end subroutine |
| + |
| +   subroutine acc_present_or_copyin_l (a, len) & |
| +       bind (C, name = "acc_present_or_copyin") |
| +     use iso_c_binding, only: c_size_t |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +   end subroutine |
| + |
| +   subroutine acc_create_l (a, len) & |
| +       bind (C, name = "acc_create") |
| +     use iso_c_binding, only: c_size_t |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +   end subroutine |
| + |
| +   subroutine acc_present_or_create_l (a, len) & |
| +       bind (C, name = "acc_present_or_create") |
| +     use iso_c_binding, only: c_size_t |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +   end subroutine |
| + |
| +   subroutine acc_copyout_l (a, len) & |
| +       bind (C, name = "acc_copyout") |
| +     use iso_c_binding, only: c_size_t |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +   end subroutine |
| + |
| +   subroutine acc_copyout_finalize_l (a, len) & |
| +       bind (C, name = "acc_copyout_finalize") |
| +     use iso_c_binding, only: c_size_t |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +   end subroutine |
| + |
| +   subroutine acc_delete_l (a, len) & |
| +       bind (C, name = "acc_delete") |
| +     use iso_c_binding, only: c_size_t |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +   end subroutine |
| + |
| +   subroutine acc_delete_finalize_l (a, len) & |
| +       bind (C, name = "acc_delete_finalize") |
| +     use iso_c_binding, only: c_size_t |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +   end subroutine |
| + |
| +   subroutine acc_update_device_l (a, len) & |
| +       bind (C, name = "acc_update_device") |
| +     use iso_c_binding, only: c_size_t |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +   end subroutine |
| + |
| +   subroutine acc_update_self_l (a, len) & |
| +       bind (C, name = "acc_update_self") |
| +     use iso_c_binding, only: c_size_t |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +   end subroutine |
| + |
| +   function acc_is_present_l (a, len) & |
| +       bind (C, name = "acc_is_present") |
| +     use iso_c_binding, only: c_int, c_size_t |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     ! Result kind is c_int, matching the other int-returning bindings here. |
| +     integer (c_int) :: acc_is_present_l |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +   end function |
| + |
| +   subroutine acc_copyin_async_l (a, len, async) & |
| +       bind (C, name = "acc_copyin_async") |
| +     use iso_c_binding, only: c_size_t, c_int |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +     integer (c_int), value :: async |
| +   end subroutine |
| + |
| +   subroutine acc_create_async_l (a, len, async) & |
| +       bind (C, name = "acc_create_async") |
| +     use iso_c_binding, only: c_size_t, c_int |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +     integer (c_int), value :: async |
| +   end subroutine |
| + |
| +   subroutine acc_copyout_async_l (a, len, async) & |
| +       bind (C, name = "acc_copyout_async") |
| +     use iso_c_binding, only: c_size_t, c_int |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +     integer (c_int), value :: async |
| +   end subroutine |
| + |
| +   subroutine acc_delete_async_l (a, len, async) & |
| +       bind (C, name = "acc_delete_async") |
| +     use iso_c_binding, only: c_size_t, c_int |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +     integer (c_int), value :: async |
| +   end subroutine |
| + |
| +   subroutine acc_update_device_async_l (a, len, async) & |
| +       bind (C, name = "acc_update_device_async") |
| +     use iso_c_binding, only: c_size_t, c_int |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +     integer (c_int), value :: async |
| +   end subroutine |
| + |
| +   subroutine acc_update_self_async_l (a, len, async) & |
| +       bind (C, name = "acc_update_self_async") |
| +     use iso_c_binding, only: c_size_t, c_int |
| +     !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +     type (*), dimension (*) :: a |
| +     integer (c_size_t), value :: len |
| +     integer (c_int), value :: async |
| +   end subroutine |
| + end interface |
| +end module |
| + |
| +! User-facing module: maps the generic OpenACC runtime names onto the |
| +! "_h" specific procedures made visible by the modules used below. |
| +module openacc2 |
| +  use openacc_kinds2 |
| +  use openacc_internal2 |
| +  implicit none |
| + |
| +  public :: openacc_version |
| + |
| +  public :: acc_get_num_devices, acc_set_device_type, acc_get_device_type |
| +  public :: acc_set_device_num, acc_get_device_num, acc_async_test |
| +  public :: acc_async_test_all |
| +  public :: acc_wait, acc_async_wait, acc_wait_async |
| +  public :: acc_wait_all, acc_async_wait_all, acc_wait_all_async |
| +  public :: acc_init, acc_shutdown, acc_on_device |
| +  public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create |
| +  public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete |
| +  ! The *_finalize generics are defined below, so list them as well. |
| +  public :: acc_copyout_finalize, acc_delete_finalize |
| +  public :: acc_update_device, acc_update_self, acc_is_present |
| +  public :: acc_copyin_async, acc_create_async, acc_copyout_async |
| +  public :: acc_delete_async, acc_update_device_async, acc_update_self_async |
| + |
| +  integer, parameter :: openacc_version = 201306 |
| + |
| +  interface acc_get_num_devices |
| +    procedure :: acc_get_num_devices_h |
| +  end interface |
| + |
| +  interface acc_set_device_type |
| +    procedure :: acc_set_device_type_h |
| +  end interface |
| + |
| +  interface acc_get_device_type |
| +    procedure :: acc_get_device_type_h |
| +  end interface |
| + |
| +  interface acc_set_device_num |
| +    procedure :: acc_set_device_num_h |
| +  end interface |
| + |
| +  interface acc_get_device_num |
| +    procedure :: acc_get_device_num_h |
| +  end interface |
| + |
| +  interface acc_async_test |
| +    procedure :: acc_async_test_h |
| +  end interface |
| + |
| +  interface acc_async_test_all |
| +    procedure :: acc_async_test_all_h |
| +  end interface |
| + |
| +  interface acc_wait |
| +    procedure :: acc_wait_h |
| +  end interface |
| + |
| +  ! acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. |
| +  interface acc_async_wait |
| +    procedure :: acc_wait_h |
| +  end interface |
| + |
| +  interface acc_wait_async |
| +    procedure :: acc_wait_async_h |
| +  end interface |
| + |
| +  interface acc_wait_all |
| +    procedure :: acc_wait_all_h |
| +  end interface |
| + |
| +  ! acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all. |
| +  interface acc_async_wait_all |
| +    procedure :: acc_wait_all_h |
| +  end interface |
| + |
| +  interface acc_wait_all_async |
| +    procedure :: acc_wait_all_async_h |
| +  end interface |
| + |
| +  interface acc_init |
| +    procedure :: acc_init_h |
| +  end interface |
| + |
| +  interface acc_shutdown |
| +    procedure :: acc_shutdown_h |
| +  end interface |
| + |
| +  interface acc_on_device |
| +    procedure :: acc_on_device_h |
| +  end interface |
| + |
| +  ! acc_malloc: Only available in C/C++ |
| +  ! acc_free: Only available in C/C++ |
| + |
| +  ! As a vendor extension, the following code supports both 32-bit and 64-bit |
| +  ! arguments for "size"; the OpenACC standard only permits default-kind |
| +  ! integers, which are of kind 4 (i.e. 32 bits). |
| +  ! Additionally, the two-argument version also takes arrays as argument, |
| +  ! and the one-argument version also accepts scalars.  Note that the code |
| +  ! assumes that the arrays are contiguous. |
| + |
| +  interface acc_copyin |
| +    procedure :: acc_copyin_32_h |
| +    procedure :: acc_copyin_64_h |
| +    procedure :: acc_copyin_array_h |
| +  end interface |
| + |
| +  interface acc_present_or_copyin |
| +    procedure :: acc_present_or_copyin_32_h |
| +    procedure :: acc_present_or_copyin_64_h |
| +    procedure :: acc_present_or_copyin_array_h |
| +  end interface |
| + |
| +  interface acc_pcopyin |
| +    procedure :: acc_present_or_copyin_32_h |
| +    procedure :: acc_present_or_copyin_64_h |
| +    procedure :: acc_present_or_copyin_array_h |
| +  end interface |
| + |
| +  interface acc_create |
| +    procedure :: acc_create_32_h |
| +    procedure :: acc_create_64_h |
| +    procedure :: acc_create_array_h |
| +  end interface |
| + |
| +  interface acc_present_or_create |
| +    procedure :: acc_present_or_create_32_h |
| +    procedure :: acc_present_or_create_64_h |
| +    procedure :: acc_present_or_create_array_h |
| +  end interface |
| + |
| +  interface acc_pcreate |
| +    procedure :: acc_present_or_create_32_h |
| +    procedure :: acc_present_or_create_64_h |
| +    procedure :: acc_present_or_create_array_h |
| +  end interface |
| + |
| +  interface acc_copyout |
| +    procedure :: acc_copyout_32_h |
| +    procedure :: acc_copyout_64_h |
| +    procedure :: acc_copyout_array_h |
| +  end interface |
| + |
| +  interface acc_copyout_finalize |
| +    procedure :: acc_copyout_finalize_32_h |
| +    procedure :: acc_copyout_finalize_64_h |
| +    procedure :: acc_copyout_finalize_array_h |
| +  end interface |
| + |
| +  interface acc_delete |
| +    procedure :: acc_delete_32_h |
| +    procedure :: acc_delete_64_h |
| +    procedure :: acc_delete_array_h |
| +  end interface |
| + |
| +  interface acc_delete_finalize |
| +    procedure :: acc_delete_finalize_32_h |
| +    procedure :: acc_delete_finalize_64_h |
| +    procedure :: acc_delete_finalize_array_h |
| +  end interface |
| + |
| +  interface acc_update_device |
| +    procedure :: acc_update_device_32_h |
| +    procedure :: acc_update_device_64_h |
| +    procedure :: acc_update_device_array_h |
| +  end interface |
| + |
| +  interface acc_update_self |
| +    procedure :: acc_update_self_32_h |
| +    procedure :: acc_update_self_64_h |
| +    procedure :: acc_update_self_array_h |
| +  end interface |
| + |
| +  ! acc_map_data: Only available in C/C++ |
| +  ! acc_unmap_data: Only available in C/C++ |
| +  ! acc_deviceptr: Only available in C/C++ |
| +  ! acc_hostptr: Only available in C/C++ |
| + |
| +  interface acc_is_present |
| +    procedure :: acc_is_present_32_h |
| +    procedure :: acc_is_present_64_h |
| +    procedure :: acc_is_present_array_h |
| +  end interface |
| + |
| +  ! acc_memcpy_to_device: Only available in C/C++ |
| +  ! acc_memcpy_from_device: Only available in C/C++ |
| + |
| +  interface acc_copyin_async |
| +    procedure :: acc_copyin_async_32_h |
| +    procedure :: acc_copyin_async_64_h |
| +    procedure :: acc_copyin_async_array_h |
| +  end interface |
| + |
| +  interface acc_create_async |
| +    procedure :: acc_create_async_32_h |
| +    procedure :: acc_create_async_64_h |
| +    procedure :: acc_create_async_array_h |
| +  end interface |
| + |
| +  interface acc_copyout_async |
| +    procedure :: acc_copyout_async_32_h |
| +    procedure :: acc_copyout_async_64_h |
| +    procedure :: acc_copyout_async_array_h |
| +  end interface |
| + |
| +  interface acc_delete_async |
| +    procedure :: acc_delete_async_32_h |
| +    procedure :: acc_delete_async_64_h |
| +    procedure :: acc_delete_async_array_h |
| +  end interface |
| + |
| +  interface acc_update_device_async |
| +    procedure :: acc_update_device_async_32_h |
| +    procedure :: acc_update_device_async_64_h |
| +    procedure :: acc_update_device_async_array_h |
| +  end interface |
| + |
| +  interface acc_update_self_async |
| +    procedure :: acc_update_self_async_32_h |
| +    procedure :: acc_update_self_async_64_h |
| +    procedure :: acc_update_self_async_array_h |
| +  end interface |
| + |
| +end module |
| + |
| +! Fortran wrapper: returns acc_get_num_devices_l (d) as a default integer. |
| +function acc_get_num_devices_h (d) |
| +  use openacc_kinds2 |
| +  use openacc_internal2, only: acc_get_num_devices_l |
| +  integer (kind = acc_device_kind) :: d |
| +  integer :: acc_get_num_devices_h |
| +  acc_get_num_devices_h = acc_get_num_devices_l (d) |
| +end function acc_get_num_devices_h |
| + |
| +! Fortran wrapper: hands D to acc_set_device_type_l. |
| +subroutine acc_set_device_type_h (d) |
| +  use openacc_kinds2 |
| +  use openacc_internal2, only: acc_set_device_type_l |
| +  integer (kind = acc_device_kind) :: d |
| +  call acc_set_device_type_l (d) |
| +end subroutine acc_set_device_type_h |
| + |
| +! Fortran wrapper: returns the value of acc_get_device_type_l () with kind |
| +! acc_device_kind. |
| +function acc_get_device_type_h () |
| +  use openacc_kinds2 |
| +  use openacc_internal2, only: acc_get_device_type_l |
| +  integer (kind = acc_device_kind) :: acc_get_device_type_h |
| +  acc_get_device_type_h = acc_get_device_type_l () |
| +end function acc_get_device_type_h |
| + |
| +! Fortran wrapper: hands N and D, in that order, to acc_set_device_num_l. |
| +subroutine acc_set_device_num_h (n, d) |
| +  use openacc_kinds2 |
| +  use openacc_internal2, only: acc_set_device_num_l |
| +  integer (kind = acc_device_kind) :: d |
| +  integer :: n |
| +  call acc_set_device_num_l (n, d) |
| +end subroutine acc_set_device_num_h |
| + |
| +! Fortran wrapper: returns acc_get_device_num_l (d) as a default integer. |
| +function acc_get_device_num_h (d) |
| +  use openacc_kinds2 |
| +  use openacc_internal2, only: acc_get_device_num_l |
| +  integer (kind = acc_device_kind) :: d |
| +  integer :: acc_get_device_num_h |
| +  acc_get_device_num_h = acc_get_device_num_l (d) |
| +end function acc_get_device_num_h |
| + |
| +! Fortran acc_async_test: .TRUE. when acc_async_test_l (a) returns a |
| +! nonzero value, .FALSE. when it returns 0. |
| +function acc_async_test_h (a) |
| +  use openacc_internal2, only: acc_async_test_l |
| +  logical acc_async_test_h |
| +  integer a |
| +  ! Compare against zero, not 1: any nonzero C int means "true". |
| +  acc_async_test_h = acc_async_test_l (a) /= 0 |
| +end function |
| + |
| +! Fortran acc_async_test_all: .TRUE. when acc_async_test_all_l () returns a |
| +! nonzero value, .FALSE. when it returns 0. |
| +function acc_async_test_all_h () |
| +  use openacc_internal2, only: acc_async_test_all_l |
| +  logical acc_async_test_all_h |
| +  ! Compare against zero, not 1: any nonzero C int means "true". |
| +  acc_async_test_all_h = acc_async_test_all_l () /= 0 |
| +end function |
| + |
| +! Fortran wrapper: forwards A to acc_wait_l. |
| +subroutine acc_wait_h (a) |
| +  use openacc_internal2, only: acc_wait_l |
| +  integer :: a |
| +  call acc_wait_l (a) |
| +end subroutine acc_wait_h |
| + |
| +! Fortran wrapper: forwards A1 and A2, in that order, to acc_wait_async_l. |
| +subroutine acc_wait_async_h (a1, a2) |
| +  use openacc_internal2, only: acc_wait_async_l |
| +  integer :: a1 |
| +  integer :: a2 |
| +  call acc_wait_async_l (a1, a2) |
| +end subroutine acc_wait_async_h |
| + |
| +! Fortran wrapper: calls acc_wait_all_l, which takes no arguments. |
| +subroutine acc_wait_all_h () |
| +  use openacc_internal2, only: acc_wait_all_l |
| +  call acc_wait_all_l () |
| +end subroutine acc_wait_all_h |
| + |
| +! Fortran wrapper: forwards A to acc_wait_all_async_l. |
| +subroutine acc_wait_all_async_h (a) |
| +  use openacc_internal2, only: acc_wait_all_async_l |
| +  integer :: a |
| +  call acc_wait_all_async_l (a) |
| +end subroutine acc_wait_all_async_h |
| + |
| +! Fortran wrapper: hands D to acc_init_l. |
| +subroutine acc_init_h (d) |
| +  use openacc_kinds2 |
| +  use openacc_internal2, only: acc_init_l |
| +  integer (kind = acc_device_kind) :: d |
| +  call acc_init_l (d) |
| +end subroutine acc_init_h |
| + |
| +! Fortran wrapper: hands D to acc_shutdown_l. |
| +subroutine acc_shutdown_h (d) |
| +  use openacc_kinds2 |
| +  use openacc_internal2, only: acc_shutdown_l |
| +  integer (kind = acc_device_kind) :: d |
| +  call acc_shutdown_l (d) |
| +end subroutine acc_shutdown_h |
| + |
| +! Fortran acc_on_device: .TRUE. when acc_on_device_l (d) returns a nonzero |
| +! value, .FALSE. when it returns 0. |
| +function acc_on_device_h (d) |
| +  use openacc_internal2, only: acc_on_device_l |
| +  use openacc_kinds2 |
| +  integer (acc_device_kind) d |
| +  logical acc_on_device_h |
| +  ! Compare against zero, not 1: any nonzero C int means "true". |
| +  acc_on_device_h = acc_on_device_l (d) /= 0 |
| +end function |
| + |
| +! acc_copyin with a c_int32_t LEN: converts LEN to c_size_t for acc_copyin_l. |
| +subroutine acc_copyin_32_h (a, len) |
| +  use openacc_internal2, only: acc_copyin_l |
| +  use iso_c_binding, only: c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  call acc_copyin_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_copyin_32_h |
| + |
| +! acc_copyin with a c_int64_t LEN: converts LEN to c_size_t for acc_copyin_l. |
| +subroutine acc_copyin_64_h (a, len) |
| +  use openacc_internal2, only: acc_copyin_l |
| +  use iso_c_binding, only: c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  call acc_copyin_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_copyin_64_h |
| + |
| +! acc_copyin for a contiguous object A: the length passed on is sizeof (a). |
| +subroutine acc_copyin_array_h (a) |
| +  use openacc_internal2, only: acc_copyin_l |
| +  type (*), contiguous, dimension (..) :: a |
| +  call acc_copyin_l (a, sizeof (a)) |
| +end subroutine acc_copyin_array_h |
| + |
| +! acc_present_or_copyin with a c_int32_t LEN: converts LEN to c_size_t for |
| +! acc_present_or_copyin_l. |
| +subroutine acc_present_or_copyin_32_h (a, len) |
| +  use openacc_internal2, only: acc_present_or_copyin_l |
| +  use iso_c_binding, only: c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  call acc_present_or_copyin_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_present_or_copyin_32_h |
| + |
| +! acc_present_or_copyin with a c_int64_t LEN: converts LEN to c_size_t for |
| +! acc_present_or_copyin_l. |
| +subroutine acc_present_or_copyin_64_h (a, len) |
| +  use openacc_internal2, only: acc_present_or_copyin_l |
| +  use iso_c_binding, only: c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  call acc_present_or_copyin_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_present_or_copyin_64_h |
| + |
| +! acc_present_or_copyin for a contiguous object A: the length passed on is |
| +! sizeof (a). |
| +subroutine acc_present_or_copyin_array_h (a) |
| +  use openacc_internal2, only: acc_present_or_copyin_l |
| +  type (*), contiguous, dimension (..) :: a |
| +  call acc_present_or_copyin_l (a, sizeof (a)) |
| +end subroutine acc_present_or_copyin_array_h |
| + |
| +! acc_create with a c_int32_t LEN: converts LEN to c_size_t for acc_create_l. |
| +subroutine acc_create_32_h (a, len) |
| +  use openacc_internal2, only: acc_create_l |
| +  use iso_c_binding, only: c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  call acc_create_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_create_32_h |
| + |
| +! acc_create with a c_int64_t LEN: converts LEN to c_size_t for acc_create_l. |
| +subroutine acc_create_64_h (a, len) |
| +  use openacc_internal2, only: acc_create_l |
| +  use iso_c_binding, only: c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  call acc_create_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_create_64_h |
| + |
| +! acc_create for a contiguous object A: the length passed on is sizeof (a). |
| +subroutine acc_create_array_h (a) |
| +  use openacc_internal2, only: acc_create_l |
| +  type (*), contiguous, dimension (..) :: a |
| +  call acc_create_l (a, sizeof (a)) |
| +end subroutine acc_create_array_h |
| + |
| +! acc_present_or_create with a c_int32_t LEN: converts LEN to c_size_t for |
| +! acc_present_or_create_l. |
| +subroutine acc_present_or_create_32_h (a, len) |
| +  use openacc_internal2, only: acc_present_or_create_l |
| +  use iso_c_binding, only: c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  call acc_present_or_create_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_present_or_create_32_h |
| + |
| +! acc_present_or_create with a c_int64_t LEN: converts LEN to c_size_t for |
| +! acc_present_or_create_l. |
| +subroutine acc_present_or_create_64_h (a, len) |
| +  use openacc_internal2, only: acc_present_or_create_l |
| +  use iso_c_binding, only: c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  call acc_present_or_create_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_present_or_create_64_h |
| + |
| +! acc_present_or_create for a contiguous object A: the length passed on is |
| +! sizeof (a). |
| +subroutine acc_present_or_create_array_h (a) |
| +  use openacc_internal2, only: acc_present_or_create_l |
| +  type (*), contiguous, dimension (..) :: a |
| +  call acc_present_or_create_l (a, sizeof (a)) |
| +end subroutine acc_present_or_create_array_h |
| + |
| +! acc_copyout with a c_int32_t LEN: converts LEN to c_size_t for acc_copyout_l. |
| +subroutine acc_copyout_32_h (a, len) |
| +  use openacc_internal2, only: acc_copyout_l |
| +  use iso_c_binding, only: c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  call acc_copyout_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_copyout_32_h |
| + |
| +! acc_copyout with a c_int64_t LEN: converts LEN to c_size_t for acc_copyout_l. |
| +subroutine acc_copyout_64_h (a, len) |
| +  use openacc_internal2, only: acc_copyout_l |
| +  use iso_c_binding, only: c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  call acc_copyout_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_copyout_64_h |
| + |
| +! acc_copyout for a contiguous object A: the length passed on is sizeof (a). |
| +subroutine acc_copyout_array_h (a) |
| +  use openacc_internal2, only: acc_copyout_l |
| +  type (*), contiguous, dimension (..) :: a |
| +  call acc_copyout_l (a, sizeof (a)) |
| +end subroutine acc_copyout_array_h |
| + |
| +! acc_copyout_finalize with a c_int32_t LEN: converts LEN to c_size_t for |
| +! acc_copyout_finalize_l. |
| +subroutine acc_copyout_finalize_32_h (a, len) |
| +  use openacc_internal2, only: acc_copyout_finalize_l |
| +  use iso_c_binding, only: c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  call acc_copyout_finalize_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_copyout_finalize_32_h |
| + |
| +! acc_copyout_finalize with a c_int64_t LEN: converts LEN to c_size_t for |
| +! acc_copyout_finalize_l. |
| +subroutine acc_copyout_finalize_64_h (a, len) |
| +  use openacc_internal2, only: acc_copyout_finalize_l |
| +  use iso_c_binding, only: c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  call acc_copyout_finalize_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_copyout_finalize_64_h |
| + |
| +! acc_copyout_finalize for a contiguous object A: the length passed on is |
| +! sizeof (a). |
| +subroutine acc_copyout_finalize_array_h (a) |
| +  use openacc_internal2, only: acc_copyout_finalize_l |
| +  type (*), contiguous, dimension (..) :: a |
| +  call acc_copyout_finalize_l (a, sizeof (a)) |
| +end subroutine acc_copyout_finalize_array_h |
| + |
| +! acc_delete with a c_int32_t LEN: converts LEN to c_size_t for acc_delete_l. |
| +subroutine acc_delete_32_h (a, len) |
| +  use openacc_internal2, only: acc_delete_l |
| +  use iso_c_binding, only: c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  call acc_delete_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_delete_32_h |
| + |
| +! acc_delete with a c_int64_t LEN: converts LEN to c_size_t for acc_delete_l. |
| +subroutine acc_delete_64_h (a, len) |
| +  use openacc_internal2, only: acc_delete_l |
| +  use iso_c_binding, only: c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  call acc_delete_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_delete_64_h |
| + |
| +! acc_delete for a contiguous object A: the length passed on is sizeof (a). |
| +subroutine acc_delete_array_h (a) |
| +  use openacc_internal2, only: acc_delete_l |
| +  type (*), contiguous, dimension (..) :: a |
| +  call acc_delete_l (a, sizeof (a)) |
| +end subroutine acc_delete_array_h |
| + |
| +! acc_delete_finalize with a c_int32_t LEN: converts LEN to c_size_t for |
| +! acc_delete_finalize_l. |
| +subroutine acc_delete_finalize_32_h (a, len) |
| +  use openacc_internal2, only: acc_delete_finalize_l |
| +  use iso_c_binding, only: c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  call acc_delete_finalize_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_delete_finalize_32_h |
| + |
| +! acc_delete_finalize with a c_int64_t LEN: converts LEN to c_size_t for |
| +! acc_delete_finalize_l. |
| +subroutine acc_delete_finalize_64_h (a, len) |
| +  use openacc_internal2, only: acc_delete_finalize_l |
| +  use iso_c_binding, only: c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  call acc_delete_finalize_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_delete_finalize_64_h |
| + |
| +! acc_delete_finalize for a contiguous object A: the length passed on is |
| +! sizeof (a). |
| +subroutine acc_delete_finalize_array_h (a) |
| +  use openacc_internal2, only: acc_delete_finalize_l |
| +  type (*), contiguous, dimension (..) :: a |
| +  call acc_delete_finalize_l (a, sizeof (a)) |
| +end subroutine acc_delete_finalize_array_h |
| + |
| +! acc_update_device with a c_int32_t LEN: converts LEN to c_size_t for |
| +! acc_update_device_l. |
| +subroutine acc_update_device_32_h (a, len) |
| +  use openacc_internal2, only: acc_update_device_l |
| +  use iso_c_binding, only: c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  call acc_update_device_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_update_device_32_h |
| + |
| +! acc_update_device with a c_int64_t LEN: converts LEN to c_size_t for |
| +! acc_update_device_l. |
| +subroutine acc_update_device_64_h (a, len) |
| +  use openacc_internal2, only: acc_update_device_l |
| +  use iso_c_binding, only: c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  call acc_update_device_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_update_device_64_h |
| + |
| +! acc_update_device for a contiguous object A: the length passed on is |
| +! sizeof (a). |
| +subroutine acc_update_device_array_h (a) |
| +  use openacc_internal2, only: acc_update_device_l |
| +  type (*), contiguous, dimension (..) :: a |
| +  call acc_update_device_l (a, sizeof (a)) |
| +end subroutine acc_update_device_array_h |
| + |
| +! acc_update_self with a c_int32_t LEN: converts LEN to c_size_t for |
| +! acc_update_self_l. |
| +subroutine acc_update_self_32_h (a, len) |
| +  use openacc_internal2, only: acc_update_self_l |
| +  use iso_c_binding, only: c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  call acc_update_self_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_update_self_32_h |
| + |
| +! acc_update_self with a c_int64_t LEN: converts LEN to c_size_t for |
| +! acc_update_self_l. |
| +subroutine acc_update_self_64_h (a, len) |
| +  use openacc_internal2, only: acc_update_self_l |
| +  use iso_c_binding, only: c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  call acc_update_self_l (a, int (len, kind = c_size_t)) |
| +end subroutine acc_update_self_64_h |
| + |
| +! acc_update_self for a contiguous object A: the length passed on is |
| +! sizeof (a). |
| +subroutine acc_update_self_array_h (a) |
| +  use openacc_internal2, only: acc_update_self_l |
| +  type (*), contiguous, dimension (..) :: a |
| +  call acc_update_self_l (a, sizeof (a)) |
| +end subroutine acc_update_self_array_h |
| + |
| +! acc_is_present with a c_int32_t LEN: .TRUE. when acc_is_present_l returns |
| +! a nonzero value for A and the converted length, .FALSE. when it returns 0. |
| +function acc_is_present_32_h (a, len) |
| +  use iso_c_binding, only: c_int32_t, c_size_t |
| +  use openacc_internal2, only: acc_is_present_l |
| +  logical acc_is_present_32_h |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (c_int32_t) len |
| +  ! Compare against zero, not 1: any nonzero C int means "true". |
| +  acc_is_present_32_h = acc_is_present_l (a, int (len, kind = c_size_t)) /= 0 |
| +end function |
| + |
| +! acc_is_present with a c_int64_t LEN: .TRUE. when acc_is_present_l returns |
| +! a nonzero value for A and the converted length, .FALSE. when it returns 0. |
| +function acc_is_present_64_h (a, len) |
| +  use iso_c_binding, only: c_int64_t, c_size_t |
| +  use openacc_internal2, only: acc_is_present_l |
| +  logical acc_is_present_64_h |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (c_int64_t) len |
| +  ! Compare against zero, not 1: any nonzero C int means "true". |
| +  acc_is_present_64_h = acc_is_present_l (a, int (len, kind = c_size_t)) /= 0 |
| +end function |
| + |
| +! acc_is_present for a contiguous object A: .TRUE. when acc_is_present_l |
| +! returns a nonzero value for A and sizeof (a), .FALSE. when it returns 0. |
| +function acc_is_present_array_h (a) |
| +  use openacc_internal2, only: acc_is_present_l |
| +  logical acc_is_present_array_h |
| +  type (*), dimension (..), contiguous :: a |
| +  ! Compare against zero, not 1: any nonzero C int means "true". |
| +  acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) /= 0 |
| +end function |
| + |
| +! acc_copyin_async with a c_int32_t LEN: converts LEN to c_size_t and ASYNC |
| +! to c_int for acc_copyin_async_l. |
| +subroutine acc_copyin_async_32_h (a, len, async) |
| +  use openacc_kinds2, only: acc_handle_kind |
| +  use openacc_internal2, only: acc_copyin_async_l |
| +  use iso_c_binding, only: c_int, c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  integer (kind = acc_handle_kind) :: async |
| +  call acc_copyin_async_l (a, & |
| +                           int (len, kind = c_size_t), & |
| +                           int (async, kind = c_int)) |
| +end subroutine acc_copyin_async_32_h |
| + |
| +! acc_copyin_async with a c_int64_t LEN: converts LEN to c_size_t and ASYNC |
| +! to c_int for acc_copyin_async_l. |
| +subroutine acc_copyin_async_64_h (a, len, async) |
| +  use openacc_kinds2, only: acc_handle_kind |
| +  use openacc_internal2, only: acc_copyin_async_l |
| +  use iso_c_binding, only: c_int, c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  integer (kind = acc_handle_kind) :: async |
| +  call acc_copyin_async_l (a, & |
| +                           int (len, kind = c_size_t), & |
| +                           int (async, kind = c_int)) |
| +end subroutine acc_copyin_async_64_h |
| + |
| +! acc_copyin_async for a contiguous object A: passes sizeof (a) as the |
| +! length and ASYNC converted to c_int. |
| +subroutine acc_copyin_async_array_h (a, async) |
| +  use openacc_kinds2, only: acc_handle_kind |
| +  use openacc_internal2, only: acc_copyin_async_l |
| +  use iso_c_binding, only: c_int |
| +  type (*), contiguous, dimension (..) :: a |
| +  integer (kind = acc_handle_kind) :: async |
| +  call acc_copyin_async_l (a, sizeof (a), & |
| +                           int (async, kind = c_int)) |
| +end subroutine acc_copyin_async_array_h |
| + |
| +! acc_create_async with a c_int32_t LEN: converts LEN to c_size_t and ASYNC |
| +! to c_int for acc_create_async_l. |
| +subroutine acc_create_async_32_h (a, len, async) |
| +  use openacc_kinds2, only: acc_handle_kind |
| +  use openacc_internal2, only: acc_create_async_l |
| +  use iso_c_binding, only: c_int, c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  integer (kind = acc_handle_kind) :: async |
| +  call acc_create_async_l (a, & |
| +                           int (len, kind = c_size_t), & |
| +                           int (async, kind = c_int)) |
| +end subroutine acc_create_async_32_h |
| + |
| +! acc_create_async with a c_int64_t LEN: converts LEN to c_size_t and ASYNC |
| +! to c_int for acc_create_async_l. |
| +subroutine acc_create_async_64_h (a, len, async) |
| +  use openacc_kinds2, only: acc_handle_kind |
| +  use openacc_internal2, only: acc_create_async_l |
| +  use iso_c_binding, only: c_int, c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  integer (kind = acc_handle_kind) :: async |
| +  call acc_create_async_l (a, & |
| +                           int (len, kind = c_size_t), & |
| +                           int (async, kind = c_int)) |
| +end subroutine acc_create_async_64_h |
| + |
| +! acc_create_async for a contiguous object A: passes sizeof (a) as the |
| +! length and ASYNC converted to c_int. |
| +subroutine acc_create_async_array_h (a, async) |
| +  use openacc_kinds2, only: acc_handle_kind |
| +  use openacc_internal2, only: acc_create_async_l |
| +  use iso_c_binding, only: c_int |
| +  type (*), contiguous, dimension (..) :: a |
| +  integer (kind = acc_handle_kind) :: async |
| +  call acc_create_async_l (a, sizeof (a), & |
| +                           int (async, kind = c_int)) |
| +end subroutine acc_create_async_array_h |
| + |
| +! acc_copyout_async with a c_int32_t LEN: converts LEN to c_size_t and ASYNC |
| +! to c_int for acc_copyout_async_l. |
| +subroutine acc_copyout_async_32_h (a, len, async) |
| +  use openacc_kinds2, only: acc_handle_kind |
| +  use openacc_internal2, only: acc_copyout_async_l |
| +  use iso_c_binding, only: c_int, c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  integer (kind = acc_handle_kind) :: async |
| +  call acc_copyout_async_l (a, & |
| +                            int (len, kind = c_size_t), & |
| +                            int (async, kind = c_int)) |
| +end subroutine acc_copyout_async_32_h |
| + |
| +! acc_copyout_async with a c_int64_t LEN: converts LEN to c_size_t and ASYNC |
| +! to c_int for acc_copyout_async_l. |
| +subroutine acc_copyout_async_64_h (a, len, async) |
| +  use openacc_kinds2, only: acc_handle_kind |
| +  use openacc_internal2, only: acc_copyout_async_l |
| +  use iso_c_binding, only: c_int, c_size_t, c_int64_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int64_t) :: len |
| +  integer (kind = acc_handle_kind) :: async |
| +  call acc_copyout_async_l (a, & |
| +                            int (len, kind = c_size_t), & |
| +                            int (async, kind = c_int)) |
| +end subroutine acc_copyout_async_64_h |
| + |
| +! acc_copyout_async for a contiguous object A: passes sizeof (a) as the |
| +! length and ASYNC converted to c_int. |
| +subroutine acc_copyout_async_array_h (a, async) |
| +  use openacc_kinds2, only: acc_handle_kind |
| +  use openacc_internal2, only: acc_copyout_async_l |
| +  use iso_c_binding, only: c_int |
| +  type (*), contiguous, dimension (..) :: a |
| +  integer (kind = acc_handle_kind) :: async |
| +  call acc_copyout_async_l (a, sizeof (a), & |
| +                            int (async, kind = c_int)) |
| +end subroutine acc_copyout_async_array_h |
| + |
| +! acc_delete_async with a c_int32_t LEN: converts LEN to c_size_t and ASYNC |
| +! to c_int for acc_delete_async_l. |
| +subroutine acc_delete_async_32_h (a, len, async) |
| +  use openacc_kinds2, only: acc_handle_kind |
| +  use openacc_internal2, only: acc_delete_async_l |
| +  use iso_c_binding, only: c_int, c_size_t, c_int32_t |
| +  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| +  type (*), dimension (*) :: a |
| +  integer (kind = c_int32_t) :: len |
| +  integer (kind = acc_handle_kind) :: async |
| +  call acc_delete_async_l (a, & |
| +                           int (len, kind = c_size_t), & |
| +                           int (async, kind = c_int)) |
| +end subroutine acc_delete_async_32_h |
| + |
| +subroutine acc_delete_async_64_h (a, len, async) |
| + use iso_c_binding, only: c_int64_t, c_size_t, c_int |
| + use openacc_internal2, only: acc_delete_async_l |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + integer (acc_handle_kind) async |
| + call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) |
| +end subroutine |
| + |
| +subroutine acc_delete_async_array_h (a, async) |
| + use iso_c_binding, only: c_int |
| + use openacc_internal2, only: acc_delete_async_l |
| + use openacc_kinds2, only: acc_handle_kind |
| + type (*), dimension (..), contiguous :: a |
| + integer (acc_handle_kind) async |
| + call acc_delete_async_l (a, sizeof (a), int (async, kind = c_int)) |
| +end subroutine |
| + |
| +subroutine acc_update_device_async_32_h (a, len, async) |
| + use iso_c_binding, only: c_int32_t, c_size_t, c_int |
| + use openacc_internal2, only: acc_update_device_async_l |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + integer (acc_handle_kind) async |
| + call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) |
| +end subroutine |
| + |
| +subroutine acc_update_device_async_64_h (a, len, async) |
| + use iso_c_binding, only: c_int64_t, c_size_t, c_int |
| + use openacc_internal2, only: acc_update_device_async_l |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + integer (acc_handle_kind) async |
| + call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) |
| +end subroutine |
| + |
| +subroutine acc_update_device_async_array_h (a, async) |
| + use iso_c_binding, only: c_int |
| + use openacc_internal2, only: acc_update_device_async_l |
| + use openacc_kinds2, only: acc_handle_kind |
| + type (*), dimension (..), contiguous :: a |
| + integer (acc_handle_kind) async |
| + call acc_update_device_async_l (a, sizeof (a), int (async, kind = c_int)) |
| +end subroutine |
| + |
| +subroutine acc_update_self_async_32_h (a, len, async) |
| + use iso_c_binding, only: c_int32_t, c_size_t, c_int |
| + use openacc_internal2, only: acc_update_self_async_l |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int32_t) len |
| + integer (acc_handle_kind) async |
| + call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) |
| +end subroutine |
| + |
| +subroutine acc_update_self_async_64_h (a, len, async) |
| + use iso_c_binding, only: c_int64_t, c_size_t, c_int |
| + use openacc_internal2, only: acc_update_self_async_l |
| + use openacc_kinds2, only: acc_handle_kind |
| + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a |
| + type (*), dimension (*) :: a |
| + integer (c_int64_t) len |
| + integer (acc_handle_kind) async |
| + call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) |
| +end subroutine |
| + |
| +subroutine acc_update_self_async_array_h (a, async) |
| + use iso_c_binding, only: c_int |
| + use openacc_internal2, only: acc_update_self_async_l |
| + use openacc_kinds2, only: acc_handle_kind |
| + type (*), dimension (..), contiguous :: a |
| + integer (acc_handle_kind) async |
| + call acc_update_self_async_l (a, sizeof (a), int (async, kind = c_int)) |
| +end subroutine |
| |
| |
| @@ -149,11 +149,28 @@ GOMP_taskloop (void (*fn) (void *), void |
| |
| if (flags & GOMP_TASK_FLAG_NOGROUP) |
| { |
| - if (thr->task && thr->task->taskgroup && thr->task->taskgroup->cancelled) |
| - return; |
| + if (__builtin_expect (gomp_cancel_var, 0) |
| + && thr->task |
| + && thr->task->taskgroup) |
| + { |
| + if (thr->task->taskgroup->cancelled) |
| + return; |
| + if (thr->task->taskgroup->workshare |
| + && thr->task->taskgroup->prev |
| + && thr->task->taskgroup->prev->cancelled) |
| + return; |
| + } |
| } |
| else |
| - ialias_call (GOMP_taskgroup_start) (); |
| + { |
| + ialias_call (GOMP_taskgroup_start) (); |
| + if (flags & GOMP_TASK_FLAG_REDUCTION) |
| + { |
| + struct gomp_data_head { TYPE t1, t2; uintptr_t *ptr; }; |
| + uintptr_t *ptr = ((struct gomp_data_head *) data)->ptr; |
| + ialias_call (GOMP_taskgroup_reduction_register) (ptr); |
| + } |
| + } |
| |
| if (priority > gomp_max_task_priority_var) |
| priority = gomp_max_task_priority_var; |
| @@ -284,19 +301,31 @@ GOMP_taskloop (void (*fn) (void *), void |
| gomp_mutex_lock (&team->task_lock); |
| /* If parallel or taskgroup has been cancelled, don't start new |
| tasks. */ |
| - if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier) |
| - || (taskgroup && taskgroup->cancelled)) |
| - && cpyfn == NULL, 0)) |
| + if (__builtin_expect (gomp_cancel_var, 0) |
| + && cpyfn == NULL) |
| { |
| - gomp_mutex_unlock (&team->task_lock); |
| - for (i = 0; i < num_tasks; i++) |
| + if (gomp_team_barrier_cancelled (&team->barrier)) |
| + { |
| + do_cancel: |
| + gomp_mutex_unlock (&team->task_lock); |
| + for (i = 0; i < num_tasks; i++) |
| + { |
| + gomp_finish_task (tasks[i]); |
| + free (tasks[i]); |
| + } |
| + if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) |
| + ialias_call (GOMP_taskgroup_end) (); |
| + return; |
| + } |
| + if (taskgroup) |
| { |
| - gomp_finish_task (tasks[i]); |
| - free (tasks[i]); |
| + if (taskgroup->cancelled) |
| + goto do_cancel; |
| + if (taskgroup->workshare |
| + && taskgroup->prev |
| + && taskgroup->prev->cancelled) |
| + goto do_cancel; |
| } |
| - if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) |
| - ialias_call (GOMP_taskgroup_end) (); |
| - return; |
| } |
| if (taskgroup) |
| taskgroup->num_children += num_tasks; |
| |
| |
| @@ -123,7 +123,8 @@ void |
| GOMP_parallel_start (void (*fn) (void *), void *data, unsigned num_threads) |
| { |
| num_threads = gomp_resolve_num_threads (num_threads, 0); |
| - gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads)); |
| + gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads), |
| + NULL); |
| } |
| |
| void |
| @@ -161,14 +162,38 @@ GOMP_parallel_end (void) |
| ialias (GOMP_parallel_end) |
| |
| void |
| -GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, unsigned int flags) |
| +GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, |
| + unsigned int flags) |
| { |
| num_threads = gomp_resolve_num_threads (num_threads, 0); |
| - gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads)); |
| + gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads), |
| + NULL); |
| fn (data); |
| ialias_call (GOMP_parallel_end) (); |
| } |
| |
| +/* Like GOMP_parallel, but the pointer stored at the start of DATA is |
| + first passed to gomp_parallel_reduction_register and the taskgroup it |
| + returns is handed to gomp_team_start. Once GOMP_parallel_end has |
| + returned, that taskgroup's semaphore is destroyed and the taskgroup is |
| + freed. Returns the resolved number of threads. */ |
| +unsigned |
| +GOMP_parallel_reductions (void (*fn) (void *), void *data, |
| + unsigned num_threads, unsigned int flags) |
| +{ |
| + struct gomp_taskgroup *taskgroup; |
| + num_threads = gomp_resolve_num_threads (num_threads, 0); |
| + uintptr_t *rdata = *(uintptr_t **)data; |
| + taskgroup = gomp_parallel_reduction_register (rdata, num_threads); |
| + gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads), |
| + taskgroup); |
| + fn (data); |
| + ialias_call (GOMP_parallel_end) (); |
| + gomp_sem_destroy (&taskgroup->taskgroup_sem); |
| + free (taskgroup); |
| + return num_threads; |
| +} |
| + |
| bool |
| GOMP_cancellation_point (int which) |
| { |
| @@ -185,8 +205,15 @@ GOMP_cancellation_point (int which) |
| } |
| else if (which & GOMP_CANCEL_TASKGROUP) |
| { |
| - if (thr->task->taskgroup && thr->task->taskgroup->cancelled) |
| - return true; |
| + if (thr->task->taskgroup) |
| + { |
| + if (thr->task->taskgroup->cancelled) |
| + return true; |
| + if (thr->task->taskgroup->workshare |
| + && thr->task->taskgroup->prev |
| + && thr->task->taskgroup->prev->cancelled) |
| + return true; |
| + } |
| /* FALLTHRU into the GOMP_CANCEL_PARALLEL case, |
| as #pragma omp cancel parallel also cancels all explicit |
| tasks. */ |
| @@ -218,11 +245,17 @@ GOMP_cancel (int which, bool do_cancel) |
| } |
| else if (which & GOMP_CANCEL_TASKGROUP) |
| { |
| - if (thr->task->taskgroup && !thr->task->taskgroup->cancelled) |
| + if (thr->task->taskgroup) |
| { |
| - gomp_mutex_lock (&team->task_lock); |
| - thr->task->taskgroup->cancelled = true; |
| - gomp_mutex_unlock (&team->task_lock); |
| + struct gomp_taskgroup *taskgroup = thr->task->taskgroup; |
| + if (taskgroup->workshare && taskgroup->prev) |
| + taskgroup = taskgroup->prev; |
| + if (!taskgroup->cancelled) |
| + { |
| + gomp_mutex_lock (&team->task_lock); |
| + taskgroup->cancelled = true; |
| + gomp_mutex_unlock (&team->task_lock); |
| + } |
| } |
| return true; |
| } |
| |
| |
| @@ -29,5 +29,6 @@ |
| |
| extern void GOMP_PLUGIN_async_unmap_vars (void *, int); |
| extern void *GOMP_PLUGIN_acc_thread (void); |
| +extern int GOMP_PLUGIN_acc_default_dim (unsigned int); |
| |
| #endif |
| |
| |
| @@ -180,16 +180,22 @@ gomp_device_copy (struct gomp_device_des |
| /* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses) |
| host to device memory transfers. */ |
| |
| +struct gomp_coalesce_chunk |
| +{ |
| + /* The starting and ending point of a coalesced chunk of memory. */ |
| + size_t start, end; |
| +}; |
| + |
| struct gomp_coalesce_buf |
| { |
| /* Buffer into which gomp_copy_host2dev will memcpy data and from which |
| it will be copied to the device. */ |
| void *buf; |
| struct target_mem_desc *tgt; |
| - /* Array with offsets, chunks[2 * i] is the starting offset and |
| - chunks[2 * i + 1] ending offset relative to tgt->tgt_start device address |
| + /* Array with offsets, chunks[i].start is the starting offset and |
| + chunks[i].end ending offset relative to tgt->tgt_start device address |
| of chunks which are to be copied to buf and later copied to device. */ |
| - size_t *chunks; |
| + struct gomp_coalesce_chunk *chunks; |
| /* Number of chunks in chunks array, or -1 if coalesce buffering should not |
| be performed. */ |
| long chunk_cnt; |
| @@ -222,14 +228,14 @@ gomp_coalesce_buf_add (struct gomp_coale |
| { |
| if (cbuf->chunk_cnt < 0) |
| return; |
| - if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1]) |
| + if (start < cbuf->chunks[cbuf->chunk_cnt - 1].end) |
| { |
| cbuf->chunk_cnt = -1; |
| return; |
| } |
| - if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1] + MAX_COALESCE_BUF_GAP) |
| + if (start < cbuf->chunks[cbuf->chunk_cnt - 1].end + MAX_COALESCE_BUF_GAP) |
| { |
| - cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len; |
| + cbuf->chunks[cbuf->chunk_cnt - 1].end = start + len; |
| cbuf->use_cnt++; |
| return; |
| } |
| @@ -239,8 +245,8 @@ gomp_coalesce_buf_add (struct gomp_coale |
| if (cbuf->use_cnt == 1) |
| cbuf->chunk_cnt--; |
| } |
| - cbuf->chunks[2 * cbuf->chunk_cnt] = start; |
| - cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len; |
| + cbuf->chunks[cbuf->chunk_cnt].start = start; |
| + cbuf->chunks[cbuf->chunk_cnt].end = start + len; |
| cbuf->chunk_cnt++; |
| cbuf->use_cnt = 1; |
| } |
| @@ -271,20 +277,20 @@ gomp_copy_host2dev (struct gomp_device_d |
| if (cbuf) |
| { |
| uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start; |
| - if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1]) |
| + if (doff < cbuf->chunks[cbuf->chunk_cnt - 1].end) |
| { |
| long first = 0; |
| long last = cbuf->chunk_cnt - 1; |
| while (first <= last) |
| { |
| long middle = (first + last) >> 1; |
| - if (cbuf->chunks[2 * middle + 1] <= doff) |
| + if (cbuf->chunks[middle].end <= doff) |
| first = middle + 1; |
| - else if (cbuf->chunks[2 * middle] <= doff) |
| + else if (cbuf->chunks[middle].start <= doff) |
| { |
| - if (doff + sz > cbuf->chunks[2 * middle + 1]) |
| + if (doff + sz > cbuf->chunks[middle].end) |
| gomp_fatal ("internal libgomp cbuf error"); |
| - memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]), |
| + memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0].start), |
| h, sz); |
| return; |
| } |
| @@ -510,8 +516,8 @@ gomp_map_vars (struct gomp_device_descr |
| cbuf.buf = NULL; |
| if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET) |
| { |
| - cbuf.chunks |
| - = (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t)); |
| + size_t chunks_size = (mapnum + 1) * sizeof (struct gomp_coalesce_chunk); |
| + cbuf.chunks = (struct gomp_coalesce_chunk *) gomp_alloca (chunks_size); |
| cbuf.chunk_cnt = 0; |
| } |
| if (pragma_kind == GOMP_MAP_VARS_TARGET) |
| @@ -521,8 +527,8 @@ gomp_map_vars (struct gomp_device_descr |
| tgt_size = mapnum * sizeof (void *); |
| cbuf.chunk_cnt = 1; |
| cbuf.use_cnt = 1 + (mapnum > 1); |
| - cbuf.chunks[0] = 0; |
| - cbuf.chunks[1] = tgt_size; |
| + cbuf.chunks[0].start = 0; |
| + cbuf.chunks[0].end = tgt_size; |
| } |
| |
| gomp_mutex_lock (&devicep->lock); |
| @@ -707,7 +713,7 @@ gomp_map_vars (struct gomp_device_descr |
| if (cbuf.chunk_cnt > 0) |
| { |
| cbuf.buf |
| - = malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]); |
| + = malloc (cbuf.chunks[cbuf.chunk_cnt - 1].end - cbuf.chunks[0].start); |
| if (cbuf.buf) |
| { |
| cbuf.tgt = tgt; |
| @@ -859,6 +865,7 @@ gomp_map_vars (struct gomp_device_descr |
| tgt->list[i].offset = 0; |
| tgt->list[i].length = k->host_end - k->host_start; |
| k->refcount = 1; |
| + k->dynamic_refcount = 0; |
| tgt->refcount++; |
| array->left = NULL; |
| array->right = NULL; |
| @@ -956,9 +963,10 @@ gomp_map_vars (struct gomp_device_descr |
| /* Set link pointer on target to the device address of the |
| mapped object. */ |
| void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset); |
| - devicep->host2dev_func (devicep->target_id, |
| - (void *) n->tgt_offset, |
| - &tgt_addr, sizeof (void *)); |
| + /* We intentionally do not use coalescing here, as it's not |
| + data allocated by the current call to this function. */ |
| + gomp_copy_host2dev (devicep, (void *) n->tgt_offset, |
| + &tgt_addr, sizeof (void *), NULL); |
| } |
| array++; |
| } |
| @@ -981,10 +989,14 @@ gomp_map_vars (struct gomp_device_descr |
| { |
| long c = 0; |
| for (c = 0; c < cbuf.chunk_cnt; ++c) |
| - gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cbuf.chunks[2 * c]), |
| - (char *) cbuf.buf + (cbuf.chunks[2 * c] - cbuf.chunks[0]), |
| - cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL); |
| + gomp_copy_host2dev (devicep, |
| + (void *) (tgt->tgt_start + cbuf.chunks[c].start), |
| + (char *) cbuf.buf + (cbuf.chunks[c].start |
| + - cbuf.chunks[0].start), |
| + cbuf.chunks[c].end - cbuf.chunks[c].start, NULL); |
| free (cbuf.buf); |
| + cbuf.buf = NULL; |
| + cbufp = NULL; |
| } |
| |
| /* If the variable from "omp target enter data" map-list was already mapped, |
| @@ -1011,6 +1023,27 @@ gomp_unmap_tgt (struct target_mem_desc * |
| free (tgt); |
| } |
| |
| +/* Remove K from DEVICEP's mem_map splay tree, re-inserting K's link_key |
| + node if it has one, and drop one reference on K's target_mem_desc. |
| + Return true if that was the last reference and the descriptor was |
| + passed to gomp_unmap_tgt, false otherwise. */ |
| +attribute_hidden bool |
| +gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k) |
| +{ |
| + bool is_tgt_unmapped = false; |
| + splay_tree_remove (&devicep->mem_map, k); |
| + if (k->link_key) |
| + splay_tree_insert (&devicep->mem_map, (splay_tree_node) k->link_key); |
| + if (k->tgt->refcount > 1) |
| + k->tgt->refcount--; |
| + else |
| + { |
| + is_tgt_unmapped = true; |
| + gomp_unmap_tgt (k->tgt); |
| + } |
| + return is_tgt_unmapped; |
| +} |
| + |
| /* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant |
| variables back from device to host: if it is false, it is assumed that this |
| has been done already. */ |
| @@ -1059,16 +1088,7 @@ gomp_unmap_vars (struct target_mem_desc |
| + tgt->list[i].offset), |
| tgt->list[i].length); |
| if (do_unmap) |
| - { |
| - splay_tree_remove (&devicep->mem_map, k); |
| - if (k->link_key) |
| - splay_tree_insert (&devicep->mem_map, |
| - (splay_tree_node) k->link_key); |
| - if (k->tgt->refcount > 1) |
| - k->tgt->refcount--; |
| - else |
| - gomp_unmap_tgt (k->tgt); |
| - } |
| + gomp_remove_var (devicep, k); |
| } |
| |
| if (tgt->refcount > 1) |
| @@ -1298,17 +1318,7 @@ gomp_unload_image_from_device (struct go |
| else |
| { |
| splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &k); |
| - splay_tree_remove (&devicep->mem_map, n); |
| - if (n->link_key) |
| - { |
| - if (n->tgt->refcount > 1) |
| - n->tgt->refcount--; |
| - else |
| - { |
| - is_tgt_unmapped = true; |
| - gomp_unmap_tgt (n->tgt); |
| - } |
| - } |
| + is_tgt_unmapped = gomp_remove_var (devicep, n); |
| } |
| } |
| |
| @@ -1855,11 +1865,20 @@ GOMP_target_update_ext (int device, size |
| struct gomp_team *team = thr->ts.team; |
| /* If parallel or taskgroup has been cancelled, don't start new |
| tasks. */ |
| - if (team |
| - && (gomp_team_barrier_cancelled (&team->barrier) |
| - || (thr->task->taskgroup |
| - && thr->task->taskgroup->cancelled))) |
| - return; |
| + if (__builtin_expect (gomp_cancel_var, 0) && team) |
| + { |
| + if (gomp_team_barrier_cancelled (&team->barrier)) |
| + return; |
| + if (thr->task->taskgroup) |
| + { |
| + if (thr->task->taskgroup->cancelled) |
| + return; |
| + if (thr->task->taskgroup->workshare |
| + && thr->task->taskgroup->prev |
| + && thr->task->taskgroup->prev->cancelled) |
| + return; |
| + } |
| + } |
| |
| gomp_task_maybe_wait_for_dependencies (depend); |
| } |
| @@ -1874,10 +1893,20 @@ GOMP_target_update_ext (int device, size |
| struct gomp_thread *thr = gomp_thread (); |
| struct gomp_team *team = thr->ts.team; |
| /* If parallel or taskgroup has been cancelled, don't start new tasks. */ |
| - if (team |
| - && (gomp_team_barrier_cancelled (&team->barrier) |
| - || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) |
| - return; |
| + if (__builtin_expect (gomp_cancel_var, 0) && team) |
| + { |
| + if (gomp_team_barrier_cancelled (&team->barrier)) |
| + return; |
| + if (thr->task->taskgroup) |
| + { |
| + if (thr->task->taskgroup->cancelled) |
| + return; |
| + if (thr->task->taskgroup->workshare |
| + && thr->task->taskgroup->prev |
| + && thr->task->taskgroup->prev->cancelled) |
| + return; |
| + } |
| + } |
| |
| gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, true); |
| } |
| @@ -1986,11 +2015,20 @@ GOMP_target_enter_exit_data (int device, |
| struct gomp_team *team = thr->ts.team; |
| /* If parallel or taskgroup has been cancelled, don't start new |
| tasks. */ |
| - if (team |
| - && (gomp_team_barrier_cancelled (&team->barrier) |
| - || (thr->task->taskgroup |
| - && thr->task->taskgroup->cancelled))) |
| - return; |
| + if (__builtin_expect (gomp_cancel_var, 0) && team) |
| + { |
| + if (gomp_team_barrier_cancelled (&team->barrier)) |
| + return; |
| + if (thr->task->taskgroup) |
| + { |
| + if (thr->task->taskgroup->cancelled) |
| + return; |
| + if (thr->task->taskgroup->workshare |
| + && thr->task->taskgroup->prev |
| + && thr->task->taskgroup->prev->cancelled) |
| + return; |
| + } |
| + } |
| |
| gomp_task_maybe_wait_for_dependencies (depend); |
| } |
| @@ -2005,10 +2043,20 @@ GOMP_target_enter_exit_data (int device, |
| struct gomp_thread *thr = gomp_thread (); |
| struct gomp_team *team = thr->ts.team; |
| /* If parallel or taskgroup has been cancelled, don't start new tasks. */ |
| - if (team |
| - && (gomp_team_barrier_cancelled (&team->barrier) |
| - || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) |
| - return; |
| + if (__builtin_expect (gomp_cancel_var, 0) && team) |
| + { |
| + if (gomp_team_barrier_cancelled (&team->barrier)) |
| + return; |
| + if (thr->task->taskgroup) |
| + { |
| + if (thr->task->taskgroup->cancelled) |
| + return; |
| + if (thr->task->taskgroup->workshare |
| + && thr->task->taskgroup->prev |
| + && thr->task->taskgroup->prev->cancelled) |
| + return; |
| + } |
| + } |
| |
| size_t i; |
| if ((flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0) |
| @@ -2197,8 +2245,9 @@ omp_target_is_present (void *ptr, int de |
| } |
| |
| int |
| -omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset, |
| - size_t src_offset, int dst_device_num, int src_device_num) |
| +omp_target_memcpy (void *dst, void *src, size_t length, |
| + size_t dst_offset, size_t src_offset, int dst_device_num, |
| + int src_device_num) |
| { |
| struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL; |
| bool ret; |
| @@ -2287,21 +2336,25 @@ omp_target_memcpy_rect_worker (void *dst |
| return EINVAL; |
| if (dst_devicep == NULL && src_devicep == NULL) |
| { |
| - memcpy ((char *) dst + dst_off, (char *) src + src_off, length); |
| + memcpy ((char *) dst + dst_off, (char *) src + src_off, |
| + length); |
| ret = 1; |
| } |
| else if (src_devicep == NULL) |
| ret = dst_devicep->host2dev_func (dst_devicep->target_id, |
| (char *) dst + dst_off, |
| - (char *) src + src_off, length); |
| + (char *) src + src_off, |
| + length); |
| else if (dst_devicep == NULL) |
| ret = src_devicep->dev2host_func (src_devicep->target_id, |
| (char *) dst + dst_off, |
| - (char *) src + src_off, length); |
| + (char *) src + src_off, |
| + length); |
| else if (src_devicep == dst_devicep) |
| ret = src_devicep->dev2dev_func (src_devicep->target_id, |
| (char *) dst + dst_off, |
| - (char *) src + src_off, length); |
| + (char *) src + src_off, |
| + length); |
| else |
| ret = 0; |
| return ret ? 0 : EINVAL; |
| @@ -2396,8 +2449,8 @@ omp_target_memcpy_rect (void *dst, void |
| } |
| |
| int |
| -omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size, |
| - size_t device_offset, int device_num) |
| +omp_target_associate_ptr (void *host_ptr, void *device_ptr, |
| + size_t size, size_t device_offset, int device_num) |
| { |
| if (device_num == GOMP_DEVICE_HOST_FALLBACK) |
| return EINVAL; |
| @@ -2499,6 +2552,38 @@ omp_target_disassociate_ptr (void *ptr, |
| return ret; |
| } |
| |
| +/* Implement omp_pause_resource for DEVICE_NUM. KIND is currently |
| + ignored. For GOMP_DEVICE_HOST_FALLBACK the result of gomp_pause_host |
| + is returned; an out-of-range DEVICE_NUM yields -1; any other valid |
| + device is a no-op that returns 0. */ |
| +int |
| +omp_pause_resource (omp_pause_resource_t kind, int device_num) |
| +{ |
| + (void) kind; |
| + if (device_num == GOMP_DEVICE_HOST_FALLBACK) |
| + return gomp_pause_host (); |
| + if (device_num < 0 || device_num >= gomp_get_num_devices ()) |
| + return -1; |
| + /* Do nothing for target devices for now. */ |
| + return 0; |
| +} |
| + |
| +/* Implement omp_pause_resource_all. KIND is currently ignored. |
| + Returns -1 if gomp_pause_host returns nonzero and 0 otherwise; target |
| + devices are not touched. */ |
| +int |
| +omp_pause_resource_all (omp_pause_resource_t kind) |
| +{ |
| + (void) kind; |
| + if (gomp_pause_host ()) |
| + return -1; |
| + /* Do nothing for target devices for now. */ |
| + return 0; |
| +} |
| + |
| +ialias (omp_pause_resource) |
| +ialias (omp_pause_resource_all) |
| + |
| #ifdef PLUGIN_SUPPORT |
| |
| /* This function tries to load a plugin for DEVICE. Name of plugin is passed |
| @@ -2632,9 +2710,9 @@ gomp_target_fini (void) |
| } |
| } |
| |
| -/* This function initializes the runtime needed for offloading. |
| - It parses the list of offload targets and tries to load the plugins for |
| - these targets. On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP |
| +/* This function initializes the runtime for offloading. |
| + It parses the list of offload plugins, and tries to load these. |
| + On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP |
| will be set, and the array DEVICES initialized, containing descriptors for |
| corresponding devices, first the GOMP_OFFLOAD_CAP_OPENMP_400 ones, follows |
| by the others. */ |
| @@ -2651,7 +2729,7 @@ gomp_target_init (void) |
| num_devices = 0; |
| devices = NULL; |
| |
| - cur = OFFLOAD_TARGETS; |
| + cur = OFFLOAD_PLUGINS; |
| if (*cur) |
| do |
| { |
| |
| |
| @@ -259,7 +259,8 @@ GOMP_ordered_end (void) |
| #define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__) |
| |
| void |
| -gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size) |
| +gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size, |
| + size_t extra) |
| { |
| struct gomp_thread *thr = gomp_thread (); |
| struct gomp_team *team = thr->ts.team; |
| @@ -269,13 +270,24 @@ gomp_doacross_init (unsigned ncounts, lo |
| struct gomp_doacross_work_share *doacross; |
| |
| if (team == NULL || team->nthreads == 1) |
| - return; |
| + { |
| + empty: |
| + if (!extra) |
| + ws->doacross = NULL; |
| + else |
| + { |
| + doacross = gomp_malloc_cleared (sizeof (*doacross) + extra); |
| + doacross->extra = (void *) (doacross + 1); |
| + ws->doacross = doacross; |
| + } |
| + return; |
| + } |
| |
| for (i = 0; i < ncounts; i++) |
| { |
| /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */ |
| if (counts[i] == 0) |
| - return; |
| + goto empty; |
| |
| if (num_bits <= MAX_COLLAPSED_BITS) |
| { |
| @@ -314,7 +326,7 @@ gomp_doacross_init (unsigned ncounts, lo |
| elt_sz = (elt_sz + 63) & ~63UL; |
| |
| doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz |
| - + shift_sz); |
| + + shift_sz + extra); |
| doacross->chunk_size = chunk_size; |
| doacross->elt_sz = elt_sz; |
| doacross->ncounts = ncounts; |
| @@ -322,6 +334,13 @@ gomp_doacross_init (unsigned ncounts, lo |
| doacross->array = (unsigned char *) |
| ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) |
| & ~(uintptr_t) 63); |
| + if (extra) |
| + { |
| + doacross->extra = doacross->array + num_ents * elt_sz; |
| + memset (doacross->extra, '\0', extra); |
| + } |
| + else |
| + doacross->extra = NULL; |
| if (num_bits <= MAX_COLLAPSED_BITS) |
| { |
| unsigned int shift_count = 0; |
| @@ -360,7 +379,8 @@ GOMP_doacross_post (long *counts) |
| unsigned long ent; |
| unsigned int i; |
| |
| - if (__builtin_expect (doacross == NULL, 0)) |
| + if (__builtin_expect (doacross == NULL, 0) |
| + || __builtin_expect (doacross->array == NULL, 0)) |
| { |
| __sync_synchronize (); |
| return; |
| @@ -411,7 +431,8 @@ GOMP_doacross_wait (long first, ...) |
| unsigned long ent; |
| unsigned int i; |
| |
| - if (__builtin_expect (doacross == NULL, 0)) |
| + if (__builtin_expect (doacross == NULL, 0) |
| + || __builtin_expect (doacross->array == NULL, 0)) |
| { |
| __sync_synchronize (); |
| return; |
| @@ -488,7 +509,8 @@ GOMP_doacross_wait (long first, ...) |
| typedef unsigned long long gomp_ull; |
| |
| void |
| -gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size) |
| +gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, |
| + gomp_ull chunk_size, size_t extra) |
| { |
| struct gomp_thread *thr = gomp_thread (); |
| struct gomp_team *team = thr->ts.team; |
| @@ -498,13 +520,24 @@ gomp_doacross_ull_init (unsigned ncounts |
| struct gomp_doacross_work_share *doacross; |
| |
| if (team == NULL || team->nthreads == 1) |
| - return; |
| + { |
| + empty: |
| + if (!extra) |
| + ws->doacross = NULL; |
| + else |
| + { |
| + doacross = gomp_malloc_cleared (sizeof (*doacross) + extra); |
| + doacross->extra = (void *) (doacross + 1); |
| + ws->doacross = doacross; |
| + } |
| + return; |
| + } |
| |
| for (i = 0; i < ncounts; i++) |
| { |
| /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */ |
| if (counts[i] == 0) |
| - return; |
| + goto empty; |
| |
| if (num_bits <= MAX_COLLAPSED_BITS) |
| { |
| @@ -557,6 +590,13 @@ gomp_doacross_ull_init (unsigned ncounts |
| doacross->array = (unsigned char *) |
| ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) |
| & ~(uintptr_t) 63); |
| + if (extra) |
| + { |
| + doacross->extra = doacross->array + num_ents * elt_sz; |
| + memset (doacross->extra, '\0', extra); |
| + } |
| + else |
| + doacross->extra = NULL; |
| if (num_bits <= MAX_COLLAPSED_BITS) |
| { |
| unsigned int shift_count = 0; |
| @@ -595,7 +635,8 @@ GOMP_doacross_ull_post (gomp_ull *counts |
| unsigned long ent; |
| unsigned int i; |
| |
| - if (__builtin_expect (doacross == NULL, 0)) |
| + if (__builtin_expect (doacross == NULL, 0) |
| + || __builtin_expect (doacross->array == NULL, 0)) |
| { |
| __sync_synchronize (); |
| return; |
| @@ -667,7 +708,8 @@ GOMP_doacross_ull_wait (gomp_ull first, |
| unsigned long ent; |
| unsigned int i; |
| |
| - if (__builtin_expect (doacross == NULL, 0)) |
| + if (__builtin_expect (doacross == NULL, 0) |
| + || __builtin_expect (doacross->array == NULL, 0)) |
| { |
| __sync_synchronize (); |
| return; |
| |
| |
| @@ -57,3 +57,64 @@ gomp_realloc (void *old, size_t size) |
| gomp_fatal ("Out of memory allocating %lu bytes", (unsigned long) size); |
| return ret; |
| } |
| + |
| +/* Allocate SIZE bytes aligned to at least AL bytes; AL is raised to |
| + sizeof (void *) when it is smaller. The first available of |
| + aligned_alloc, _aligned_malloc, posix_memalign and memalign is used. |
| + Without any of them the block is over-allocated with malloc and the |
| + malloc'ed pointer is stored in the slot just below the returned |
| + address; that path only succeeds when AL is a power of two and SIZE |
| + is nonzero. A NULL result on any path ends in gomp_fatal. */ |
| +void * |
| +gomp_aligned_alloc (size_t al, size_t size) |
| +{ |
| + void *ret; |
| + if (al < sizeof (void *)) |
| + al = sizeof (void *); |
| +#ifdef HAVE_ALIGNED_ALLOC |
| + ret = aligned_alloc (al, size); |
| +#elif defined(HAVE__ALIGNED_MALLOC) |
| + ret = _aligned_malloc (size, al); |
| +#elif defined(HAVE_POSIX_MEMALIGN) |
| + if (posix_memalign (&ret, al, size) != 0) |
| + ret = NULL; |
| +#elif defined(HAVE_MEMALIGN) |
| + { |
| + extern void *memalign (size_t, size_t); |
| + ret = memalign (al, size); |
| + } |
| +#else |
| + ret = NULL; |
| + if ((al & (al - 1)) == 0 && size) |
| + { |
| + void *p = malloc (size + al); |
| + if (p) |
| + { |
| + /* Round P + AL down to a multiple of AL, so AP is in (P, P + AL]. */ |
| + void *ap = (void *) (((uintptr_t) p + al) & -al); |
| + /* Remember the malloc'ed pointer for gomp_aligned_free. */ |
| + ((void **) ap)[-1] = p; |
| + ret = ap; |
| + } |
| + } |
| +#endif |
| + if (ret == NULL) |
| + gomp_fatal ("Out of memory allocating %lu bytes", (unsigned long) size); |
| + return ret; |
| +} |
| + |
| +/* Free PTR. If GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC is defined, PTR goes |
| + straight to free; otherwise the pointer stored in the slot just below |
| + PTR is freed instead, and a NULL PTR is ignored. |
| + NOTE(review): the macro is defined outside this hunk; presumably it is |
| + set exactly when gomp_aligned_alloc uses a native allocator -- confirm. */ |
| +void |
| +gomp_aligned_free (void *ptr) |
| +{ |
| +#ifdef GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC |
| + free (ptr); |
| +#else |
| + if (ptr) |
| + free (((void **) ptr)[-1]); |
| +#endif |
| +} |
| |
| |
| @@ -219,6 +219,7 @@ m4_include([plugin/configfrag.ac]) |
| |
| # Check for functions needed. |
| AC_CHECK_FUNCS(getloadavg clock_gettime strtoull) |
| +AC_CHECK_FUNCS(aligned_alloc posix_memalign memalign _aligned_malloc) |
| |
| # Check for broken semaphore implementation on darwin. |
| # sem_init returns: sem_init error: Function not implemented. |
| @@ -266,6 +267,41 @@ if test $ac_cv_func_clock_gettime = no; |
| [Define to 1 if you have the `clock_gettime' function.])]) |
| fi |
| |
| +# Check for uname and the nodename field of struct utsname. |
| +AC_COMPILE_IFELSE( |
| + [AC_LANG_PROGRAM( |
| + [#include <string.h> |
| + #include <stdlib.h> |
| + #include <sys/utsname.h>], |
| + [struct utsname buf; |
| + volatile size_t len = 0; |
| + if (!uname (&buf)) |
| + len = strlen (buf.nodename);])], |
| + AC_DEFINE(HAVE_UNAME, 1, |
| +[ Define if uname is supported and struct utsname has nodename field.])) |
| + |
| +# Check for gethostname. |
| +AC_COMPILE_IFELSE( |
| + [AC_LANG_PROGRAM( |
| + [#include <unistd.h>], |
| + [ |
| +changequote(,)dnl |
| + char buf[256]; |
| + if (gethostname (buf, sizeof (buf) - 1) == 0) |
| + buf[255] = '\0'; |
| +changequote([,])dnl |
| + ])], |
| + AC_DEFINE(HAVE_GETHOSTNAME, 1, |
| +[ Define if gethostname is supported.])) |
| + |
| +# Check for getpid. |
| +AC_COMPILE_IFELSE( |
| + [AC_LANG_PROGRAM( |
| + [#include <unistd.h>], |
| + [int pid = getpid ();])], |
| + AC_DEFINE(HAVE_GETPID, 1, |
| +[ Define if getpid is supported.])) |
| + |
| # See if we support thread-local storage. |
| GCC_CHECK_TLS |
| |
| |
| |
| @@ -69,7 +69,7 @@ void |
| omp_set_schedule (omp_sched_t kind, int chunk_size) |
| { |
| struct gomp_task_icv *icv = gomp_icv (true); |
| - switch (kind) |
| + switch (kind & ~omp_sched_monotonic) |
| { |
| case omp_sched_static: |
| if (chunk_size < 1) |
| |
| |
| @@ -636,6 +636,8 @@ PLUGIN_NVPTX_FALSE |
| PLUGIN_NVPTX_TRUE |
| offload_additional_lib_paths |
| offload_additional_options |
| +offload_targets |
| +offload_plugins |
| PLUGIN_HSA_LIBS |
| PLUGIN_HSA_LDFLAGS |
| PLUGIN_HSA_CPPFLAGS |
| @@ -648,7 +650,6 @@ PLUGIN_NVPTX_CPPFLAGS |
| PLUGIN_NVPTX |
| CUDA_DRIVER_LIB |
| CUDA_DRIVER_INCLUDE |
| -offload_targets |
| libtool_VERSION |
| ac_ct_FC |
| FCFLAGS |
| @@ -11157,7 +11158,7 @@ else |
| lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 |
| lt_status=$lt_dlunknown |
| cat > conftest.$ac_ext <<_LT_EOF |
| -#line 11160 "configure" |
| +#line 11161 "configure" |
| #include "confdefs.h" |
| |
| #if HAVE_DLFCN_H |
| @@ -11263,7 +11264,7 @@ else |
| lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 |
| lt_status=$lt_dlunknown |
| cat > conftest.$ac_ext <<_LT_EOF |
| -#line 11266 "configure" |
| +#line 11267 "configure" |
| #include "confdefs.h" |
| |
| #if HAVE_DLFCN_H |
| @@ -15167,8 +15168,6 @@ fi |
| # see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| # <http://www.gnu.org/licenses/>. |
| |
| -offload_targets= |
| - |
| plugin_support=yes |
| { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dlsym in -ldl" >&5 |
| $as_echo_n "checking for dlsym in -ldl... " >&6; } |
| @@ -15302,7 +15301,11 @@ if test "${with_cuda_driver_lib+set}" = |
| fi |
| |
| case "x$with_cuda_driver" in |
| - x | xno) ;; |
| + x) ;; |
| + xno) |
| + CUDA_DRIVER_INCLUDE=no |
| + CUDA_DRIVER_LIB=no |
| + ;; |
| *) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include |
| CUDA_DRIVER_LIB=$with_cuda_driver/lib |
| ;; |
| @@ -15313,10 +15316,12 @@ fi |
| if test "x$with_cuda_driver_lib" != x; then |
| CUDA_DRIVER_LIB=$with_cuda_driver_lib |
| fi |
| -if test "x$CUDA_DRIVER_INCLUDE" != x; then |
| +if test "x$CUDA_DRIVER_INCLUDE" != x \ |
| + && test "x$CUDA_DRIVER_INCLUDE" != xno; then |
| CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE |
| fi |
| -if test "x$CUDA_DRIVER_LIB" != x; then |
| +if test "x$CUDA_DRIVER_LIB" != x \ |
| + && test "x$CUDA_DRIVER_LIB" != xno; then |
| CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB |
| fi |
| |
| @@ -15383,7 +15388,13 @@ PLUGIN_HSA_LIBS= |
| |
| |
| |
| -# Get offload targets and path to install tree of offloading compiler. |
| +# Parse '--enable-offload-targets', figure out the corresponding libgomp |
| +# plugins, and configure to find the corresponding offload compilers. |
| +# 'offload_plugins' and 'offload_targets' will be populated in the same order. |
| +offload_plugins= |
| +offload_targets= |
| + |
| + |
| offload_additional_options= |
| offload_additional_lib_paths= |
| |
| @@ -15392,25 +15403,27 @@ if test x"$enable_offload_targets" != x; |
| for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do |
| tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'` |
| tgt=`echo $tgt | sed 's/=.*//'` |
| - tgt_name= |
| + tgt_plugin= |
| case $tgt in |
| *-intelmic-* | *-intelmicemul-*) |
| - tgt_name=intelmic |
| + tgt_plugin=intelmic |
| ;; |
| nvptx*) |
| - tgt_name=nvptx |
| + tgt_plugin=nvptx |
| PLUGIN_NVPTX=$tgt |
| - PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS |
| - PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS |
| - PLUGIN_NVPTX_LIBS='-lcuda' |
| - |
| - PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS |
| - CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" |
| - PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS |
| - LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" |
| - PLUGIN_NVPTX_save_LIBS=$LIBS |
| - LIBS="$PLUGIN_NVPTX_LIBS $LIBS" |
| - cat confdefs.h - <<_ACEOF >conftest.$ac_ext |
| + if test "x$CUDA_DRIVER_INCLUDE" != xno \ |
| + && test "x$CUDA_DRIVER_LIB" != xno; then |
| + PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS |
| + PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS |
| + PLUGIN_NVPTX_LIBS='-lcuda' |
| + |
| + PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS |
| + CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" |
| + PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS |
| + LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" |
| + PLUGIN_NVPTX_save_LIBS=$LIBS |
| + LIBS="$PLUGIN_NVPTX_LIBS $LIBS" |
| + cat confdefs.h - <<_ACEOF >conftest.$ac_ext |
| /* end confdefs.h. */ |
| #include "cuda.h" |
| int |
| @@ -15426,13 +15439,16 @@ if ac_fn_c_try_link "$LINENO"; then : |
| fi |
| rm -f core conftest.err conftest.$ac_objext \ |
| conftest$ac_exeext conftest.$ac_ext |
| - CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS |
| - LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS |
| - LIBS=$PLUGIN_NVPTX_save_LIBS |
| + CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS |
| + LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS |
| + LIBS=$PLUGIN_NVPTX_save_LIBS |
| + fi |
| case $PLUGIN_NVPTX in |
| nvptx*) |
| - if test "x$CUDA_DRIVER_INCLUDE" = x \ |
| - && test "x$CUDA_DRIVER_LIB" = x; then |
| + if (test "x$CUDA_DRIVER_INCLUDE" = x \ |
| + || test "x$CUDA_DRIVER_INCLUDE" = xno) \ |
| + && (test "x$CUDA_DRIVER_LIB" = x \ |
| + || test "x$CUDA_DRIVER_LIB" = xno); then |
| PLUGIN_NVPTX=1 |
| PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda' |
| PLUGIN_NVPTX_LIBS='-ldl' |
| @@ -15452,7 +15468,7 @@ rm -f core conftest.err conftest.$ac_obj |
| PLUGIN_HSA=0 |
| ;; |
| *) |
| - tgt_name=hsa |
| + tgt_plugin=hsa |
| PLUGIN_HSA=$tgt |
| PLUGIN_HSA_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS |
| PLUGIN_HSA_LDFLAGS="$HSA_RUNTIME_LDFLAGS" |
| @@ -15470,7 +15486,7 @@ rm -f core conftest.err conftest.$ac_obj |
| LDFLAGS=$PLUGIN_HSA_save_LDFLAGS |
| LIBS=$PLUGIN_HSA_save_LIBS |
| case $PLUGIN_HSA in |
| - hsa*) |
| + hsa*) |
| HSA_PLUGIN=0 |
| as_fn_error "HSA run-time package required for HSA support" "$LINENO" 5 |
| ;; |
| @@ -15487,16 +15503,19 @@ rm -f core conftest.err conftest.$ac_obj |
| as_fn_error "unknown offload target specified" "$LINENO" 5 |
| ;; |
| esac |
| - if test x"$tgt_name" = x; then |
| - # Don't configure libgomp for this offloading target if we don't build |
| - # the corresponding plugin. |
| + if test x"$tgt_plugin" = x; then |
| + # Not configuring libgomp for this offload target if we're not building |
| + # the corresponding offload plugin. |
| continue |
| - elif test x"$offload_targets" = x; then |
| - offload_targets=$tgt_name |
| + elif test x"$offload_plugins" = x; then |
| + offload_plugins=$tgt_plugin |
| + offload_targets=$tgt |
| else |
| - offload_targets=$offload_targets,$tgt_name |
| + offload_plugins=$offload_plugins,$tgt_plugin |
| + offload_targets=$offload_targets,$tgt |
| fi |
| - if test "$tgt_name" = hsa; then |
| + # Configure additional search paths. |
| + if test "$tgt_plugin" = hsa; then |
| # Offloading compilation is all handled by the target compiler. |
| : |
| elif test x"$tgt_dir" != x; then |
| @@ -15510,7 +15529,7 @@ rm -f core conftest.err conftest.$ac_obj |
| fi |
| |
| cat >>confdefs.h <<_ACEOF |
| -#define OFFLOAD_TARGETS "$offload_targets" |
| +#define OFFLOAD_PLUGINS "$offload_plugins" |
| _ACEOF |
| |
| if test $PLUGIN_NVPTX = 1; then |
| @@ -15570,6 +15589,19 @@ _ACEOF |
| fi |
| done |
| |
| +for ac_func in aligned_alloc posix_memalign memalign _aligned_malloc |
| +do : |
| + as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` |
| +ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" |
| +eval as_val=\$$as_ac_var |
| + if test "x$as_val" = x""yes; then : |
| + cat >>confdefs.h <<_ACEOF |
| +#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1 |
| +_ACEOF |
| + |
| +fi |
| +done |
| + |
| |
| # Check for broken semaphore implementation on darwin. |
| # sem_init returns: sem_init error: Function not implemented. |
| @@ -15784,6 +15816,72 @@ fi |
| |
| fi |
| |
| +# Check for uname and that struct utsname has a nodename member. |
| +cat confdefs.h - <<_ACEOF >conftest.$ac_ext |
| +/* end confdefs.h. */ |
| +#include <string.h> |
| + #include <stdlib.h> |
| + #include <sys/utsname.h> |
| +int |
| +main () |
| +{ |
| +struct utsname buf; |
| + volatile size_t len = 0; |
| + if (!uname (&buf)) |
| + len = strlen (buf.nodename); |
| + ; |
| + return 0; |
| +} |
| +_ACEOF |
| +if ac_fn_c_try_compile "$LINENO"; then : |
| + |
| +$as_echo "#define HAVE_UNAME 1" >>confdefs.h |
| + |
| +fi |
| +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext |
| + |
| +# Check for gethostname. |
| +cat confdefs.h - <<_ACEOF >conftest.$ac_ext |
| +/* end confdefs.h. */ |
| +#include <unistd.h> |
| +int |
| +main () |
| +{ |
| + |
| + char buf[256]; |
| + if (gethostname (buf, sizeof (buf) - 1) == 0) |
| + buf[255] = '\0'; |
| + |
| + ; |
| + return 0; |
| +} |
| +_ACEOF |
| +if ac_fn_c_try_compile "$LINENO"; then : |
| + |
| +$as_echo "#define HAVE_GETHOSTNAME 1" >>confdefs.h |
| + |
| +fi |
| +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext |
| + |
| +# Check for getpid. |
| +cat confdefs.h - <<_ACEOF >conftest.$ac_ext |
| +/* end confdefs.h. */ |
| +#include <unistd.h> |
| +int |
| +main () |
| +{ |
| +int pid = getpid (); |
| + ; |
| + return 0; |
| +} |
| +_ACEOF |
| +if ac_fn_c_try_compile "$LINENO"; then : |
| + |
| +$as_echo "#define HAVE_GETPID 1" >>confdefs.h |
| + |
| +fi |
| +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext |
| + |
| # See if we support thread-local storage. |
| |
| |
| |
| |
| @@ -63,12 +63,13 @@ libgomp_la_SOURCES = alloc.c atomic.c ba |
| parallel.c sections.c single.c task.c team.c work.c lock.c mutex.c \ |
| proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \ |
| splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \ |
| - oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c |
| + oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \ |
| + affinity-fmt.c teams.c |
| |
| include $(top_srcdir)/plugin/Makefrag.am |
| |
| if USE_FORTRAN |
| -libgomp_la_SOURCES += openacc.f90 |
| +libgomp_la_SOURCES += openacc2.f90 |
| endif |
| |
| nodist_noinst_HEADERS = libgomp_f.h |
| @@ -87,8 +88,6 @@ omp_lib_kinds.mod: omp_lib.mod |
| : |
| openacc_kinds.mod: openacc.mod |
| : |
| -openacc.mod: openacc.lo |
| - : |
| %.mod: %.f90 |
| $(FC) $(FCFLAGS) -fsyntax-only $< |
| fortran.lo: libgomp_f.h |
| |
| |
| @@ -153,8 +153,9 @@ acc_free (void *d) |
| gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); |
| } |
| |
| -void |
| -acc_memcpy_to_device (void *d, void *h, size_t s) |
| +static void |
| +memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, |
| + const char *libfnname) |
| { |
| /* No need to call lazy open here, as the device pointer must have |
| been obtained from a routine that did that. */ |
| @@ -164,31 +165,49 @@ acc_memcpy_to_device (void *d, void *h, |
| |
| if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) |
| { |
| - memmove (d, h, s); |
| + if (from) |
| + memmove (h, d, s); |
| + else |
| + memmove (d, h, s); |
| return; |
| } |
| |
| - if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s)) |
| - gomp_fatal ("error in %s", __FUNCTION__); |
| + if (async > acc_async_sync) |
| + thr->dev->openacc.async_set_async_func (async); |
| + |
| + bool ret = (from |
| + ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s) |
| + : thr->dev->host2dev_func (thr->dev->target_id, d, h, s)); |
| + |
| + if (async > acc_async_sync) |
| + thr->dev->openacc.async_set_async_func (acc_async_sync); |
| + |
| + if (!ret) |
| + gomp_fatal ("error in %s", libfnname); |
| } |
| |
| void |
| -acc_memcpy_from_device (void *h, void *d, size_t s) |
| +acc_memcpy_to_device (void *d, void *h, size_t s) |
| { |
| - /* No need to call lazy open here, as the device pointer must have |
| - been obtained from a routine that did that. */ |
| - struct goacc_thread *thr = goacc_thread (); |
| + memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__); |
| +} |
| |
| - assert (thr && thr->dev); |
| +void |
| +acc_memcpy_to_device_async (void *d, void *h, size_t s, int async) |
| +{ |
| + memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__); |
| +} |
| |
| - if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) |
| - { |
| - memmove (h, d, s); |
| - return; |
| - } |
| +void |
| +acc_memcpy_from_device (void *h, void *d, size_t s) |
| +{ |
| + memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__); |
| +} |
| |
| - if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s)) |
| - gomp_fatal ("error in %s", __FUNCTION__); |
| +void |
| +acc_memcpy_from_device_async (void *h, void *d, size_t s, int async) |
| +{ |
| + memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__); |
| } |
| |
| /* Return the device pointer that corresponds to host data H. Or NULL |
| @@ -347,6 +366,7 @@ acc_map_data (void *h, void *d, size_t s |
| |
| tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes, |
| &kinds, true, GOMP_MAP_VARS_OPENACC); |
| + tgt->list[0].key->refcount = REFCOUNT_INFINITY; |
| } |
| |
| gomp_mutex_lock (&acc_dev->lock); |
| @@ -389,6 +409,9 @@ acc_unmap_data (void *h) |
| (void *) n->host_start, (int) host_size, (void *) h); |
| } |
| |
| + /* Mark for removal. */ |
| + n->refcount = 1; |
| + |
| t = n->tgt; |
| |
| if (t->refcount == 2) |
| @@ -424,7 +447,7 @@ acc_unmap_data (void *h) |
| #define FLAG_COPY (1 << 2) |
| |
| static void * |
| -present_create_copy (unsigned f, void *h, size_t s) |
| +present_create_copy (unsigned f, void *h, size_t s, int async) |
| { |
| void *d; |
| splay_tree_key n; |
| @@ -460,6 +483,11 @@ present_create_copy (unsigned f, void *h |
| gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s); |
| } |
| |
| + if (n->refcount != REFCOUNT_INFINITY) |
| + { |
| + n->refcount++; |
| + n->dynamic_refcount++; |
| + } |
| gomp_mutex_unlock (&acc_dev->lock); |
| } |
| else if (!(f & FLAG_CREATE)) |
| @@ -481,8 +509,16 @@ present_create_copy (unsigned f, void *h |
| |
| gomp_mutex_unlock (&acc_dev->lock); |
| |
| + if (async > acc_async_sync) |
| + acc_dev->openacc.async_set_async_func (async); |
| + |
| tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true, |
| GOMP_MAP_VARS_OPENACC); |
| + /* Initialize dynamic refcount. */ |
| + tgt->list[0].key->dynamic_refcount = 1; |
| + |
| + if (async > acc_async_sync) |
| + acc_dev->openacc.async_set_async_func (acc_async_sync); |
| |
| gomp_mutex_lock (&acc_dev->lock); |
| |
| @@ -499,53 +535,71 @@ present_create_copy (unsigned f, void *h |
| void * |
| acc_create (void *h, size_t s) |
| { |
| - return present_create_copy (FLAG_CREATE, h, s); |
| + return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync); |
| } |
| |
| -void * |
| -acc_copyin (void *h, size_t s) |
| +void |
| +acc_create_async (void *h, size_t s, int async) |
| { |
| - return present_create_copy (FLAG_CREATE | FLAG_COPY, h, s); |
| + present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async); |
| } |
| |
| +/* acc_present_or_create used to be what acc_create is now. */ |
| +/* acc_pcreate is acc_present_or_create by a different name. */ |
| +#ifdef HAVE_ATTRIBUTE_ALIAS |
| +strong_alias (acc_create, acc_present_or_create) |
| +strong_alias (acc_create, acc_pcreate) |
| +#else |
| void * |
| acc_present_or_create (void *h, size_t s) |
| { |
| - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); |
| + return acc_create (h, s); |
| } |
| |
| -/* acc_pcreate is acc_present_or_create by a different name. */ |
| -#ifdef HAVE_ATTRIBUTE_ALIAS |
| -strong_alias (acc_present_or_create, acc_pcreate) |
| -#else |
| void * |
| acc_pcreate (void *h, size_t s) |
| { |
| - return acc_present_or_create (h, s); |
| + return acc_create (h, s); |
| } |
| #endif |
| |
| void * |
| -acc_present_or_copyin (void *h, size_t s) |
| +acc_copyin (void *h, size_t s) |
| +{ |
| + return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, |
| + acc_async_sync); |
| +} |
| + |
| +void |
| +acc_copyin_async (void *h, size_t s, int async) |
| { |
| - return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); |
| + present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async); |
| } |
| |
| +/* acc_present_or_copyin used to be what acc_copyin is now. */ |
| /* acc_pcopyin is acc_present_or_copyin by a different name. */ |
| #ifdef HAVE_ATTRIBUTE_ALIAS |
| -strong_alias (acc_present_or_copyin, acc_pcopyin) |
| +strong_alias (acc_copyin, acc_present_or_copyin) |
| +strong_alias (acc_copyin, acc_pcopyin) |
| #else |
| void * |
| +acc_present_or_copyin (void *h, size_t s) |
| +{ |
| + return acc_copyin (h, s); |
| +} |
| + |
| +void * |
| acc_pcopyin (void *h, size_t s) |
| { |
| - return acc_present_or_copyin (h, s); |
| + return acc_copyin (h, s); |
| } |
| #endif |
| |
| -#define FLAG_COPYOUT (1 << 0) |
| +#define FLAG_COPYOUT (1 << 0) |
| +#define FLAG_FINALIZE (1 << 1) |
| |
| static void |
| -delete_copyout (unsigned f, void *h, size_t s, const char *libfnname) |
| +delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname) |
| { |
| size_t host_size; |
| splay_tree_key n; |
| @@ -581,31 +635,111 @@ delete_copyout (unsigned f, void *h, siz |
| (void *) n->host_start, (int) host_size, (void *) h, (int) s); |
| } |
| |
| - gomp_mutex_unlock (&acc_dev->lock); |
| + if (n->refcount == REFCOUNT_INFINITY) |
| + { |
| + n->refcount = 0; |
| + n->dynamic_refcount = 0; |
| + } |
| + if (n->refcount < n->dynamic_refcount) |
| + { |
| + gomp_mutex_unlock (&acc_dev->lock); |
| + gomp_fatal ("Dynamic reference counting assert fail\n"); |
| + } |
| |
| - if (f & FLAG_COPYOUT) |
| - acc_dev->dev2host_func (acc_dev->target_id, h, d, s); |
| + if (f & FLAG_FINALIZE) |
| + { |
| + n->refcount -= n->dynamic_refcount; |
| + n->dynamic_refcount = 0; |
| + } |
| + else if (n->dynamic_refcount) |
| + { |
| + n->dynamic_refcount--; |
| + n->refcount--; |
| + } |
| + |
| + if (n->refcount == 0) |
| + { |
| + if (n->tgt->refcount == 2) |
| + { |
| + struct target_mem_desc *tp, *t; |
| + for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL; |
| + tp = t, t = t->prev) |
| + if (n->tgt == t) |
| + { |
| + if (tp) |
| + tp->prev = t->prev; |
| + else |
| + acc_dev->openacc.data_environ = t->prev; |
| + break; |
| + } |
| + } |
| + |
| + if (f & FLAG_COPYOUT) |
| + { |
| + if (async > acc_async_sync) |
| + acc_dev->openacc.async_set_async_func (async); |
| + acc_dev->dev2host_func (acc_dev->target_id, h, d, s); |
| + if (async > acc_async_sync) |
| + acc_dev->openacc.async_set_async_func (acc_async_sync); |
| + } |
| |
| - acc_unmap_data (h); |
| + gomp_remove_var (acc_dev, n); |
| + } |
| |
| - if (!acc_dev->free_func (acc_dev->target_id, d)) |
| - gomp_fatal ("error in freeing device memory in %s", libfnname); |
| + gomp_mutex_unlock (&acc_dev->lock); |
| } |
| |
| void |
| acc_delete (void *h , size_t s) |
| { |
| - delete_copyout (0, h, s, __FUNCTION__); |
| + delete_copyout (0, h, s, acc_async_sync, __FUNCTION__); |
| +} |
| + |
| +void |
| +acc_delete_async (void *h , size_t s, int async) |
| +{ |
| + delete_copyout (0, h, s, async, __FUNCTION__); |
| +} |
| + |
| +void |
| +acc_delete_finalize (void *h , size_t s) |
| +{ |
| + delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__); |
| +} |
| + |
| +void |
| +acc_delete_finalize_async (void *h , size_t s, int async) |
| +{ |
| + delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__); |
| } |
| |
| void |
| acc_copyout (void *h, size_t s) |
| { |
| - delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__); |
| + delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__); |
| +} |
| + |
| +void |
| +acc_copyout_async (void *h, size_t s, int async) |
| +{ |
| + delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__); |
| +} |
| + |
| +void |
| +acc_copyout_finalize (void *h, size_t s) |
| +{ |
| + delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, acc_async_sync, |
| + __FUNCTION__); |
| +} |
| + |
| +void |
| +acc_copyout_finalize_async (void *h, size_t s, int async) |
| +{ |
| + delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__); |
| } |
| |
| static void |
| -update_dev_host (int is_dev, void *h, size_t s) |
| +update_dev_host (int is_dev, void *h, size_t s, int async) |
| { |
| splay_tree_key n; |
| void *d; |
| @@ -631,24 +765,42 @@ update_dev_host (int is_dev, void *h, si |
| d = (void *) (n->tgt->tgt_start + n->tgt_offset |
| + (uintptr_t) h - n->host_start); |
| |
| + if (async > acc_async_sync) |
| + acc_dev->openacc.async_set_async_func (async); |
| + |
| if (is_dev) |
| acc_dev->host2dev_func (acc_dev->target_id, d, h, s); |
| else |
| acc_dev->dev2host_func (acc_dev->target_id, h, d, s); |
| |
| + if (async > acc_async_sync) |
| + acc_dev->openacc.async_set_async_func (acc_async_sync); |
| + |
| gomp_mutex_unlock (&acc_dev->lock); |
| } |
| |
| void |
| acc_update_device (void *h, size_t s) |
| { |
| - update_dev_host (1, h, s); |
| + update_dev_host (1, h, s, acc_async_sync); |
| +} |
| + |
| +void |
| +acc_update_device_async (void *h, size_t s, int async) |
| +{ |
| + update_dev_host (1, h, s, async); |
| } |
| |
| void |
| acc_update_self (void *h, size_t s) |
| { |
| - update_dev_host (0, h, s); |
| + update_dev_host (0, h, s, acc_async_sync); |
| +} |
| + |
| +void |
| +acc_update_self_async (void *h, size_t s, int async) |
| +{ |
| + update_dev_host (0, h, s, async); |
| } |
| |
| void |
| @@ -659,11 +811,37 @@ gomp_acc_insert_pointer (size_t mapnum, |
| struct goacc_thread *thr = goacc_thread (); |
| struct gomp_device_descr *acc_dev = thr->dev; |
| |
| + if (acc_is_present (*hostaddrs, *sizes)) |
| + { |
| + splay_tree_key n; |
| + gomp_mutex_lock (&acc_dev->lock); |
| + n = lookup_host (acc_dev, *hostaddrs, *sizes); |
| + gomp_mutex_unlock (&acc_dev->lock); |
| + |
| + tgt = n->tgt; |
| + for (size_t i = 0; i < tgt->list_count; i++) |
| + if (tgt->list[i].key == n) |
| + { |
| + for (size_t j = 0; j < mapnum; j++) |
| + if (i + j < tgt->list_count && tgt->list[i + j].key) |
| + { |
| + tgt->list[i + j].key->refcount++; |
| + tgt->list[i + j].key->dynamic_refcount++; |
| + } |
| + return; |
| + } |
| + /* Should not reach here. */ |
| + gomp_fatal ("Dynamic refcount incrementing failed for pointer/pset"); |
| + } |
| + |
| gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__); |
| tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, |
| NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC); |
| gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__); |
| |
| + /* Initialize dynamic refcount. */ |
| + tgt->list[0].key->dynamic_refcount = 1; |
| + |
| gomp_mutex_lock (&acc_dev->lock); |
| tgt->prev = acc_dev->openacc.data_environ; |
| acc_dev->openacc.data_environ = tgt; |
| @@ -671,7 +849,8 @@ gomp_acc_insert_pointer (size_t mapnum, |
| } |
| |
| void |
| -gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum) |
| +gomp_acc_remove_pointer (void *h, size_t s, bool force_copyfrom, int async, |
| + int finalize, int mapnum) |
| { |
| struct goacc_thread *thr = goacc_thread (); |
| struct gomp_device_descr *acc_dev = thr->dev; |
| @@ -679,6 +858,9 @@ gomp_acc_remove_pointer (void *h, bool f |
| struct target_mem_desc *t; |
| int minrefs = (mapnum == 1) ? 2 : 3; |
| |
| + if (!acc_is_present (h, s)) |
| + return; |
| + |
| gomp_mutex_lock (&acc_dev->lock); |
| |
| n = lookup_host (acc_dev, h, 1); |
| @@ -693,40 +875,65 @@ gomp_acc_remove_pointer (void *h, bool f |
| |
| t = n->tgt; |
| |
| - struct target_mem_desc *tp; |
| + if (n->refcount < n->dynamic_refcount) |
| + { |
| + gomp_mutex_unlock (&acc_dev->lock); |
| + gomp_fatal ("Dynamic reference counting assert fail\n"); |
| + } |
| |
| - if (t->refcount == minrefs) |
| + if (finalize) |
| { |
| - /* This is the last reference, so pull the descriptor off the |
| - chain. This avoids gomp_unmap_vars via gomp_unmap_tgt from |
| - freeing the device memory. */ |
| - t->tgt_end = 0; |
| - t->to_free = 0; |
| + n->refcount -= n->dynamic_refcount; |
| + n->dynamic_refcount = 0; |
| + } |
| + else if (n->dynamic_refcount) |
| + { |
| + n->dynamic_refcount--; |
| + n->refcount--; |
| + } |
| |
| - for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL; |
| - tp = t, t = t->prev) |
| + gomp_mutex_unlock (&acc_dev->lock); |
| + |
| + if (n->refcount == 0) |
| + { |
| + if (t->refcount == minrefs) |
| { |
| - if (n->tgt == t) |
| + /* This is the last reference, so pull the descriptor off the |
| + chain. This prevents gomp_unmap_vars via gomp_unmap_tgt from |
| + freeing the device memory. */ |
| + struct target_mem_desc *tp; |
| + for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL; |
| + tp = t, t = t->prev) |
| { |
| - if (tp) |
| - tp->prev = t->prev; |
| - else |
| - acc_dev->openacc.data_environ = t->prev; |
| - break; |
| + if (n->tgt == t) |
| + { |
| + if (tp) |
| + tp->prev = t->prev; |
| + else |
| + acc_dev->openacc.data_environ = t->prev; |
| + break; |
| + } |
| } |
| } |
| - } |
| |
| - if (force_copyfrom) |
| - t->list[0].copy_from = 1; |
| + /* Set refcount to 1 to allow gomp_unmap_vars to unmap it. */ |
| + n->refcount = 1; |
| + t->refcount = minrefs; |
| + for (size_t i = 0; i < t->list_count; i++) |
| + if (t->list[i].key == n) |
| + { |
| + t->list[i].copy_from = force_copyfrom ? 1 : 0; |
| + break; |
| + } |
| |
| - gomp_mutex_unlock (&acc_dev->lock); |
| + /* If running synchronously, unmap immediately. */ |
| + if (async < acc_async_noval) |
| + gomp_unmap_vars (t, true); |
| + else |
| + t->device_descr->openacc.register_async_cleanup_func (t, async); |
| + } |
| |
| - /* If running synchronously, unmap immediately. */ |
| - if (async < acc_async_noval) |
| - gomp_unmap_vars (t, true); |
| - else |
| - t->device_descr->openacc.register_async_cleanup_func (t, async); |
| + /* ACC_DEV->lock was already released above; unlocking again would be a double unlock. */ |
| |
| gomp_debug (0, " %s: mappings restored\n", __FUNCTION__); |
| } |
| |
| |
| @@ -88,8 +88,12 @@ void **gomp_places_list; |
| unsigned long gomp_places_list_len; |
| int gomp_debug_var; |
| unsigned int gomp_num_teams_var; |
| +bool gomp_display_affinity_var; |
| +char *gomp_affinity_format_var = "level %L thread %i affinity %A"; |
| +size_t gomp_affinity_format_len; |
| char *goacc_device_type; |
| int goacc_device_num; |
| +int goacc_default_dims[GOMP_DIM_MAX]; |
| |
| #ifndef LIBGOMP_OFFLOADED_ONLY |
| |
| @@ -100,6 +104,7 @@ parse_schedule (void) |
| { |
| char *env, *end; |
| unsigned long value; |
| + int monotonic = 0; |
| |
| env = getenv ("OMP_SCHEDULE"); |
| if (env == NULL) |
| @@ -107,6 +112,26 @@ parse_schedule (void) |
| |
| while (isspace ((unsigned char) *env)) |
| ++env; |
| + if (strncasecmp (env, "monotonic", 9) == 0) |
| + { |
| + monotonic = 1; |
| + env += 9; |
| + } |
| + else if (strncasecmp (env, "nonmonotonic", 12) == 0) |
| + { |
| + monotonic = -1; |
| + env += 12; |
| + } |
| + if (monotonic) |
| + { |
| + while (isspace ((unsigned char) *env)) |
| + ++env; |
| + if (*env != ':') |
| + goto unknown; |
| + ++env; |
| + while (isspace ((unsigned char) *env)) |
| + ++env; |
| + } |
| if (strncasecmp (env, "static", 6) == 0) |
| { |
| gomp_global_icv.run_sched_var = GFS_STATIC; |
| @@ -130,12 +155,16 @@ parse_schedule (void) |
| else |
| goto unknown; |
| |
| + if (monotonic == 1 |
| + || (monotonic == 0 && gomp_global_icv.run_sched_var == GFS_STATIC)) |
| + gomp_global_icv.run_sched_var |= GFS_MONOTONIC; |
| + |
| while (isspace ((unsigned char) *env)) |
| ++env; |
| if (*env == '\0') |
| { |
| gomp_global_icv.run_sched_chunk_size |
| - = gomp_global_icv.run_sched_var != GFS_STATIC; |
| + = (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) != GFS_STATIC; |
| return; |
| } |
| if (*env++ != ',') |
| @@ -158,7 +187,8 @@ parse_schedule (void) |
| if ((int)value != value) |
| goto invalid; |
| |
| - if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC) |
| + if (value == 0 |
| + && (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) != GFS_STATIC) |
| value = 1; |
| gomp_global_icv.run_sched_chunk_size = value; |
| return; |
| @@ -1066,6 +1096,36 @@ parse_acc_device_type (void) |
| } |
| |
| static void |
| +parse_gomp_openacc_dim (void) |
| +{ |
| + /* The syntax is the same as for the -fopenacc-dim compilation option. */ |
| + const char *var_name = "GOMP_OPENACC_DIM"; |
| + const char *env_var = getenv (var_name); |
| + if (!env_var) |
| + return; |
| + |
| + const char *pos = env_var; |
| + int i; |
| + for (i = 0; *pos && i != GOMP_DIM_MAX; i++) |
| + { |
| + if (i && *pos++ != ':') |
| + break; |
| + |
| + if (*pos == ':') |
| + continue; |
| + |
| + const char *eptr; |
| + errno = 0; |
| + long val = strtol (pos, (char **)&eptr, 10); |
| + if (errno || val < 0 || (unsigned)val != val) |
| + break; |
| + |
| + goacc_default_dims[i] = (int)val; |
| + pos = eptr; |
| + } |
| +} |
| + |
| +static void |
| handle_omp_display_env (unsigned long stacksize, int wait_policy) |
| { |
| const char *env; |
| @@ -1119,19 +1179,34 @@ handle_omp_display_env (unsigned long st |
| fputs ("'\n", stderr); |
| |
| fprintf (stderr, " OMP_SCHEDULE = '"); |
| - switch (gomp_global_icv.run_sched_var) |
| + if ((gomp_global_icv.run_sched_var & GFS_MONOTONIC)) |
| + { |
| + if (gomp_global_icv.run_sched_var != (GFS_MONOTONIC | GFS_STATIC)) |
| + fputs ("MONOTONIC:", stderr); |
| + } |
| + else if (gomp_global_icv.run_sched_var == GFS_STATIC) |
| + fputs ("NONMONOTONIC:", stderr); |
| + switch (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) |
| { |
| case GFS_RUNTIME: |
| fputs ("RUNTIME", stderr); |
| + if (gomp_global_icv.run_sched_chunk_size != 1) |
| + fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); |
| break; |
| case GFS_STATIC: |
| fputs ("STATIC", stderr); |
| + if (gomp_global_icv.run_sched_chunk_size != 0) |
| + fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); |
| break; |
| case GFS_DYNAMIC: |
| fputs ("DYNAMIC", stderr); |
| + if (gomp_global_icv.run_sched_chunk_size != 1) |
| + fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); |
| break; |
| case GFS_GUIDED: |
| fputs ("GUIDED", stderr); |
| + if (gomp_global_icv.run_sched_chunk_size != 1) |
| + fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); |
| break; |
| case GFS_AUTO: |
| fputs ("AUTO", stderr); |
| @@ -1197,6 +1272,10 @@ handle_omp_display_env (unsigned long st |
| gomp_global_icv.default_device_var); |
| fprintf (stderr, " OMP_MAX_TASK_PRIORITY = '%d'\n", |
| gomp_max_task_priority_var); |
| + fprintf (stderr, " OMP_DISPLAY_AFFINITY = '%s'\n", |
| + gomp_display_affinity_var ? "TRUE" : "FALSE"); |
| + fprintf (stderr, " OMP_AFFINITY_FORMAT = '%s'\n", |
| + gomp_affinity_format_var); |
| |
| if (verbose) |
| { |
| @@ -1228,6 +1307,7 @@ initialize_env (void) |
| parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var); |
| parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var); |
| parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var); |
| + parse_boolean ("OMP_DISPLAY_AFFINITY", &gomp_display_affinity_var); |
| parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true); |
| parse_int ("OMP_MAX_TASK_PRIORITY", &gomp_max_task_priority_var, true); |
| parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var, |
| @@ -1277,6 +1357,13 @@ initialize_env (void) |
| } |
| if (gomp_global_icv.bind_var != omp_proc_bind_false) |
| gomp_init_affinity (); |
| + |
| + { |
| + const char *env = getenv ("OMP_AFFINITY_FORMAT"); |
| + if (env != NULL) |
| + gomp_set_affinity_format (env, strlen (env)); |
| + } |
| + |
| wait_policy = parse_wait_policy (); |
| if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var)) |
| { |
| @@ -1302,7 +1389,6 @@ initialize_env (void) |
| |
| /* Not strictly environment related, but ordering constructors is tricky. */ |
| pthread_attr_init (&gomp_thread_attr); |
| - pthread_attr_setdetachstate (&gomp_thread_attr, PTHREAD_CREATE_DETACHED); |
| |
| if (parse_stacksize ("OMP_STACKSIZE", &stacksize) |
| || parse_stacksize ("GOMP_STACKSIZE", &stacksize) |
| @@ -1336,6 +1422,7 @@ initialize_env (void) |
| goacc_device_num = 0; |
| |
| parse_acc_device_type (); |
| + parse_gomp_openacc_dim (); |
| |
| goacc_runtime_initialize (); |
| } |
| |
| |
| @@ -28,6 +28,8 @@ |
| #include "libgomp.h" |
| #include "libgomp_f.h" |
| #include <stdlib.h> |
| +#include <stdio.h> |
| +#include <string.h> |
| #include <limits.h> |
| |
| #ifdef HAVE_ATTRIBUTE_ALIAS |
| @@ -82,6 +84,8 @@ ialias_redirect (omp_get_team_num) |
| ialias_redirect (omp_is_initial_device) |
| ialias_redirect (omp_get_initial_device) |
| ialias_redirect (omp_get_max_task_priority) |
| +ialias_redirect (omp_pause_resource) |
| +ialias_redirect (omp_pause_resource_all) |
| #endif |
| |
| #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING |
| @@ -368,7 +372,9 @@ omp_get_schedule_ (int32_t *kind, int32_ |
| omp_sched_t k; |
| int cs; |
| omp_get_schedule (&k, &cs); |
| - *kind = k; |
| + /* For now mask off GFS_MONOTONIC, because OpenMP 4.5 code will not |
| + expect to see it. */ |
| + *kind = k & ~GFS_MONOTONIC; |
| *chunk_size = cs; |
| } |
| |
| @@ -378,7 +384,8 @@ omp_get_schedule_8_ (int32_t *kind, int6 |
| omp_sched_t k; |
| int cs; |
| omp_get_schedule (&k, &cs); |
| - *kind = k; |
| + /* See above. */ |
| + *kind = k & ~GFS_MONOTONIC; |
| *chunk_size = cs; |
| } |
| |
| @@ -576,3 +583,96 @@ omp_get_max_task_priority_ (void) |
| { |
| return omp_get_max_task_priority (); |
| } |
| + |
| +void |
| +omp_set_affinity_format_ (const char *format, size_t format_len) |
| +{ |
| + gomp_set_affinity_format (format, format_len); |
| +} |
| + |
| +int32_t |
| +omp_get_affinity_format_ (char *buffer, size_t buffer_len) |
| +{ |
| + size_t len = strlen (gomp_affinity_format_var); |
| + if (buffer_len) |
| + { |
| + if (len < buffer_len) |
| + { |
| + memcpy (buffer, gomp_affinity_format_var, len); |
| + memset (buffer + len, ' ', buffer_len - len); |
| + } |
| + else |
| + memcpy (buffer, gomp_affinity_format_var, buffer_len); |
| + } |
| + return len; |
| +} |
| + |
| +void
| +omp_display_affinity_ (const char *format, size_t format_len)
| +{
| +  char *fmt = NULL, fmt_buf[256];
| +  char buf[512];
| +  if (format_len)
| +    {
| +      fmt = format_len < 256 ? fmt_buf : gomp_malloc (format_len + 1);
| +      memcpy (fmt, format, format_len);
| +      fmt[format_len] = '\0';
| +    }
| +  struct gomp_thread *thr = gomp_thread ();
| +  size_t ret
| +    = gomp_display_affinity (buf, sizeof buf,
| +			     format_len ? fmt : gomp_affinity_format_var,
| +			     gomp_thread_self (), &thr->ts, thr->place);
| +  if (ret < sizeof buf)
| +    {
| +      buf[ret] = '\n';
| +      gomp_print_string (buf, ret + 1);
| +    }
| +  else  /* Output did not fit in BUF; format again into a RET + 1 byte buffer.  */
| +    {
| +      char *b = gomp_malloc (ret + 1);
| +      gomp_display_affinity (b, ret + 1,
| +			     format_len ? fmt : gomp_affinity_format_var,
| +			     gomp_thread_self (), &thr->ts, thr->place);
| +      b[ret] = '\n';
| +      gomp_print_string (b, ret + 1);
| +      free (b);
| +    }
| +  if (fmt && fmt != fmt_buf)
| +    free (fmt);
| +}
| + |
| +int32_t |
| +omp_capture_affinity_ (char *buffer, const char *format, |
| + size_t buffer_len, size_t format_len) |
| +{ |
| + char *fmt = NULL, fmt_buf[256]; |
| + if (format_len) |
| + { |
| + fmt = format_len < 256 ? fmt_buf : gomp_malloc (format_len + 1); |
| + memcpy (fmt, format, format_len); |
| + fmt[format_len] = '\0'; |
| + } |
| + struct gomp_thread *thr = gomp_thread (); |
| + size_t ret |
| + = gomp_display_affinity (buffer, buffer_len, |
| + format_len ? fmt : gomp_affinity_format_var, |
| + gomp_thread_self (), &thr->ts, thr->place); |
| + if (fmt && fmt != fmt_buf) |
| + free (fmt); |
| + if (ret < buffer_len) |
| + memset (buffer + ret, ' ', buffer_len - ret); |
| + return ret; |
| +} |
| + |
| +int32_t |
| +omp_pause_resource_ (const int32_t *kind, const int32_t *device_num) |
| +{ |
| + return omp_pause_resource (*kind, *device_num); |
| +} |
| + |
| +int32_t |
| +omp_pause_resource_all_ (const int32_t *kind) |
| +{ |
| + return omp_pause_resource_all (*kind); |
| +} |
| |
| |
| @@ -18,7 +18,7 @@ if test $gcc_cv_have_tls = yes ; then |
| ;; |
| |
| *-*-linux* | *-*-gnu*) |
| - XCFLAGS="${XCFLAGS} -ftls-model=initial-exec" |
| + XCFLAGS="${XCFLAGS} -ftls-model=initial-exec -DUSING_INITIAL_EXEC_TLS" |
| ;; |
| |
| *-*-rtems*) |
| |
| |
| @@ -49,20 +49,6 @@ omp_get_num_devices (void) |
| } |
| |
| int |
| -omp_get_num_teams (void) |
| -{ |
| - /* Hardcoded to 1 on host, MIC, HSAIL? Maybe variable on PTX. */ |
| - return 1; |
| -} |
| - |
| -int |
| -omp_get_team_num (void) |
| -{ |
| - /* Hardcoded to 0 on host, MIC, HSAIL? Maybe variable on PTX. */ |
| - return 0; |
| -} |
| - |
| -int |
| omp_is_initial_device (void) |
| { |
| /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX. */ |
| @@ -72,6 +58,4 @@ omp_is_initial_device (void) |
| ialias (omp_set_default_device) |
| ialias (omp_get_default_device) |
| ialias (omp_get_num_devices) |
| -ialias (omp_get_num_teams) |
| -ialias (omp_get_team_num) |
| ialias (omp_is_initial_device) |
| |
| |
| @@ -90,7 +90,7 @@ DIST_COMMON = $(top_srcdir)/plugin/Makef |
| $(srcdir)/libgomp.spec.in $(srcdir)/../depcomp |
| @PLUGIN_NVPTX_TRUE@am__append_1 = libgomp-plugin-nvptx.la |
| @PLUGIN_HSA_TRUE@am__append_2 = libgomp-plugin-hsa.la |
| -@USE_FORTRAN_TRUE@am__append_3 = openacc.f90 |
| +@USE_FORTRAN_TRUE@am__append_3 = openacc2.f90 |
| subdir = . |
| ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 |
| am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \ |
| @@ -172,7 +172,7 @@ libgomp_plugin_nvptx_la_LINK = $(LIBTOOL |
| @PLUGIN_NVPTX_TRUE@am_libgomp_plugin_nvptx_la_rpath = -rpath \ |
| @PLUGIN_NVPTX_TRUE@ $(toolexeclibdir) |
| libgomp_la_LIBADD = |
| -@USE_FORTRAN_TRUE@am__objects_1 = openacc.lo |
| +@USE_FORTRAN_TRUE@am__objects_1 = openacc2.lo |
| am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \ |
| env.lo error.lo icv.lo icv-device.lo iter.lo iter_ull.lo \ |
| loop.lo loop_ull.lo ordered.lo parallel.lo sections.lo \ |
| @@ -180,7 +180,8 @@ am_libgomp_la_OBJECTS = alloc.lo atomic. |
| sem.lo bar.lo ptrlock.lo time.lo fortran.lo affinity.lo \ |
| target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \ |
| oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \ |
| - oacc-plugin.lo oacc-cuda.lo priority_queue.lo $(am__objects_1) |
| + oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \ |
| + teams.lo $(am__objects_1) |
| libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) |
| DEFAULT_INCLUDES = -I.@am__isrc@ |
| depcomp = $(SHELL) $(top_srcdir)/../depcomp |
| @@ -380,6 +381,7 @@ mkdir_p = @mkdir_p@ |
| multi_basedir = @multi_basedir@ |
| offload_additional_lib_paths = @offload_additional_lib_paths@ |
| offload_additional_options = @offload_additional_options@ |
| +offload_plugins = @offload_plugins@ |
| offload_targets = @offload_targets@ |
| oldincludedir = @oldincludedir@ |
| pdfdir = @pdfdir@ |
| @@ -436,7 +438,7 @@ libgomp_la_SOURCES = alloc.c atomic.c ba |
| affinity.c target.c splay-tree.c libgomp-plugin.c \ |
| oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \ |
| oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \ |
| - $(am__append_3) |
| + affinity-fmt.c teams.c $(am__append_3) |
| |
| # Nvidia PTX OpenACC plugin. |
| @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION) |
| @@ -599,6 +601,7 @@ mostlyclean-compile: |
| distclean-compile: |
| -rm -f *.tab.c |
| |
| +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity-fmt.Plo@am__quote@ |
| @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity.Plo@am__quote@ |
| @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alloc.Plo@am__quote@ |
| @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic.Plo@am__quote@ |
| @@ -638,6 +641,7 @@ distclean-compile: |
| @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@ |
| @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@ |
| @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@ |
| +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/teams.Plo@am__quote@ |
| @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/time.Plo@am__quote@ |
| @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/work.Plo@am__quote@ |
| |
| @@ -1292,8 +1296,6 @@ omp_lib_kinds.mod: omp_lib.mod |
| : |
| openacc_kinds.mod: openacc.mod |
| : |
| -openacc.mod: openacc.lo |
| - : |
| %.mod: %.f90 |
| $(FC) $(FCFLAGS) -fsyntax-only $< |
| fortran.lo: libgomp_f.h |
| |
| |
| @@ -44,6 +44,7 @@ typedef void *CUevent; |
| typedef void *CUfunction; |
| typedef void *CUlinkState; |
| typedef void *CUmodule; |
| +typedef size_t (*CUoccupancyB2DSize)(int); |
| typedef void *CUstream; |
| |
| typedef enum { |
| @@ -88,6 +89,7 @@ typedef enum { |
| CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4, |
| CU_JIT_ERROR_LOG_BUFFER = 5, |
| CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6, |
| + CU_JIT_OPTIMIZATION_LEVEL = 7, |
| CU_JIT_LOG_VERBOSE = 12 |
| } CUjit_option; |
| |
| @@ -169,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr |
| CUresult cuModuleLoad (CUmodule *, const char *); |
| CUresult cuModuleLoadData (CUmodule *, const void *); |
| CUresult cuModuleUnload (CUmodule); |
| +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, |
| + CUoccupancyB2DSize, size_t, int); |
| CUresult cuStreamCreate (CUstream *, unsigned); |
| #define cuStreamDestroy cuStreamDestroy_v2 |
| CUresult cuStreamDestroy (CUstream); |
| |
| |
| @@ -0,0 +1,49 @@ |
| +CUDA_ONE_CALL (cuCtxCreate) |
| +CUDA_ONE_CALL (cuCtxDestroy) |
| +CUDA_ONE_CALL (cuCtxGetCurrent) |
| +CUDA_ONE_CALL (cuCtxGetDevice) |
| +CUDA_ONE_CALL (cuCtxPopCurrent) |
| +CUDA_ONE_CALL (cuCtxPushCurrent) |
| +CUDA_ONE_CALL (cuCtxSynchronize) |
| +CUDA_ONE_CALL (cuDeviceGet) |
| +CUDA_ONE_CALL (cuDeviceGetAttribute) |
| +CUDA_ONE_CALL (cuDeviceGetCount) |
| +CUDA_ONE_CALL (cuEventCreate) |
| +CUDA_ONE_CALL (cuEventDestroy) |
| +CUDA_ONE_CALL (cuEventElapsedTime) |
| +CUDA_ONE_CALL (cuEventQuery) |
| +CUDA_ONE_CALL (cuEventRecord) |
| +CUDA_ONE_CALL (cuEventSynchronize) |
| +CUDA_ONE_CALL (cuFuncGetAttribute) |
| +CUDA_ONE_CALL_MAYBE_NULL (cuGetErrorString) |
| +CUDA_ONE_CALL (cuInit) |
| +CUDA_ONE_CALL (cuLaunchKernel) |
| +CUDA_ONE_CALL (cuLinkAddData) |
| +CUDA_ONE_CALL_MAYBE_NULL (cuLinkAddData_v2) |
| +CUDA_ONE_CALL (cuLinkComplete) |
| +CUDA_ONE_CALL (cuLinkCreate) |
| +CUDA_ONE_CALL_MAYBE_NULL (cuLinkCreate_v2) |
| +CUDA_ONE_CALL (cuLinkDestroy) |
| +CUDA_ONE_CALL (cuMemAlloc) |
| +CUDA_ONE_CALL (cuMemAllocHost) |
| +CUDA_ONE_CALL (cuMemcpy) |
| +CUDA_ONE_CALL (cuMemcpyDtoDAsync) |
| +CUDA_ONE_CALL (cuMemcpyDtoH) |
| +CUDA_ONE_CALL (cuMemcpyDtoHAsync) |
| +CUDA_ONE_CALL (cuMemcpyHtoD) |
| +CUDA_ONE_CALL (cuMemcpyHtoDAsync) |
| +CUDA_ONE_CALL (cuMemFree) |
| +CUDA_ONE_CALL (cuMemFreeHost) |
| +CUDA_ONE_CALL (cuMemGetAddressRange) |
| +CUDA_ONE_CALL (cuMemHostGetDevicePointer) |
| +CUDA_ONE_CALL (cuModuleGetFunction) |
| +CUDA_ONE_CALL (cuModuleGetGlobal) |
| +CUDA_ONE_CALL (cuModuleLoad) |
| +CUDA_ONE_CALL (cuModuleLoadData) |
| +CUDA_ONE_CALL (cuModuleUnload) |
| +CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) |
| +CUDA_ONE_CALL (cuStreamCreate) |
| +CUDA_ONE_CALL (cuStreamDestroy) |
| +CUDA_ONE_CALL (cuStreamQuery) |
| +CUDA_ONE_CALL (cuStreamSynchronize) |
| +CUDA_ONE_CALL (cuStreamWaitEvent) |
| |
| |
| @@ -31,6 +31,7 @@ |
| is not clear as to what that state might be. Or how one might |
| propagate it from one thread to another. */ |
| |
| +#define _GNU_SOURCE |
| #include "openacc.h" |
| #include "config.h" |
| #include "libgomp-plugin.h" |
| @@ -48,60 +49,41 @@ |
| #include <assert.h> |
| #include <errno.h> |
| |
| +#if CUDA_VERSION < 6000 |
| +extern CUresult cuGetErrorString (CUresult, const char **); |
| +#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82 |
| +#endif |
| + |
| +#if CUDA_VERSION >= 6050 |
| +#undef cuLinkCreate |
| +#undef cuLinkAddData |
| +CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t, |
| + const char *, unsigned, CUjit_option *, void **); |
| +CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *); |
| +#else |
| +typedef size_t (*CUoccupancyB2DSize)(int); |
| +CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t, |
| + const char *, unsigned, CUjit_option *, void **); |
| +CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *); |
| +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, |
| + CUoccupancyB2DSize, size_t, int); |
| +#endif |
| + |
| +#define DO_PRAGMA(x) _Pragma (#x) |
| + |
| #if PLUGIN_NVPTX_DYNAMIC |
| # include <dlfcn.h> |
| |
| -# define CUDA_CALLS \ |
| -CUDA_ONE_CALL (cuCtxCreate) \ |
| -CUDA_ONE_CALL (cuCtxDestroy) \ |
| -CUDA_ONE_CALL (cuCtxGetCurrent) \ |
| -CUDA_ONE_CALL (cuCtxGetDevice) \ |
| -CUDA_ONE_CALL (cuCtxPopCurrent) \ |
| -CUDA_ONE_CALL (cuCtxPushCurrent) \ |
| -CUDA_ONE_CALL (cuCtxSynchronize) \ |
| -CUDA_ONE_CALL (cuDeviceGet) \ |
| -CUDA_ONE_CALL (cuDeviceGetAttribute) \ |
| -CUDA_ONE_CALL (cuDeviceGetCount) \ |
| -CUDA_ONE_CALL (cuEventCreate) \ |
| -CUDA_ONE_CALL (cuEventDestroy) \ |
| -CUDA_ONE_CALL (cuEventElapsedTime) \ |
| -CUDA_ONE_CALL (cuEventQuery) \ |
| -CUDA_ONE_CALL (cuEventRecord) \ |
| -CUDA_ONE_CALL (cuEventSynchronize) \ |
| -CUDA_ONE_CALL (cuFuncGetAttribute) \ |
| -CUDA_ONE_CALL (cuGetErrorString) \ |
| -CUDA_ONE_CALL (cuInit) \ |
| -CUDA_ONE_CALL (cuLaunchKernel) \ |
| -CUDA_ONE_CALL (cuLinkAddData) \ |
| -CUDA_ONE_CALL (cuLinkComplete) \ |
| -CUDA_ONE_CALL (cuLinkCreate) \ |
| -CUDA_ONE_CALL (cuLinkDestroy) \ |
| -CUDA_ONE_CALL (cuMemAlloc) \ |
| -CUDA_ONE_CALL (cuMemAllocHost) \ |
| -CUDA_ONE_CALL (cuMemcpy) \ |
| -CUDA_ONE_CALL (cuMemcpyDtoDAsync) \ |
| -CUDA_ONE_CALL (cuMemcpyDtoH) \ |
| -CUDA_ONE_CALL (cuMemcpyDtoHAsync) \ |
| -CUDA_ONE_CALL (cuMemcpyHtoD) \ |
| -CUDA_ONE_CALL (cuMemcpyHtoDAsync) \ |
| -CUDA_ONE_CALL (cuMemFree) \ |
| -CUDA_ONE_CALL (cuMemFreeHost) \ |
| -CUDA_ONE_CALL (cuMemGetAddressRange) \ |
| -CUDA_ONE_CALL (cuMemHostGetDevicePointer)\ |
| -CUDA_ONE_CALL (cuModuleGetFunction) \ |
| -CUDA_ONE_CALL (cuModuleGetGlobal) \ |
| -CUDA_ONE_CALL (cuModuleLoad) \ |
| -CUDA_ONE_CALL (cuModuleLoadData) \ |
| -CUDA_ONE_CALL (cuModuleUnload) \ |
| -CUDA_ONE_CALL (cuStreamCreate) \ |
| -CUDA_ONE_CALL (cuStreamDestroy) \ |
| -CUDA_ONE_CALL (cuStreamQuery) \ |
| -CUDA_ONE_CALL (cuStreamSynchronize) \ |
| -CUDA_ONE_CALL (cuStreamWaitEvent) |
| -# define CUDA_ONE_CALL(call) \ |
| - __typeof (call) *call; |
| struct cuda_lib_s { |
| - CUDA_CALLS |
| + |
| +# define CUDA_ONE_CALL(call) \ |
| + __typeof (call) *call; |
| +# define CUDA_ONE_CALL_MAYBE_NULL(call) \ |
| + CUDA_ONE_CALL (call) |
| +#include "cuda-lib.def" |
| +# undef CUDA_ONE_CALL |
| +# undef CUDA_ONE_CALL_MAYBE_NULL |
| + |
| } cuda_lib; |
| |
| /* -1 if init_cuda_lib has not been called yet, false |
| @@ -120,24 +102,41 @@ init_cuda_lib (void) |
| cuda_lib_inited = false; |
| if (h == NULL) |
| return false; |
| -# undef CUDA_ONE_CALL |
| -# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call) |
| -# define CUDA_ONE_CALL_1(call) \ |
| + |
| +# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false) |
| +# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true) |
| +# define CUDA_ONE_CALL_1(call, allow_null) \ |
| cuda_lib.call = dlsym (h, #call); \ |
| - if (cuda_lib.call == NULL) \ |
| + if (!allow_null && cuda_lib.call == NULL) \ |
| return false; |
| - CUDA_CALLS |
| +#include "cuda-lib.def" |
| +# undef CUDA_ONE_CALL |
| +# undef CUDA_ONE_CALL_1 |
| +# undef CUDA_ONE_CALL_MAYBE_NULL |
| + |
| cuda_lib_inited = true; |
| return true; |
| } |
| -# undef CUDA_ONE_CALL |
| -# undef CUDA_ONE_CALL_1 |
| # define CUDA_CALL_PREFIX cuda_lib. |
| #else |
| + |
| +# define CUDA_ONE_CALL(call) |
| +# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call) |
| +#include "cuda-lib.def" |
| +#undef CUDA_ONE_CALL_MAYBE_NULL |
| +#undef CUDA_ONE_CALL |
| + |
| # define CUDA_CALL_PREFIX |
| # define init_cuda_lib() true |
| #endif |
| |
| +#include "secure_getenv.h" |
| + |
| +#undef MIN |
| +#undef MAX |
| +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) |
| +#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) |
| + |
| /* Convenience macros for the frequently used CUDA library call and |
| error handling sequence as well as CUDA library calls that |
| do the error checking themselves or don't do it at all. */ |
| @@ -171,40 +170,42 @@ init_cuda_lib (void) |
| #define CUDA_CALL_NOCHECK(FN, ...) \ |
| CUDA_CALL_PREFIX FN (__VA_ARGS__) |
| |
| +#define CUDA_CALL_EXISTS(FN) \ |
| + CUDA_CALL_PREFIX FN |
| + |
| static const char * |
| cuda_error (CUresult r) |
| { |
| -#if CUDA_VERSION < 7000 |
| - /* Specified in documentation and present in library from at least |
| - 5.5. Not declared in header file prior to 7.0. */ |
| - extern CUresult cuGetErrorString (CUresult, const char **); |
| -#endif |
| + const char *fallback = "unknown cuda error"; |
| const char *desc; |
| |
| + if (!CUDA_CALL_EXISTS (cuGetErrorString)) |
| + return fallback; |
| + |
| r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc); |
| - if (r != CUDA_SUCCESS) |
| - desc = "unknown cuda error"; |
| + if (r == CUDA_SUCCESS) |
| + return desc; |
| |
| - return desc; |
| + return fallback; |
| } |
| |
| static unsigned int instantiated_devices = 0; |
| static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; |
| |
| +struct cuda_map |
| +{ |
| + CUdeviceptr d; |
| + size_t size; |
| + bool active; |
| + struct cuda_map *next; |
| +}; |
| + |
| struct ptx_stream |
| { |
| CUstream stream; |
| pthread_t host_thread; |
| bool multithreaded; |
| - |
| - CUdeviceptr d; |
| - void *h; |
| - void *h_begin; |
| - void *h_end; |
| - void *h_next; |
| - void *h_prev; |
| - void *h_tail; |
| - |
| + struct cuda_map *map; |
| struct ptx_stream *next; |
| }; |
| |
| @@ -216,12 +217,64 @@ struct nvptx_thread |
| struct ptx_device *ptx_dev; |
| }; |
| |
| -struct map |
| +static struct cuda_map * |
| +cuda_map_create (size_t size) |
| { |
| - int async; |
| - size_t size; |
| - char mappings[0]; |
| -}; |
| + struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map)); |
| + |
| + assert (map); |
| + |
| + map->next = NULL; |
| + map->size = size; |
| + map->active = false; |
| + |
| + CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size); |
| + assert (map->d); |
| + |
| + return map; |
| +} |
| + |
| +static void |
| +cuda_map_destroy (struct cuda_map *map) |
| +{ |
| + if (map->active) |
| + /* Possible reasons for the map to be still active: |
| + - the associated async kernel might still be running. |
| + - the associated async kernel might have finished, but the |
| + corresponding event that should trigger the pop_map has not been |
| + processed by event_gc. |
| + - the associated sync kernel might have aborted |
| + |
| + The async cases could happen if the user specified an async region |
| + without adding a corresponding wait that is guaranteed to be executed |
| + (before returning from main, or in an atexit handler). |
| + We do not want to deallocate a device pointer that is still being |
| + used, so skip it. |
| + |
| + In the sync case, the device pointer is no longer used, but deallocating |
| + it using cuMemFree will not succeed, so skip it. |
| + |
| + TODO: Handle this in a more constructive way, by f.i. waiting for streams |
| + to finish before de-allocating them (PR88981), or by ensuring the CUDA |
| + lib atexit handler is called before rather than after the libgomp plugin |
| + atexit handler (PR83795). */ |
| + ; |
| + else |
| + CUDA_CALL_NOCHECK (cuMemFree, map->d); |
| + |
| + free (map); |
| +} |
| + |
| +/* The following map_* routines manage the CUDA device memory that |
| + contains the data mapping arguments for cuLaunchKernel. Each |
| + asynchronous PTX stream may have multiple pending kernel |
| + invocations, which are launched in a FIFO order. As such, the map |
| +   routines maintain a queue of cuLaunchKernel arguments.
| + |
| + Calls to map_push and map_pop must be guarded by ptx_event_lock. |
| + Likewise, calls to map_init and map_fini are guarded by |
| + ptx_dev_lock inside GOMP_OFFLOAD_init_device and |
| + GOMP_OFFLOAD_fini_device, respectively. */ |
| |
| static bool |
| map_init (struct ptx_stream *s) |
| @@ -229,109 +282,83 @@ map_init (struct ptx_stream *s) |
| int size = getpagesize (); |
| |
| assert (s); |
| - assert (!s->d); |
| - assert (!s->h); |
| - |
| - CUDA_CALL (cuMemAllocHost, &s->h, size); |
| - CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0); |
| |
| - assert (s->h); |
| + s->map = cuda_map_create (size); |
| |
| - s->h_begin = s->h; |
| - s->h_end = s->h_begin + size; |
| - s->h_next = s->h_prev = s->h_tail = s->h_begin; |
| - |
| - assert (s->h_next); |
| - assert (s->h_end); |
| return true; |
| } |
| |
| static bool |
| map_fini (struct ptx_stream *s) |
| { |
| - CUDA_CALL (cuMemFreeHost, s->h); |
| + assert (s->map->next == NULL); |
| + |
| + cuda_map_destroy (s->map); |
| + |
| return true; |
| } |
| |
| static void |
| map_pop (struct ptx_stream *s) |
| { |
| - struct map *m; |
| + struct cuda_map *next; |
| |
| assert (s != NULL); |
| - assert (s->h_next); |
| - assert (s->h_prev); |
| - assert (s->h_tail); |
| - |
| - m = s->h_tail; |
| - |
| - s->h_tail += m->size; |
| - |
| - if (s->h_tail >= s->h_end) |
| - s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); |
| - |
| - if (s->h_next == s->h_tail) |
| - s->h_prev = s->h_next; |
| |
| - assert (s->h_next >= s->h_begin); |
| - assert (s->h_tail >= s->h_begin); |
| - assert (s->h_prev >= s->h_begin); |
| + if (s->map->next == NULL) |
| + { |
| + s->map->active = false; |
| + return; |
| + } |
| |
| - assert (s->h_next <= s->h_end); |
| - assert (s->h_tail <= s->h_end); |
| - assert (s->h_prev <= s->h_end); |
| + next = s->map->next; |
| + cuda_map_destroy (s->map); |
| + s->map = next; |
| } |
| |
| -static void |
| -map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d) |
| +static CUdeviceptr |
| +map_push (struct ptx_stream *s, size_t size) |
| { |
| - int left; |
| - int offset; |
| - struct map *m; |
| + struct cuda_map *map = NULL; |
| + struct cuda_map **t; |
| |
| - assert (s != NULL); |
| - |
| - left = s->h_end - s->h_next; |
| - size += sizeof (struct map); |
| - |
| - assert (s->h_prev); |
| - assert (s->h_next); |
| + assert (s); |
| + assert (s->map); |
| |
| - if (size >= left) |
| + /* Select an element to push. */ |
| + if (s->map->active) |
| + map = cuda_map_create (size); |
| + else |
| { |
| - m = s->h_prev; |
| - m->size += left; |
| - s->h_next = s->h_begin; |
| - |
| - if (s->h_next + size > s->h_end) |
| - GOMP_PLUGIN_fatal ("unable to push map"); |
| - } |
| - |
| - assert (s->h_next); |
| - |
| - m = s->h_next; |
| - m->async = async; |
| - m->size = size; |
| + /* Pop the inactive front element. */ |
| + struct cuda_map *pop = s->map; |
| + s->map = pop->next; |
| + pop->next = NULL; |
| |
| - offset = (void *)&m->mappings[0] - s->h; |
| + if (pop->size < size) |
| + { |
| + cuda_map_destroy (pop); |
| |
| - *d = (void *)(s->d + offset); |
| - *h = (void *)(s->h + offset); |
| + map = cuda_map_create (size); |
| + } |
| + else |
| + map = pop; |
| + } |
| |
| - s->h_prev = s->h_next; |
| - s->h_next += size; |
| + /* Check that the element is as expected. */ |
| + assert (map->next == NULL); |
| + assert (!map->active); |
| |
| - assert (s->h_prev); |
| - assert (s->h_next); |
| + /* Mark the element active. */ |
| + map->active = true; |
| |
| - assert (s->h_next >= s->h_begin); |
| - assert (s->h_tail >= s->h_begin); |
| - assert (s->h_prev >= s->h_begin); |
| - assert (s->h_next <= s->h_end); |
| - assert (s->h_tail <= s->h_end); |
| - assert (s->h_prev <= s->h_end); |
| + /* Push the element to the back of the list. */ |
| + for (t = &s->map; (*t) != NULL; t = &(*t)->next) |
| + ; |
| + assert (t != NULL && *t == NULL); |
| + *t = map; |
| |
| - return; |
| + return map->d; |
| } |
| |
| /* Target data function launch information. */ |
| @@ -411,6 +438,10 @@ struct ptx_device |
| int num_sms; |
| int regs_per_block; |
| int regs_per_sm; |
| + int warp_size; |
| + int max_threads_per_block; |
| + int max_threads_per_multiprocessor; |
| + int default_dims[GOMP_DIM_MAX]; |
| |
| struct ptx_image_data *images; /* Images loaded on device. */ |
| pthread_mutex_t image_lock; /* Lock for above list. */ |
| @@ -458,8 +489,6 @@ init_streams_for_device (struct ptx_devi |
| null_stream->stream = NULL; |
| null_stream->host_thread = pthread_self (); |
| null_stream->multithreaded = true; |
| - null_stream->d = (CUdeviceptr) NULL; |
| - null_stream->h = NULL; |
| if (!map_init (null_stream)) |
| return false; |
| |
| @@ -594,8 +623,6 @@ select_stream_for_async (int async, pthr |
| s->host_thread = thread; |
| s->multithreaded = false; |
| |
| - s->d = (CUdeviceptr) NULL; |
| - s->h = NULL; |
| if (!map_init (s)) |
| { |
| pthread_mutex_unlock (&ptx_dev->stream_lock); |
| @@ -777,9 +804,11 @@ nvptx_open_device (int n) |
| &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev); |
| ptx_dev->regs_per_block = pi; |
| |
| - /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only |
| + /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only |
| in CUDA 6.0 and newer. */ |
| - r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev); |
| + r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, |
| + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, |
| + dev); |
| /* Fallback: use limit of registers per block, which is usually equal. */ |
| if (r == CUDA_ERROR_INVALID_VALUE) |
| pi = ptx_dev->regs_per_block; |
| @@ -797,12 +826,24 @@ nvptx_open_device (int n) |
| GOMP_PLUGIN_error ("Only warp size 32 is supported"); |
| return NULL; |
| } |
| + ptx_dev->warp_size = pi; |
| + |
| + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi, |
| + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev); |
| + ptx_dev->max_threads_per_block = pi; |
| + |
| + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi, |
| + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); |
| + ptx_dev->max_threads_per_multiprocessor = pi; |
| |
| r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines, |
| CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); |
| if (r != CUDA_SUCCESS) |
| async_engines = 1; |
| |
| + for (int i = 0; i != GOMP_DIM_MAX; i++) |
| + ptx_dev->default_dims[i] = 0; |
| + |
| ptx_dev->images = NULL; |
| pthread_mutex_init (&ptx_dev->image_lock, NULL); |
| |
| @@ -876,12 +917,42 @@ notify_var (const char *var_name, const |
| GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var); |
| } |
| |
| +static void |
| +process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o) |
| +{ |
| + const char *var_name = "GOMP_NVPTX_JIT"; |
| + const char *env_var = secure_getenv (var_name); |
| + notify_var (var_name, env_var); |
| + |
| + if (env_var == NULL) |
| + return; |
| + |
| + const char *c = env_var; |
| + while (*c != '\0') |
| + { |
| + while (*c == ' ') |
| + c++; |
| + |
| + if (c[0] == '-' && c[1] == 'O' |
| + && '0' <= c[2] && c[2] <= '4' |
| + && (c[3] == '\0' || c[3] == ' ')) |
| + { |
| + *gomp_nvptx_o = c[2] - '0'; |
| + c += 3; |
| + continue; |
| + } |
| + |
| + GOMP_PLUGIN_error ("Error parsing %s", var_name); |
| + break; |
| + } |
| +} |
| + |
| static bool |
| link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, |
| unsigned num_objs) |
| { |
| - CUjit_option opts[6]; |
| - void *optvals[6]; |
| + CUjit_option opts[7]; |
| + void *optvals[7]; |
| float elapsed = 0.0; |
| char elog[1024]; |
| char ilog[16384]; |
| @@ -908,16 +979,41 @@ link_ptx (CUmodule *module, const struct |
| opts[5] = CU_JIT_LOG_VERBOSE; |
| optvals[5] = (void *) 1; |
| |
| - CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate); |
| + static intptr_t gomp_nvptx_o = -1; |
| + |
| + static bool init_done = false; |
| + if (!init_done) |
| + { |
| + process_GOMP_NVPTX_JIT (&gomp_nvptx_o); |
| + init_done = true; |
| + } |
| + |
| + int nopts = 6; |
| + if (gomp_nvptx_o != -1) |
| + { |
| + opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL; |
| + optvals[nopts] = (void *) gomp_nvptx_o; |
| + nopts++; |
| + } |
| + |
| + if (CUDA_CALL_EXISTS (cuLinkCreate_v2)) |
| + CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate); |
| + else |
| + CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate); |
| |
| for (; num_objs--; ptx_objs++) |
| { |
| /* cuLinkAddData's 'data' argument erroneously omits the const |
| qualifier. */ |
| GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code); |
| - r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX, |
| - (char *) ptx_objs->code, ptx_objs->size, |
| - 0, 0, 0, 0); |
| + if (CUDA_CALL_EXISTS (cuLinkAddData_v2)) |
| + r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX, |
| + (char *) ptx_objs->code, ptx_objs->size, |
| + 0, 0, 0, 0); |
| + else |
| + r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX, |
| + (char *) ptx_objs->code, ptx_objs->size, |
| + 0, 0, 0, 0); |
| if (r != CUDA_SUCCESS) |
| { |
| GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); |
| @@ -1067,8 +1163,10 @@ nvptx_exec (void (*fn), size_t mapnum, v |
| int i; |
| struct ptx_stream *dev_str; |
| void *kargs[1]; |
| - void *hp, *dp; |
| + void *hp; |
| + CUdeviceptr dp = 0; |
| struct nvptx_thread *nvthd = nvptx_thread (); |
| + int warp_size = nvthd->ptx_dev->warp_size; |
| const char *maybe_abort_msg = "(perhaps abort was called)"; |
| |
| function = targ_fn->fn; |
| @@ -1090,68 +1188,36 @@ nvptx_exec (void (*fn), size_t mapnum, v |
| |
| if (seen_zero) |
| { |
| - /* See if the user provided GOMP_OPENACC_DIM environment |
| - variable to specify runtime defaults. */ |
| - static int default_dims[GOMP_DIM_MAX]; |
| - |
| pthread_mutex_lock (&ptx_dev_lock); |
| - if (!default_dims[0]) |
| - { |
| - const char *var_name = "GOMP_OPENACC_DIM"; |
| - /* We only read the environment variable once. You can't |
| - change it in the middle of execution. The syntax is |
| - the same as for the -fopenacc-dim compilation option. */ |
| - const char *env_var = getenv (var_name); |
| - notify_var (var_name, env_var); |
| - if (env_var) |
| - { |
| - const char *pos = env_var; |
| |
| - for (i = 0; *pos && i != GOMP_DIM_MAX; i++) |
| - { |
| - if (i && *pos++ != ':') |
| - break; |
| - if (*pos != ':') |
| - { |
| - const char *eptr; |
| - |
| - errno = 0; |
| - long val = strtol (pos, (char **)&eptr, 10); |
| - if (errno || val < 0 || (unsigned)val != val) |
| - break; |
| - default_dims[i] = (int)val; |
| - pos = eptr; |
| - } |
| - } |
| - } |
| + static int gomp_openacc_dims[GOMP_DIM_MAX]; |
| + if (!gomp_openacc_dims[0]) |
| + { |
| + /* See if the user provided GOMP_OPENACC_DIM environment |
| + variable to specify runtime defaults. */ |
| + for (int i = 0; i < GOMP_DIM_MAX; ++i) |
| + gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i); |
| + } |
| |
| - int warp_size, block_size, dev_size, cpu_size; |
| - CUdevice dev = nvptx_thread()->ptx_dev->dev; |
| - /* 32 is the default for known hardware. */ |
| - int gang = 0, worker = 32, vector = 32; |
| - CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm; |
| - |
| - cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK; |
| - cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE; |
| - cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT; |
| - cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR; |
| - |
| - if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb, |
| - dev) == CUDA_SUCCESS |
| - && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws, |
| - dev) == CUDA_SUCCESS |
| - && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc, |
| - dev) == CUDA_SUCCESS |
| - && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm, |
| - dev) == CUDA_SUCCESS) |
| - { |
| - GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d," |
| - " dev_size=%d, cpu_size=%d\n", |
| - warp_size, block_size, dev_size, cpu_size); |
| - gang = (cpu_size / block_size) * dev_size; |
| - worker = block_size / warp_size; |
| - vector = warp_size; |
| - } |
| + if (!nvthd->ptx_dev->default_dims[0]) |
| + { |
| + int default_dims[GOMP_DIM_MAX]; |
| + for (int i = 0; i < GOMP_DIM_MAX; ++i) |
| + default_dims[i] = gomp_openacc_dims[i]; |
| + |
| + int gang, worker, vector; |
| + { |
| + int block_size = nvthd->ptx_dev->max_threads_per_block; |
| + int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor; |
| + int dev_size = nvthd->ptx_dev->num_sms; |
| + GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d," |
| + " dev_size=%d, cpu_size=%d\n", |
| + warp_size, block_size, dev_size, cpu_size); |
| + |
| + gang = (cpu_size / block_size) * dev_size; |
| + worker = block_size / warp_size; |
| + vector = warp_size; |
| + } |
| |
| /* There is no upper bound on the gang size. The best size |
| matches the hardware configuration. Logical gangs are |
| @@ -1172,29 +1238,150 @@ nvptx_exec (void (*fn), size_t mapnum, v |
| default_dims[GOMP_DIM_GANG], |
| default_dims[GOMP_DIM_WORKER], |
| default_dims[GOMP_DIM_VECTOR]); |
| + |
| + for (i = 0; i != GOMP_DIM_MAX; i++) |
| + nvthd->ptx_dev->default_dims[i] = default_dims[i]; |
| } |
| pthread_mutex_unlock (&ptx_dev_lock); |
| |
| - for (i = 0; i != GOMP_DIM_MAX; i++) |
| - if (!dims[i]) |
| - dims[i] = default_dims[i]; |
| - } |
| - |
| - /* This reserves a chunk of a pre-allocated page of memory mapped on both |
| - the host and the device. HP is a host pointer to the new chunk, and DP is |
| - the corresponding device pointer. */ |
| - map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp); |
| - |
| - GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); |
| - |
| - /* Copy the array of arguments to the mapped page. */ |
| - for (i = 0; i < mapnum; i++) |
| - ((void **) hp)[i] = devaddrs[i]; |
| - |
| - /* Copy the (device) pointers to arguments to the device (dp and hp might in |
| - fact have the same value on a unified-memory system). */ |
| - CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp, |
| - mapnum * sizeof (void *)); |
| + { |
| + bool default_dim_p[GOMP_DIM_MAX]; |
| + for (i = 0; i != GOMP_DIM_MAX; i++) |
| + default_dim_p[i] = !dims[i]; |
| + |
| + if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize)) |
| + { |
| + for (i = 0; i != GOMP_DIM_MAX; i++) |
| + if (default_dim_p[i]) |
| + dims[i] = nvthd->ptx_dev->default_dims[i]; |
| + |
| + if (default_dim_p[GOMP_DIM_VECTOR]) |
| + dims[GOMP_DIM_VECTOR] |
| + = MIN (dims[GOMP_DIM_VECTOR], |
| + (targ_fn->max_threads_per_block / warp_size |
| + * warp_size)); |
| + |
| + if (default_dim_p[GOMP_DIM_WORKER]) |
| + dims[GOMP_DIM_WORKER] |
| + = MIN (dims[GOMP_DIM_WORKER], |
| + targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]); |
| + } |
| + else |
| + { |
| + /* Handle the case that the compiler allows the runtime to choose |
| + the vector-length conservatively, by ignoring |
| + gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle |
| + it. */ |
| + int vectors = 0; |
| + /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that |
| + gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not |
| + exceed targ_fn->max_threads_per_block. */ |
| + int workers = gomp_openacc_dims[GOMP_DIM_WORKER]; |
| + int gangs = gomp_openacc_dims[GOMP_DIM_GANG]; |
| + int grids, blocks; |
| + |
| + CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids, |
| + &blocks, function, NULL, 0, |
| + dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]); |
| + GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: " |
| + "grid = %d, block = %d\n", grids, blocks); |
| + |
| + /* Keep the num_gangs proportional to the block size. In |
| + the case where a block size is limited by shared-memory |
| + or the register file capacity, the runtime will not |
| + excessively over assign gangs to the multiprocessor |
| + units if their state is going to be swapped out even |
| + more than necessary. The constant factor 2 is there to |
| + prevent threads from idling when there is insufficient |
| + work for them. */ |
| + if (gangs == 0) |
| + gangs = 2 * grids * (blocks / warp_size); |
| + |
| + if (vectors == 0) |
| + vectors = warp_size; |
| + |
| + if (workers == 0) |
| + { |
| + int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR] |
| + ? vectors |
| + : dims[GOMP_DIM_VECTOR]); |
| + workers = blocks / actual_vectors; |
| + workers = MAX (workers, 1); |
| + /* If we need a per-worker barrier ... . */ |
| + if (actual_vectors > 32) |
| + /* Don't use more barriers than available. */ |
| + workers = MIN (workers, 15); |
| + } |
| + |
| + for (i = 0; i != GOMP_DIM_MAX; i++) |
| + if (default_dim_p[i]) |
| + switch (i) |
| + { |
| + case GOMP_DIM_GANG: dims[i] = gangs; break; |
| + case GOMP_DIM_WORKER: dims[i] = workers; break; |
| + case GOMP_DIM_VECTOR: dims[i] = vectors; break; |
| + default: GOMP_PLUGIN_fatal ("invalid dim"); |
| + } |
| + } |
| + } |
| + } |
| + |
| + /* Check if the accelerator has sufficient hardware resources to |
| + launch the offloaded kernel. */ |
| + if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] |
| + > targ_fn->max_threads_per_block) |
| + { |
| + const char *msg |
| + = ("The Nvidia accelerator has insufficient resources to launch '%s'" |
| + " with num_workers = %d and vector_length = %d" |
| + "; " |
| + "recompile the program with 'num_workers = x and vector_length = y'" |
| + " on that offloaded region or '-fopenacc-dim=:x:y' where" |
| + " x * y <= %d" |
| + ".\n"); |
| + GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER], |
| + dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block); |
| + } |
| + |
| + /* Check if the accelerator has sufficient barrier resources to |
| + launch the offloaded kernel. */ |
| + if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32) |
| + { |
| + const char *msg |
| + = ("The Nvidia accelerator has insufficient barrier resources to launch" |
| + " '%s' with num_workers = %d and vector_length = %d" |
| + "; " |
| + "recompile the program with 'num_workers = x' on that offloaded" |
| + " region or '-fopenacc-dim=:x:' where x <= 15" |
| + "; " |
| + "or, recompile the program with 'vector_length = 32' on that" |
| + " offloaded region or '-fopenacc-dim=::32'" |
| + ".\n"); |
| + GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER], |
| + dims[GOMP_DIM_VECTOR]); |
| + } |
| + |
| + if (mapnum > 0) |
| + { |
| + /* This reserves a chunk of a pre-allocated page of memory mapped on both |
| + the host and the device. HP is a host pointer to the new chunk, and DP is |
| + the corresponding device pointer. */ |
| + pthread_mutex_lock (&ptx_event_lock); |
| + dp = map_push (dev_str, mapnum * sizeof (void *)); |
| + pthread_mutex_unlock (&ptx_event_lock); |
| + |
| + GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); |
| + |
| + /* Copy the array of arguments to the mapped page. */ |
| + hp = alloca(sizeof(void *) * mapnum); |
| + for (i = 0; i < mapnum; i++) |
| + ((void **) hp)[i] = devaddrs[i]; |
| + |
| + /* Copy the (device) pointers to arguments to the device. */ |
| + CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, |
| + mapnum * sizeof (void *)); |
| + } |
| + |
| GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" |
| " gangs=%u, workers=%u, vectors=%u\n", |
| __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG], |
| @@ -1239,7 +1426,8 @@ nvptx_exec (void (*fn), size_t mapnum, v |
| |
| CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream); |
| |
| - event_add (PTX_EVT_KNL, e, (void *)dev_str, 0); |
| + if (mapnum > 0) |
| + event_add (PTX_EVT_KNL, e, (void *)dev_str, 0); |
| } |
| #else |
| r = CUDA_CALL_NOCHECK (cuCtxSynchronize, ); |
| @@ -1256,7 +1444,10 @@ nvptx_exec (void (*fn), size_t mapnum, v |
| #ifndef DISABLE_ASYNC |
| if (async < acc_async_noval) |
| #endif |
| - map_pop (dev_str); |
| + { |
| + if (mapnum > 0) |
| + map_pop (dev_str); |
| + } |
| } |
| |
| void * openacc_get_current_cuda_context (void); |
| @@ -1415,9 +1606,8 @@ nvptx_async_test (int async) |
| struct ptx_stream *s; |
| |
| s = select_stream_for_async (async, pthread_self (), false, NULL); |
| - |
| if (!s) |
| - GOMP_PLUGIN_fatal ("unknown async %d", async); |
| + return 1; |
| |
| r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream); |
| if (r == CUDA_SUCCESS) |
| @@ -1472,7 +1662,7 @@ nvptx_wait (int async) |
| |
| s = select_stream_for_async (async, pthread_self (), false, NULL); |
| if (!s) |
| - GOMP_PLUGIN_fatal ("unknown async %d", async); |
| + return; |
| |
| CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream); |
| |
| @@ -1486,16 +1676,17 @@ nvptx_wait_async (int async1, int async2 |
| struct ptx_stream *s1, *s2; |
| pthread_t self = pthread_self (); |
| |
| + s1 = select_stream_for_async (async1, self, false, NULL); |
| + if (!s1) |
| + return; |
| + |
| /* The stream that is waiting (rather than being waited for) doesn't |
| necessarily have to exist already. */ |
| s2 = select_stream_for_async (async2, self, true, NULL); |
| |
| - s1 = select_stream_for_async (async1, self, false, NULL); |
| - if (!s1) |
| - GOMP_PLUGIN_fatal ("invalid async 1\n"); |
| - |
| + /* A stream is always synchronized with itself. */ |
| if (s1 == s2) |
| - GOMP_PLUGIN_fatal ("identical parameters"); |
| + return; |
| |
| e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); |
| |
| @@ -1629,8 +1820,14 @@ nvptx_set_cuda_stream (int async, void * |
| pthread_t self = pthread_self (); |
| struct nvptx_thread *nvthd = nvptx_thread (); |
| |
| - if (async < 0) |
| - GOMP_PLUGIN_fatal ("bad async %d", async); |
| + /* Due to the "null_stream" usage for "acc_async_sync", this cannot be used |
| + to change the stream handle associated with "acc_async_sync". */ |
| + if (async == acc_async_sync) |
| + { |
| + GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated" |
| + " with \"acc_async_sync\"\n"); |
| + return 0; |
| + } |
| |
| pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); |
| |
| @@ -1739,6 +1936,12 @@ GOMP_OFFLOAD_fini_device (int n) |
| instantiated_devices--; |
| } |
| |
| + if (instantiated_devices == 0) |
| + { |
| + free (ptx_devices); |
| + ptx_devices = NULL; |
| + } |
| + |
| pthread_mutex_unlock (&ptx_dev_lock); |
| return true; |
| } |
| |
| |
| @@ -26,8 +26,6 @@ |
| # see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| # <http://www.gnu.org/licenses/>. |
| |
| -offload_targets= |
| -AC_SUBST(offload_targets) |
| plugin_support=yes |
| AC_CHECK_LIB(dl, dlsym, , [plugin_support=no]) |
| if test x"$plugin_support" = xyes; then |
| @@ -59,7 +57,11 @@ AC_ARG_WITH(cuda-driver-lib, |
| [AS_HELP_STRING([--with-cuda-driver-lib=PATH], |
| [specify directory for the installed CUDA driver library])]) |
| case "x$with_cuda_driver" in |
| - x | xno) ;; |
| + x) ;; |
| + xno) |
| + CUDA_DRIVER_INCLUDE=no |
| + CUDA_DRIVER_LIB=no |
| + ;; |
| *) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include |
| CUDA_DRIVER_LIB=$with_cuda_driver/lib |
| ;; |
| @@ -70,10 +72,12 @@ fi |
| if test "x$with_cuda_driver_lib" != x; then |
| CUDA_DRIVER_LIB=$with_cuda_driver_lib |
| fi |
| -if test "x$CUDA_DRIVER_INCLUDE" != x; then |
| +if test "x$CUDA_DRIVER_INCLUDE" != x \ |
| + && test "x$CUDA_DRIVER_INCLUDE" != xno; then |
| CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE |
| fi |
| -if test "x$CUDA_DRIVER_LIB" != x; then |
| +if test "x$CUDA_DRIVER_LIB" != x \ |
| + && test "x$CUDA_DRIVER_LIB" != xno; then |
| CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB |
| fi |
| |
| @@ -133,7 +137,13 @@ AC_SUBST(PLUGIN_HSA_CPPFLAGS) |
| AC_SUBST(PLUGIN_HSA_LDFLAGS) |
| AC_SUBST(PLUGIN_HSA_LIBS) |
| |
| -# Get offload targets and path to install tree of offloading compiler. |
| +# Parse '--enable-offload-targets', figure out the corresponding libgomp |
| +# plugins, and configure to find the corresponding offload compilers. |
| +# 'offload_plugins' and 'offload_targets' will be populated in the same order. |
| +offload_plugins= |
| +offload_targets= |
| +AC_SUBST(offload_plugins) |
| +AC_SUBST(offload_targets) |
| offload_additional_options= |
| offload_additional_lib_paths= |
| AC_SUBST(offload_additional_options) |
| @@ -142,36 +152,41 @@ if test x"$enable_offload_targets" != x; |
| for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do |
| tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'` |
| tgt=`echo $tgt | sed 's/=.*//'` |
| - tgt_name= |
| + tgt_plugin= |
| case $tgt in |
| *-intelmic-* | *-intelmicemul-*) |
| - tgt_name=intelmic |
| + tgt_plugin=intelmic |
| ;; |
| nvptx*) |
| - tgt_name=nvptx |
| + tgt_plugin=nvptx |
| PLUGIN_NVPTX=$tgt |
| - PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS |
| - PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS |
| - PLUGIN_NVPTX_LIBS='-lcuda' |
| - |
| - PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS |
| - CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" |
| - PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS |
| - LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" |
| - PLUGIN_NVPTX_save_LIBS=$LIBS |
| - LIBS="$PLUGIN_NVPTX_LIBS $LIBS" |
| - AC_LINK_IFELSE( |
| - [AC_LANG_PROGRAM( |
| - [#include "cuda.h"], |
| - [CUresult r = cuCtxPushCurrent (NULL);])], |
| - [PLUGIN_NVPTX=1]) |
| - CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS |
| - LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS |
| - LIBS=$PLUGIN_NVPTX_save_LIBS |
| + if test "x$CUDA_DRIVER_INCLUDE" != xno \ |
| + && test "x$CUDA_DRIVER_LIB" != xno; then |
| + PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS |
| + PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS |
| + PLUGIN_NVPTX_LIBS='-lcuda' |
| + |
| + PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS |
| + CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" |
| + PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS |
| + LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" |
| + PLUGIN_NVPTX_save_LIBS=$LIBS |
| + LIBS="$PLUGIN_NVPTX_LIBS $LIBS" |
| + AC_LINK_IFELSE( |
| + [AC_LANG_PROGRAM( |
| + [#include "cuda.h"], |
| + [CUresult r = cuCtxPushCurrent (NULL);])], |
| + [PLUGIN_NVPTX=1]) |
| + CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS |
| + LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS |
| + LIBS=$PLUGIN_NVPTX_save_LIBS |
| + fi |
| case $PLUGIN_NVPTX in |
| nvptx*) |
| - if test "x$CUDA_DRIVER_INCLUDE" = x \ |
| - && test "x$CUDA_DRIVER_LIB" = x; then |
| + if (test "x$CUDA_DRIVER_INCLUDE" = x \ |
| + || test "x$CUDA_DRIVER_INCLUDE" = xno) \ |
| + && (test "x$CUDA_DRIVER_LIB" = x \ |
| + || test "x$CUDA_DRIVER_LIB" = xno); then |
| PLUGIN_NVPTX=1 |
| PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda' |
| PLUGIN_NVPTX_LIBS='-ldl' |
| @@ -191,7 +206,7 @@ if test x"$enable_offload_targets" != x; |
| PLUGIN_HSA=0 |
| ;; |
| *) |
| - tgt_name=hsa |
| + tgt_plugin=hsa |
| PLUGIN_HSA=$tgt |
| PLUGIN_HSA_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS |
| PLUGIN_HSA_LDFLAGS="$HSA_RUNTIME_LDFLAGS" |
| @@ -209,7 +224,7 @@ if test x"$enable_offload_targets" != x; |
| LDFLAGS=$PLUGIN_HSA_save_LDFLAGS |
| LIBS=$PLUGIN_HSA_save_LIBS |
| case $PLUGIN_HSA in |
| - hsa*) |
| + hsa*) |
| HSA_PLUGIN=0 |
| AC_MSG_ERROR([HSA run-time package required for HSA support]) |
| ;; |
| @@ -226,16 +241,19 @@ if test x"$enable_offload_targets" != x; |
| AC_MSG_ERROR([unknown offload target specified]) |
| ;; |
| esac |
| - if test x"$tgt_name" = x; then |
| - # Don't configure libgomp for this offloading target if we don't build |
| - # the corresponding plugin. |
| + if test x"$tgt_plugin" = x; then |
| + # Not configuring libgomp for this offload target if we're not building |
| + # the corresponding offload plugin. |
| continue |
| - elif test x"$offload_targets" = x; then |
| - offload_targets=$tgt_name |
| + elif test x"$offload_plugins" = x; then |
| + offload_plugins=$tgt_plugin |
| + offload_targets=$tgt |
| else |
| - offload_targets=$offload_targets,$tgt_name |
| + offload_plugins=$offload_plugins,$tgt_plugin |
| + offload_targets=$offload_targets,$tgt |
| fi |
| - if test "$tgt_name" = hsa; then |
| + # Configure additional search paths. |
| + if test "$tgt_plugin" = hsa; then |
| # Offloading compilation is all handled by the target compiler. |
| : |
| elif test x"$tgt_dir" != x; then |
| @@ -247,8 +265,8 @@ if test x"$enable_offload_targets" != x; |
| fi |
| done |
| fi |
| -AC_DEFINE_UNQUOTED(OFFLOAD_TARGETS, "$offload_targets", |
| - [Define to offload targets, separated by commas.]) |
| +AC_DEFINE_UNQUOTED(OFFLOAD_PLUGINS, "$offload_plugins", |
| + [Define to offload plugins, separated by commas.]) |
| AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1]) |
| AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX], |
| [Define to 1 if the NVIDIA plugin is built, 0 if not.]) |
| |
| |
| @@ -0,0 +1,495 @@ |
| +/* Copyright (C) 2018-2019 Free Software Foundation, Inc. |
| + Contributed by Jakub Jelinek <jakub@redhat.com>. |
| + |
| + This file is part of the GNU Offloading and Multi Processing Library |
| + (libgomp). |
| + |
| + Libgomp is free software; you can redistribute it and/or modify it |
| + under the terms of the GNU General Public License as published by |
| + the Free Software Foundation; either version 3, or (at your option) |
| + any later version. |
| + |
| + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY |
| + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| + FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
| + more details. |
| + |
| + Under Section 7 of GPL version 3, you are granted additional |
| + permissions described in the GCC Runtime Library Exception, version |
| + 3.1, as published by the Free Software Foundation. |
| + |
| + You should have received a copy of the GNU General Public License and |
| + a copy of the GCC Runtime Library Exception along with this program; |
| + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| + <http://www.gnu.org/licenses/>. */ |
| + |
| +#include "libgomp.h" |
| +#include <string.h> |
| +#include <stdio.h> |
| +#include <stdlib.h> |
| +#ifdef HAVE_UNISTD_H |
| +#include <unistd.h> |
| +#endif |
| +#ifdef HAVE_INTTYPES_H |
| +# include <inttypes.h> /* For PRIx64. */ |
| +#endif |
| +#ifdef HAVE_UNAME |
| +#include <sys/utsname.h> |
| +#endif |
| + |
| +void |
| +gomp_print_string (const char *str, size_t len) |
| +{ |
| + fwrite (str, 1, len, stderr); |
| +} |
| + |
| +void |
| +gomp_set_affinity_format (const char *format, size_t len) |
| +{ |
| + if (len < gomp_affinity_format_len) |
| + memcpy (gomp_affinity_format_var, format, len); |
| + else |
| + { |
| + char *p; |
| + if (gomp_affinity_format_len) |
| + p = gomp_realloc (gomp_affinity_format_var, len + 1); |
| + else |
| + p = gomp_malloc (len + 1); |
| + memcpy (p, format, len); |
| + gomp_affinity_format_var = p; |
| + gomp_affinity_format_len = len + 1; |
| + } |
| + gomp_affinity_format_var[len] = '\0'; |
| +} |
| + |
| +void |
| +omp_set_affinity_format (const char *format) |
| +{ |
| + gomp_set_affinity_format (format, strlen (format)); |
| +} |
| + |
| +size_t |
| +omp_get_affinity_format (char *buffer, size_t size) |
| +{ |
| + size_t len = strlen (gomp_affinity_format_var); |
| + if (size) |
| + { |
| + if (len < size) |
| + memcpy (buffer, gomp_affinity_format_var, len + 1); |
| + else |
| + { |
| + memcpy (buffer, gomp_affinity_format_var, size - 1); |
| + buffer[size - 1] = '\0'; |
| + } |
| + } |
| + return len; |
| +} |
| + |
| +void |
| +gomp_display_string (char *buffer, size_t size, size_t *ret, |
| + const char *str, size_t len) |
| +{ |
| + size_t r = *ret; |
| + if (size && r < size) |
| + { |
| + size_t l = len; |
| + if (size - r < len) |
| + l = size - r; |
| + memcpy (buffer + r, str, l); |
| + } |
| + *ret += len; |
| + if (__builtin_expect (r > *ret, 0)) |
| + gomp_fatal ("overflow in omp_capture_affinity"); |
| +} |
| + |
| +static void |
| +gomp_display_repeat (char *buffer, size_t size, size_t *ret, |
| + char c, size_t len) |
| +{ |
| + size_t r = *ret; |
| + if (size && r < size) |
| + { |
| + size_t l = len; |
| + if (size - r < len) |
| + l = size - r; |
| + memset (buffer + r, c, l); |
| + } |
| + *ret += len; |
| + if (__builtin_expect (r > *ret, 0)) |
| + gomp_fatal ("overflow in omp_capture_affinity"); |
| +} |
| + |
| +static void |
| +gomp_display_num (char *buffer, size_t size, size_t *ret, |
| + bool zero, bool right, size_t sz, char *buf) |
| +{ |
| + size_t l = strlen (buf); |
| + if (sz == (size_t) -1 || l >= sz) |
| + { |
| + gomp_display_string (buffer, size, ret, buf, l); |
| + return; |
| + } |
| + if (zero) |
| + { |
| + if (buf[0] == '-') |
| + gomp_display_string (buffer, size, ret, buf, 1); |
| + else if (buf[0] == '0' && buf[1] == 'x') |
| + gomp_display_string (buffer, size, ret, buf, 2); |
| + gomp_display_repeat (buffer, size, ret, '0', sz - l); |
| + if (buf[0] == '-') |
| + gomp_display_string (buffer, size, ret, buf + 1, l - 1); |
| + else if (buf[0] == '0' && buf[1] == 'x') |
| + gomp_display_string (buffer, size, ret, buf + 2, l - 2); |
| + else |
| + gomp_display_string (buffer, size, ret, buf, l); |
| + } |
| + else if (right) |
| + { |
| + gomp_display_repeat (buffer, size, ret, ' ', sz - l); |
| + gomp_display_string (buffer, size, ret, buf, l); |
| + } |
| + else |
| + { |
| + gomp_display_string (buffer, size, ret, buf, l); |
| + gomp_display_repeat (buffer, size, ret, ' ', sz - l); |
| + } |
| +} |
| + |
| +static void |
| +gomp_display_int (char *buffer, size_t size, size_t *ret, |
| + bool zero, bool right, size_t sz, int num) |
| +{ |
| + char buf[3 * sizeof (int) + 2]; |
| + sprintf (buf, "%d", num); |
| + gomp_display_num (buffer, size, ret, zero, right, sz, buf); |
| +} |
| + |
| +static void |
| +gomp_display_string_len (char *buffer, size_t size, size_t *ret, |
| + bool right, size_t sz, char *str, size_t len) |
| +{ |
| + if (sz == (size_t) -1 || len >= sz) |
| + { |
| + gomp_display_string (buffer, size, ret, str, len); |
| + return; |
| + } |
| + |
| + if (right) |
| + { |
| + gomp_display_repeat (buffer, size, ret, ' ', sz - len); |
| + gomp_display_string (buffer, size, ret, str, len); |
| + } |
| + else |
| + { |
| + gomp_display_string (buffer, size, ret, str, len); |
| + gomp_display_repeat (buffer, size, ret, ' ', sz - len); |
| + } |
| +} |
| + |
| +static void |
| +gomp_display_hostname (char *buffer, size_t size, size_t *ret, |
| + bool right, size_t sz) |
| +{ |
| +#ifdef HAVE_GETHOSTNAME |
| + { |
| + char buf[256]; |
| + char *b = buf; |
| + size_t len = 256; |
| + do |
| + { |
| + b[len - 1] = '\0'; |
| + if (gethostname (b, len - 1) == 0) |
| + { |
| + size_t l = strlen (b); |
| + if (l < len - 1) |
| + { |
| + gomp_display_string_len (buffer, size, ret, |
| + right, sz, b, l); |
| + if (b != buf) |
| + free (b); |
| + return; |
| + } |
| + } |
| + if (len == 1048576) |
| + break; |
| + len = len * 2; |
| + if (len == 512) |
| + b = gomp_malloc (len); |
| + else |
| + b = gomp_realloc (b, len); |
| + } |
| + while (1); |
| + if (b != buf) |
| + free (b); |
| + } |
| +#endif |
| +#ifdef HAVE_UNAME |
| + { |
| + struct utsname buf; |
| + if (uname (&buf) == 0) |
| + { |
| + gomp_display_string_len (buffer, size, ret, right, sz, |
| + buf.nodename, strlen (buf.nodename)); |
| + return; |
| + } |
| + } |
| +#endif |
| + gomp_display_string_len (buffer, size, ret, right, sz, "node", 4); |
| +} |
| + |
| +struct affinity_types_struct { |
| + char long_str[18]; |
| + char long_len; |
| + char short_c; }; |
| + |
| +static struct affinity_types_struct affinity_types[] = |
| +{ |
| +#define AFFINITY_TYPE(l, s) \ |
| + { #l, sizeof (#l) - 1, s } |
| + AFFINITY_TYPE (team_num, 't'), |
| + AFFINITY_TYPE (num_teams, 'T'), |
| + AFFINITY_TYPE (nesting_level, 'L'), |
| + AFFINITY_TYPE (thread_num, 'n'), |
| + AFFINITY_TYPE (num_threads, 'N'), |
| + AFFINITY_TYPE (ancestor_tnum, 'a'), |
| + AFFINITY_TYPE (host, 'H'), |
| + AFFINITY_TYPE (process_id, 'P'), |
| + AFFINITY_TYPE (native_thread_id, 'i'), |
| + AFFINITY_TYPE (thread_affinity, 'A') |
| +#undef AFFINITY_TYPE |
| +}; |
| + |
| +size_t |
| +gomp_display_affinity (char *buffer, size_t size, |
| + const char *format, gomp_thread_handle handle, |
| + struct gomp_team_state *ts, unsigned int place) |
| +{ |
| + size_t ret = 0; |
| + do |
| + { |
| + const char *p = strchr (format, '%'); |
| + bool zero = false; |
| + bool right = false; |
| + size_t sz = -1; |
| + char c; |
| + int val; |
| + if (p == NULL) |
| + p = strchr (format, '\0'); |
| + if (p != format) |
| + gomp_display_string (buffer, size, &ret, |
| + format, p - format); |
| + if (*p == '\0') |
| + break; |
| + p++; |
| + if (*p == '%') |
| + { |
| + gomp_display_string (buffer, size, &ret, "%", 1); |
| + format = p + 1; |
| + continue; |
| + } |
| + if (*p == '0') |
| + { |
| + zero = true; |
| + p++; |
| + if (*p != '.') |
| + gomp_fatal ("leading zero not followed by dot in affinity format"); |
| + } |
| + if (*p == '.') |
| + { |
| + right = true; |
| + p++; |
| + } |
| + if (*p >= '1' && *p <= '9') |
| + { |
| + char *end; |
| + sz = strtoul (p, &end, 10); |
| + p = end; |
| + } |
| + else if (zero || right) |
| + gomp_fatal ("leading zero or right justification in affinity format " |
| + "requires size"); |
| + c = *p; |
| + if (c == '{') |
| + { |
| + int i; |
| + for (i = 0; |
| + i < sizeof (affinity_types) / sizeof (affinity_types[0]); ++i) |
| + if (strncmp (p + 1, affinity_types[i].long_str, |
| + affinity_types[i].long_len) == 0 |
| + && p[affinity_types[i].long_len + 1] == '}') |
| + { |
| + c = affinity_types[i].short_c; |
| + p += affinity_types[i].long_len + 1; |
| + break; |
| + } |
| + if (c == '{') |
| + { |
| + char *q = strchr (p + 1, '}'); |
| + if (q) |
| + gomp_fatal ("unsupported long type name '%.*s' in affinity " |
| + "format", (int) (q - (p + 1)), p + 1); |
| + else |
| + gomp_fatal ("unterminated long type name '%s' in affinity " |
| + "format", p + 1); |
| + } |
| + } |
| + switch (c) |
| + { |
| + case 't': |
| + val = omp_get_team_num (); |
| + goto do_int; |
| + case 'T': |
| + val = omp_get_num_teams (); |
| + goto do_int; |
| + case 'L': |
| + val = ts->level; |
| + goto do_int; |
| + case 'n': |
| + val = ts->team_id; |
| + goto do_int; |
| + case 'N': |
| + val = ts->team ? ts->team->nthreads : 1; |
| + goto do_int; |
| + case 'a': |
| + val = ts->team ? ts->team->prev_ts.team_id : -1; |
| + goto do_int; |
| + case 'H': |
| + gomp_display_hostname (buffer, size, &ret, right, sz); |
| + break; |
| + case 'P': |
| +#ifdef HAVE_GETPID |
| + val = getpid (); |
| +#else |
| + val = 0; |
| +#endif |
| + goto do_int; |
| + case 'i': |
| +#if defined(LIBGOMP_USE_PTHREADS) && defined(__GNUC__) |
| + { |
| + char buf[3 * (sizeof (handle) + sizeof (uintptr_t) + sizeof (int)) |
| + + 4]; |
| + /* This macro returns expr unmodified for integral or pointer |
| + types and 0 for anything else (e.g. aggregates). */ |
| +#define gomp_nonaggregate(expr) \ |
| + __builtin_choose_expr (__builtin_classify_type (expr) == 1 \ |
| + || __builtin_classify_type (expr) == 5, expr, 0) |
| + /* This macro returns expr unmodified for integral types, |
| + (uintptr_t) (expr) for pointer types and 0 for anything else |
| + (e.g. aggregates). */ |
| +#define gomp_integral(expr) \ |
| + __builtin_choose_expr (__builtin_classify_type (expr) == 5, \ |
| + (uintptr_t) gomp_nonaggregate (expr), \ |
| + gomp_nonaggregate (expr)) |
| + |
| + if (sizeof (gomp_integral (handle)) == sizeof (unsigned long)) |
| + sprintf (buf, "0x%lx", (unsigned long) gomp_integral (handle)); |
| +#if defined (HAVE_INTTYPES_H) && defined (PRIx64) |
| + else if (sizeof (gomp_integral (handle)) == sizeof (uint64_t)) |
| + sprintf (buf, "0x%" PRIx64, (uint64_t) gomp_integral (handle)); |
| +#else |
| + else if (sizeof (gomp_integral (handle)) |
| + == sizeof (unsigned long long)) |
| + sprintf (buf, "0x%llx", |
| + (unsigned long long) gomp_integral (handle)); |
| +#endif |
| + else |
| + sprintf (buf, "0x%x", (unsigned int) gomp_integral (handle)); |
| + gomp_display_num (buffer, size, &ret, zero, right, sz, buf); |
| + break; |
| + } |
| +#else |
| + val = 0; |
| + goto do_int; |
| +#endif |
| + case 'A': |
| + if (sz == (size_t) -1) |
| + gomp_display_affinity_place (buffer, size, &ret, |
| + place - 1); |
| + else if (right) |
| + { |
| + size_t len = 0; |
| + gomp_display_affinity_place (NULL, 0, &len, place - 1); |
| + if (len < sz) |
| + gomp_display_repeat (buffer, size, &ret, ' ', sz - len); |
| + gomp_display_affinity_place (buffer, size, &ret, place - 1); |
| + } |
| + else |
| + { |
| + size_t start = ret; |
| + gomp_display_affinity_place (buffer, size, &ret, place - 1); |
| + if (ret - start < sz) |
| + gomp_display_repeat (buffer, size, &ret, ' ', sz - (ret - start)); |
| + } |
| + break; |
| + do_int: |
| + gomp_display_int (buffer, size, &ret, zero, right, sz, val); |
| + break; |
| + default: |
| + gomp_fatal ("unsupported type %c in affinity format", c); |
| + } |
| + format = p + 1; |
| + } |
| + while (1); |
| + return ret; |
| +} |
| + |
| +size_t |
| +omp_capture_affinity (char *buffer, size_t size, const char *format) |
| +{ |
| + struct gomp_thread *thr = gomp_thread (); |
| + size_t ret |
| + = gomp_display_affinity (buffer, size, |
| + format && *format |
| + ? format : gomp_affinity_format_var, |
| + gomp_thread_self (), &thr->ts, thr->place); |
| + if (size) |
| + { |
| + if (ret >= size) |
| + buffer[size - 1] = '\0'; |
| + else |
| + buffer[ret] = '\0'; |
| + } |
| + return ret; |
| +} |
| +ialias (omp_capture_affinity) |
| + |
| +void |
| +omp_display_affinity (const char *format) |
| +{ |
| + char buf[512]; |
| + char *b; |
| + size_t ret = ialias_call (omp_capture_affinity) (buf, sizeof buf, format); |
| + if (ret < sizeof buf) |
| + { |
| + buf[ret] = '\n'; |
| + gomp_print_string (buf, ret + 1); |
| + return; |
| + } |
| + b = gomp_malloc (ret + 1); |
| + ialias_call (omp_capture_affinity) (b, ret + 1, format); |
| + b[ret] = '\n'; |
| + gomp_print_string (b, ret + 1); |
| + free (b); |
| +} |
| + |
| +void |
| +gomp_display_affinity_thread (gomp_thread_handle handle, |
| + struct gomp_team_state *ts, unsigned int place) |
| +{ |
| + char buf[512]; |
| + char *b; |
| + size_t ret = gomp_display_affinity (buf, sizeof buf, gomp_affinity_format_var, |
| + handle, ts, place); |
| + if (ret < sizeof buf) |
| + { |
| + buf[ret] = '\n'; |
| + gomp_print_string (buf, ret + 1); |
| + return; |
| + } |
| + b = gomp_malloc (ret + 1); |
| + gomp_display_affinity (b, ret + 1, gomp_affinity_format_var, |
| + handle, ts, place); |
| + b[ret] = '\n'; |
| + gomp_print_string (b, ret + 1); |
| + free (b); |
| +} |
| |
| |
| @@ -47,7 +47,7 @@ GOMP_single_start (void) |
| return __sync_bool_compare_and_swap (&team->single_count, single_count, |
| single_count + 1L); |
| #else |
| - bool ret = gomp_work_share_start (false); |
| + bool ret = gomp_work_share_start (0); |
| if (ret) |
| gomp_work_share_init_done (); |
| gomp_work_share_end_nowait (); |
| @@ -68,7 +68,7 @@ GOMP_single_copy_start (void) |
| bool first; |
| void *ret; |
| |
| - first = gomp_work_share_start (false); |
| + first = gomp_work_share_start (0); |
| |
| if (first) |
| { |
| |
| |
| @@ -58,7 +58,7 @@ acc_get_cuda_stream (int async) |
| { |
| struct goacc_thread *thr = goacc_thread (); |
| |
| - if (async < 0) |
| + if (!async_valid_p (async)) |
| return NULL; |
| |
| if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func) |
| @@ -72,7 +72,7 @@ acc_set_cuda_stream (int async, void *st |
| { |
| struct goacc_thread *thr; |
| |
| - if (async < 0 || stream == NULL) |
| + if (!async_valid_p (async) || stream == NULL) |
| return 0; |
| |
| goacc_lazy_initialize (); |
| |
| |
| @@ -76,7 +76,15 @@ alloc_work_share (struct gomp_team *team |
| #endif |
| |
| team->work_share_chunk *= 2; |
| + /* Allocating gomp_work_share structures aligned is just an |
| + optimization, don't do it when using the fallback method. */ |
| +#ifdef GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC |
| + ws = gomp_aligned_alloc (__alignof (struct gomp_work_share), |
| + team->work_share_chunk |
| + * sizeof (struct gomp_work_share)); |
| +#else |
| ws = gomp_malloc (team->work_share_chunk * sizeof (struct gomp_work_share)); |
| +#endif |
| ws->next_alloc = team->work_shares[0].next_alloc; |
| team->work_shares[0].next_alloc = ws; |
| team->work_share_list_alloc = &ws[1]; |
| @@ -90,30 +98,35 @@ alloc_work_share (struct gomp_team *team |
| This shouldn't touch the next_alloc field. */ |
| |
| void |
| -gomp_init_work_share (struct gomp_work_share *ws, bool ordered, |
| +gomp_init_work_share (struct gomp_work_share *ws, size_t ordered, |
| unsigned nthreads) |
| { |
| gomp_mutex_init (&ws->lock); |
| if (__builtin_expect (ordered, 0)) |
| { |
| -#define INLINE_ORDERED_TEAM_IDS_CNT \ |
| - ((sizeof (struct gomp_work_share) \ |
| - - offsetof (struct gomp_work_share, inline_ordered_team_ids)) \ |
| - / sizeof (((struct gomp_work_share *) 0)->inline_ordered_team_ids[0])) |
| - |
| - if (nthreads > INLINE_ORDERED_TEAM_IDS_CNT) |
| - ws->ordered_team_ids |
| - = gomp_malloc (nthreads * sizeof (*ws->ordered_team_ids)); |
| +#define INLINE_ORDERED_TEAM_IDS_SIZE \ |
| + (sizeof (struct gomp_work_share) \ |
| + - offsetof (struct gomp_work_share, inline_ordered_team_ids)) |
| + |
| + if (__builtin_expect (ordered != 1, 0)) |
| + { |
| + ordered += nthreads * sizeof (*ws->ordered_team_ids) - 1; |
| + ordered = ordered + __alignof__ (long long) - 1; |
| + ordered &= ~(__alignof__ (long long) - 1); |
| + } |
| + else |
| + ordered = nthreads * sizeof (*ws->ordered_team_ids); |
| + if (ordered > INLINE_ORDERED_TEAM_IDS_SIZE) |
| + ws->ordered_team_ids = gomp_malloc (ordered); |
| else |
| ws->ordered_team_ids = ws->inline_ordered_team_ids; |
| - memset (ws->ordered_team_ids, '\0', |
| - nthreads * sizeof (*ws->ordered_team_ids)); |
| + memset (ws->ordered_team_ids, '\0', ordered); |
| ws->ordered_num_used = 0; |
| ws->ordered_owner = -1; |
| ws->ordered_cur = 0; |
| } |
| else |
| - ws->ordered_team_ids = NULL; |
| + ws->ordered_team_ids = ws->inline_ordered_team_ids; |
| gomp_ptrlock_init (&ws->next_ws, NULL); |
| ws->threads_completed = 0; |
| } |
| @@ -166,7 +179,7 @@ free_work_share (struct gomp_team *team, |
| if this was the first thread to reach this point. */ |
| |
| bool |
| -gomp_work_share_start (bool ordered) |
| +gomp_work_share_start (size_t ordered) |
| { |
| struct gomp_thread *thr = gomp_thread (); |
| struct gomp_team *team = thr->ts.team; |
| @@ -178,7 +191,7 @@ gomp_work_share_start (bool ordered) |
| ws = gomp_malloc (sizeof (*ws)); |
| gomp_init_work_share (ws, ordered, 1); |
| thr->ts.work_share = ws; |
| - return ws; |
| + return true; |
| } |
| |
| ws = thr->ts.work_share; |
| |
| |
| @@ -189,6 +189,7 @@ enum gomp_map_kind |
| #define GOMP_TASK_FLAG_GRAINSIZE (1 << 9) |
| #define GOMP_TASK_FLAG_IF (1 << 10) |
| #define GOMP_TASK_FLAG_NOGROUP (1 << 11) |
| +#define GOMP_TASK_FLAG_REDUCTION (1 << 12) |
| |
| /* GOMP_target{_ext,update_ext,enter_exit_data} flags argument. */ |
| #define GOMP_TARGET_FLAG_NOWAIT (1 << 0) |
| @@ -196,6 +197,18 @@ enum gomp_map_kind |
| /* Internal to libgomp. */ |
| #define GOMP_TARGET_FLAG_UPDATE (1U << 31) |
| |
| + |
| +/* OpenACC construct flags. */ |
| + |
| +/* Force host fallback execution. */ |
| +#define GOACC_FLAG_HOST_FALLBACK (1 << 0) |
| + |
| +/* For legacy reasons, in the ABI, the GOACC_FLAGs are encoded as an inverted |
| + bitmask. */ |
| +#define GOACC_FLAGS_MARSHAL_OP BIT_NOT_EXPR |
| +#define GOACC_FLAGS_UNMARSHAL(X) (~(X)) |
| + |
| + |
| /* Versions of libgomp and device-specific plugins. GOMP_VERSION |
| should be incremented whenever an ABI-incompatible change is introduced |
| to the plugin interface defined in libgomp/libgomp.h. */ |
| @@ -251,6 +264,12 @@ enum gomp_map_kind |
| at most and shifted by this many bits. */ |
| #define GOMP_TARGET_ARG_VALUE_SHIFT 16 |
| |
| +/* Dependence types in omp_depend_t objects. */ |
| +#define GOMP_DEPEND_IN 1 |
| +#define GOMP_DEPEND_OUT 2 |
| +#define GOMP_DEPEND_INOUT 3 |
| +#define GOMP_DEPEND_MUTEXINOUTSET 4 |
| + |
| /* HSA specific data structures. */ |
| |
| /* Identifiers of device-specific target arguments. */ |