--- libgomp/config/linux/wait.h.jj	2013-01-31 20:29:10.091548989 +0100
+++ libgomp/config/linux/wait.h	2016-07-13 16:57:18.902355979 +0200
@@ -34,13 +34,13 @@
 #define FUTEX_WAIT	0
 #define FUTEX_WAKE	1
-#define FUTEX_PRIVATE_FLAG	128L
+#define FUTEX_PRIVATE_FLAG	128
 
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility push(hidden)
 #endif
 
-extern long int gomp_futex_wait, gomp_futex_wake;
+extern int gomp_futex_wait, gomp_futex_wake;
 
 #include <futex.h>
 
@@ -48,7 +48,9 @@ static inline int do_spin (int *addr, in
 {
   unsigned long long i, count = gomp_spin_count_var;
 
-  if (__builtin_expect (gomp_managed_threads > gomp_available_cpus, 0))
+  if (__builtin_expect (__atomic_load_n (&gomp_managed_threads,
+                                         MEMMODEL_RELAXED)
+                        > gomp_available_cpus, 0))
     count = gomp_throttled_spin_count_var;
   for (i = 0; i < count; i++)
     if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) != val, 0))
--- libgomp/config/linux/affinity.c.jj	2014-05-15 10:56:37.499502573 +0200
+++ libgomp/config/linux/affinity.c	2016-07-13 16:57:18.902355979 +0200
@@ -352,6 +352,45 @@ gomp_affinity_print_place (void *p)
     fprintf (stderr, ":%lu", len);
 }
 
+int
+omp_get_place_num_procs (int place_num)
+{
+  if (place_num < 0 || place_num >= gomp_places_list_len)
+    return 0;
+
+  cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num];
+  return gomp_cpuset_popcount (gomp_cpuset_size, cpusetp);
+}
+
+void
+omp_get_place_proc_ids (int place_num, int *ids)
+{
+  if (place_num < 0 || place_num >= gomp_places_list_len)
+    return;
+
+  cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num];
+  unsigned long i, max = 8 * gomp_cpuset_size;
+  for (i = 0; i < max; i++)
+    if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp))
+      *ids++ = i;
+}
+
+void
+gomp_get_place_proc_ids_8 (int place_num, int64_t *ids)
+{
+  if (place_num < 0 || place_num >= gomp_places_list_len)
+    return;
+
+  cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num];
+  unsigned long i, max = 8 * gomp_cpuset_size;
+  for (i = 0; i < max; i++)
+    if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp))
+      *ids++ = i;
+}
+
+ialias(omp_get_place_num_procs)
+ialias(omp_get_place_proc_ids)
+
 #else
 
 #include "../posix/affinity.c"
--- libgomp/config/linux/mutex.c.jj	2013-01-21 16:00:38.220917670 +0100
+++ libgomp/config/linux/mutex.c	2016-07-13 16:57:18.870356375 +0200
@@ -28,8 +28,8 @@
 
 #include "wait.h"
 
-long int gomp_futex_wake = FUTEX_WAKE | FUTEX_PRIVATE_FLAG;
-long int gomp_futex_wait = FUTEX_WAIT | FUTEX_PRIVATE_FLAG;
+int gomp_futex_wake = FUTEX_WAKE | FUTEX_PRIVATE_FLAG;
+int gomp_futex_wait = FUTEX_WAIT | FUTEX_PRIVATE_FLAG;
 
 void
 gomp_mutex_lock_slow (gomp_mutex_t *mutex, int oldval)
--- libgomp/config/posix/affinity.c.jj	2014-05-15 10:56:37.987498844 +0200
+++ libgomp/config/posix/affinity.c	2016-07-15 12:08:28.410015743 +0200
@@ -113,3 +113,27 @@ gomp_affinity_print_place (void *p)
 {
   (void) p;
 }
+
+int
+omp_get_place_num_procs (int place_num)
+{
+  (void) place_num;
+  return 0;
+}
+
+void
+omp_get_place_proc_ids (int place_num, int *ids)
+{
+  (void) place_num;
+  (void) ids;
+}
+
+void
+gomp_get_place_proc_ids_8 (int place_num, int64_t *ids)
+{
+  (void) place_num;
+  (void) ids;
+}
+
+ialias(omp_get_place_num_procs)
+ialias(omp_get_place_proc_ids)
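The place API entry points added above come in a matched pair: omp_get_place_num_procs reports how many processors a place contains, and omp_get_place_proc_ids fills a caller-supplied array with their ids (the Linux version reads the cpu_set_t behind the place; the POSIX stub reports zero processors). A minimal usage sketch, assuming OpenMP 4.5 declarations in <omp.h> and a places-enabled run (e.g. OMP_PLACES=cores); place number 0 and the buffer handling are illustrative only:

    #include <omp.h>
    #include <stdio.h>
    #include <stdlib.h>

    int
    main (void)
    {
      /* Size the id buffer first, then fetch the ids of place 0.  */
      int nprocs = omp_get_place_num_procs (0);
      if (nprocs > 0)
        {
          int *ids = malloc (nprocs * sizeof (int));
          omp_get_place_proc_ids (0, ids);
          for (int i = 0; i < nprocs; i++)
            printf ("place 0: proc %d\n", ids[i]);
          free (ids);
        }
      return 0;
    }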
--- libgomp/loop_ull.c.jj	2013-01-21 16:00:46.477871806 +0100
+++ libgomp/loop_ull.c	2016-07-13 16:57:18.918355780 +0200
@@ -174,15 +174,15 @@ GOMP_loop_ull_runtime_start (bool up, go
     {
     case GFS_STATIC:
       return gomp_loop_ull_static_start (up, start, end, incr,
-                                         icv->run_sched_modifier,
+                                         icv->run_sched_chunk_size,
                                          istart, iend);
     case GFS_DYNAMIC:
       return gomp_loop_ull_dynamic_start (up, start, end, incr,
-                                          icv->run_sched_modifier,
+                                          icv->run_sched_chunk_size,
                                           istart, iend);
     case GFS_GUIDED:
       return gomp_loop_ull_guided_start (up, start, end, incr,
-                                         icv->run_sched_modifier,
+                                         icv->run_sched_chunk_size,
                                          istart, iend);
     case GFS_AUTO:
       /* For now map to schedule(static), later on we could play with feedback
@@ -278,15 +278,15 @@ GOMP_loop_ull_ordered_runtime_start (boo
     {
     case GFS_STATIC:
       return gomp_loop_ull_ordered_static_start (up, start, end, incr,
-                                                 icv->run_sched_modifier,
+                                                 icv->run_sched_chunk_size,
                                                  istart, iend);
     case GFS_DYNAMIC:
       return gomp_loop_ull_ordered_dynamic_start (up, start, end, incr,
-                                                  icv->run_sched_modifier,
+                                                  icv->run_sched_chunk_size,
                                                   istart, iend);
     case GFS_GUIDED:
       return gomp_loop_ull_ordered_guided_start (up, start, end, incr,
-                                                 icv->run_sched_modifier,
+                                                 icv->run_sched_chunk_size,
                                                  istart, iend);
     case GFS_AUTO:
       /* For now map to schedule(static), later on we could play with feedback
@@ -298,6 +298,114 @@ GOMP_loop_ull_ordered_runtime_start (boo
     }
 }
 
+/* The *_doacross_*_start routines are similar.  The only difference is that
+   this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
+   section, the worksharing loop always iterates from 0 to COUNTS[0] - 1,
+   and the other COUNTS array elements tell the library the number of
+   iterations in the ordered inner loops.  */
+
+static bool
+gomp_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts,
+                                     gomp_ull chunk_size, gomp_ull *istart,
+                                     gomp_ull *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  thr->ts.static_trip = 0;
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+                          GFS_STATIC, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+  return !gomp_iter_ull_static_next (istart, iend);
+}
+
+static bool
+gomp_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts,
+                                      gomp_ull chunk_size, gomp_ull *istart,
+                                      gomp_ull *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  bool ret;
+
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+                          GFS_DYNAMIC, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+#if defined HAVE_SYNC_BUILTINS && defined __LP64__
+  ret = gomp_iter_ull_dynamic_next (istart, iend);
+#else
+  gomp_mutex_lock (&thr->ts.work_share->lock);
+  ret = gomp_iter_ull_dynamic_next_locked (istart, iend);
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+  return ret;
+}
+
+static bool
+gomp_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts,
+                                     gomp_ull chunk_size, gomp_ull *istart,
+                                     gomp_ull *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  bool ret;
+
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+                          GFS_GUIDED, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+#if defined HAVE_SYNC_BUILTINS && defined __LP64__
+  ret = gomp_iter_ull_guided_next (istart, iend);
+#else
+  gomp_mutex_lock (&thr->ts.work_share->lock);
+  ret = gomp_iter_ull_guided_next_locked (istart, iend);
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+  return ret;
+}
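As the comment above says, these *_doacross_*_start routines expect an ORDERED(N) doacross work-share. For orientation, a source-level sketch of the kind of loop nest the compiler lowers onto the doacross start/post/wait entry points; the array, bounds, and dependence pattern are invented for illustration:

    #define N 64
    extern int a[N][N];

    void
    wavefront (void)
    {
      int i, j;
    #pragma omp parallel for ordered(2)
      for (i = 1; i < N; i++)
        for (j = 1; j < N; j++)
          {
    #pragma omp ordered depend(sink: i - 1, j) depend(sink: i, j - 1)
            a[i][j] += a[i - 1][j] + a[i][j - 1];
    #pragma omp ordered depend(source)
          }
    }

Each depend(sink: ...) maps onto a GOMP_doacross_wait call and the depend(source) onto GOMP_doacross_post; the _ull variants are used when the collapsed iteration space needs unsigned long long.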
+
+bool
+GOMP_loop_ull_doacross_runtime_start (unsigned ncounts, gomp_ull *counts,
+                                      gomp_ull *istart, gomp_ull *iend)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  switch (icv->run_sched_var)
+    {
+    case GFS_STATIC:
+      return gomp_loop_ull_doacross_static_start (ncounts, counts,
+                                                  icv->run_sched_chunk_size,
+                                                  istart, iend);
+    case GFS_DYNAMIC:
+      return gomp_loop_ull_doacross_dynamic_start (ncounts, counts,
+                                                   icv->run_sched_chunk_size,
+                                                   istart, iend);
+    case GFS_GUIDED:
+      return gomp_loop_ull_doacross_guided_start (ncounts, counts,
+                                                  icv->run_sched_chunk_size,
+                                                  istart, iend);
+    case GFS_AUTO:
+      /* For now map to schedule(static), later on we could play with feedback
+         driven choice.  */
+      return gomp_loop_ull_doacross_static_start (ncounts, counts,
+                                                  0, istart, iend);
+    default:
+      abort ();
+    }
+}
+
 /* The *_next routines are called when the thread completes processing of
    the iteration block currently assigned to it.  If the work-share
    construct is bound directly to a parallel construct, then the iteration
@@ -457,6 +565,10 @@ extern __typeof(gomp_loop_ull_dynamic_st
 	__attribute__((alias ("gomp_loop_ull_dynamic_start")));
 extern __typeof(gomp_loop_ull_guided_start) GOMP_loop_ull_guided_start
 	__attribute__((alias ("gomp_loop_ull_guided_start")));
+extern __typeof(gomp_loop_ull_dynamic_start) GOMP_loop_ull_nonmonotonic_dynamic_start
+	__attribute__((alias ("gomp_loop_ull_dynamic_start")));
+extern __typeof(gomp_loop_ull_guided_start) GOMP_loop_ull_nonmonotonic_guided_start
+	__attribute__((alias ("gomp_loop_ull_guided_start")));
 
 extern __typeof(gomp_loop_ull_ordered_static_start) GOMP_loop_ull_ordered_static_start
 	__attribute__((alias ("gomp_loop_ull_ordered_static_start")));
@@ -465,12 +577,23 @@ extern __typeof(gomp_loop_ull_ordered_dy
 extern __typeof(gomp_loop_ull_ordered_guided_start) GOMP_loop_ull_ordered_guided_start
 	__attribute__((alias ("gomp_loop_ull_ordered_guided_start")));
 
+extern __typeof(gomp_loop_ull_doacross_static_start) GOMP_loop_ull_doacross_static_start
+	__attribute__((alias ("gomp_loop_ull_doacross_static_start")));
+extern __typeof(gomp_loop_ull_doacross_dynamic_start) GOMP_loop_ull_doacross_dynamic_start
+	__attribute__((alias ("gomp_loop_ull_doacross_dynamic_start")));
+extern __typeof(gomp_loop_ull_doacross_guided_start) GOMP_loop_ull_doacross_guided_start
+	__attribute__((alias ("gomp_loop_ull_doacross_guided_start")));
+
 extern __typeof(gomp_loop_ull_static_next) GOMP_loop_ull_static_next
 	__attribute__((alias ("gomp_loop_ull_static_next")));
 extern __typeof(gomp_loop_ull_dynamic_next) GOMP_loop_ull_dynamic_next
 	__attribute__((alias ("gomp_loop_ull_dynamic_next")));
 extern __typeof(gomp_loop_ull_guided_next) GOMP_loop_ull_guided_next
 	__attribute__((alias ("gomp_loop_ull_guided_next")));
+extern __typeof(gomp_loop_ull_dynamic_next) GOMP_loop_ull_nonmonotonic_dynamic_next
+	__attribute__((alias ("gomp_loop_ull_dynamic_next")));
+extern __typeof(gomp_loop_ull_guided_next) GOMP_loop_ull_nonmonotonic_guided_next
+	__attribute__((alias ("gomp_loop_ull_guided_next")));
 
 extern __typeof(gomp_loop_ull_ordered_static_next) GOMP_loop_ull_ordered_static_next
 	__attribute__((alias ("gomp_loop_ull_ordered_static_next")));
@@ -507,6 +630,25 @@ GOMP_loop_ull_guided_start (bool up, gom
 }
 
 bool
+GOMP_loop_ull_nonmonotonic_dynamic_start (bool up, gomp_ull start,
+                                          gomp_ull end, gomp_ull incr,
+                                          gomp_ull chunk_size,
+                                          gomp_ull *istart, gomp_ull *iend)
+{
+  return gomp_loop_ull_dynamic_start (up, start, end, incr, chunk_size, istart,
+                                      iend);
+}
+
+bool
+GOMP_loop_ull_nonmonotonic_guided_start (bool up, gomp_ull start, gomp_ull end,
+                                         gomp_ull incr, gomp_ull chunk_size,
+                                         gomp_ull *istart, gomp_ull *iend)
+{
+  return gomp_loop_ull_guided_start (up, start, end, incr, chunk_size,
+                                     istart, iend);
+}
+
+bool
 GOMP_loop_ull_ordered_static_start (bool up, gomp_ull start, gomp_ull end,
                                     gomp_ull incr, gomp_ull chunk_size,
                                     gomp_ull *istart, gomp_ull *iend)
@@ -534,6 +676,33 @@ GOMP_loop_ull_ordered_guided_start (bool
 }
 
 bool
+GOMP_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts,
+                                     gomp_ull chunk_size, gomp_ull *istart,
+                                     gomp_ull *iend)
+{
+  return gomp_loop_ull_doacross_static_start (ncounts, counts, chunk_size,
+                                              istart, iend);
+}
+
+bool
+GOMP_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts,
+                                      gomp_ull chunk_size, gomp_ull *istart,
+                                      gomp_ull *iend)
+{
+  return gomp_loop_ull_doacross_dynamic_start (ncounts, counts, chunk_size,
+                                               istart, iend);
+}
+
+bool
+GOMP_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts,
+                                     gomp_ull chunk_size, gomp_ull *istart,
+                                     gomp_ull *iend)
+{
+  return gomp_loop_ull_doacross_guided_start (ncounts, counts, chunk_size,
+                                              istart, iend);
+}
+
+bool
 GOMP_loop_ull_static_next (gomp_ull *istart, gomp_ull *iend)
 {
   return gomp_loop_ull_static_next (istart, iend);
@@ -550,6 +719,18 @@ GOMP_loop_ull_guided_next (gomp_ull *ist
 {
   return gomp_loop_ull_guided_next (istart, iend);
 }
+
+bool
+GOMP_loop_ull_nonmonotonic_dynamic_next (gomp_ull *istart, gomp_ull *iend)
+{
+  return gomp_loop_ull_dynamic_next (istart, iend);
+}
+
+bool
+GOMP_loop_ull_nonmonotonic_guided_next (gomp_ull *istart, gomp_ull *iend)
+{
+  return gomp_loop_ull_guided_next (istart, iend);
+}
 
 bool
 GOMP_loop_ull_ordered_static_next (gomp_ull *istart, gomp_ull *iend)
--- libgomp/team.c.jj	2014-05-15 10:56:32.092524669 +0200
+++ libgomp/team.c	2016-07-13 17:58:01.907291111 +0200
@@ -133,6 +133,25 @@ gomp_thread_start (void *xdata)
   return NULL;
 }
 
+static inline struct gomp_team *
+get_last_team (unsigned nthreads)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  if (thr->ts.team == NULL)
+    {
+      struct gomp_thread_pool *pool = thr->thread_pool;
+      if (pool != NULL)
+        {
+          struct gomp_team *last_team = pool->last_team;
+          if (last_team != NULL && last_team->nthreads == nthreads)
+            {
+              pool->last_team = NULL;
+              return last_team;
+            }
+        }
+    }
+  return NULL;
+}
+
 /* Create a new team data structure.  */
 
@@ -140,18 +159,27 @@
 struct gomp_team *
 gomp_new_team (unsigned nthreads)
 {
   struct gomp_team *team;
-  size_t size;
   int i;
 
-  size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0])
-                                      + sizeof (team->implicit_task[0]));
-  team = gomp_malloc (size);
+  team = get_last_team (nthreads);
+  if (team == NULL)
+    {
+      size_t extra = sizeof (team->ordered_release[0])
+                     + sizeof (team->implicit_task[0]);
+      team = gomp_malloc (sizeof (*team) + nthreads * extra);
+
+#ifndef HAVE_SYNC_BUILTINS
+      gomp_mutex_init (&team->work_share_list_free_lock);
+#endif
+      gomp_barrier_init (&team->barrier, nthreads);
+      gomp_mutex_init (&team->task_lock);
+
+      team->nthreads = nthreads;
+    }
 
   team->work_share_chunk = 8;
 #ifdef HAVE_SYNC_BUILTINS
   team->single_count = 0;
-#else
-  gomp_mutex_init (&team->work_share_list_free_lock);
 #endif
   team->work_shares_to_free = &team->work_shares[0];
   gomp_init_work_share (&team->work_shares[0], false, nthreads);
@@ -162,15 +190,11 @@ gomp_new_team (unsigned nthreads)
     team->work_shares[i].next_free = &team->work_shares[i + 1];
   team->work_shares[i].next_free = NULL;
 
-  team->nthreads = nthreads;
-  gomp_barrier_init (&team->barrier, nthreads);
-
   gomp_sem_init (&team->master_release, 0);
   team->ordered_release = (void *) &team->implicit_task[nthreads];
   team->ordered_release[0] = &team->master_release;
 
-  gomp_mutex_init (&team->task_lock);
-  team->task_queue = NULL;
+  priority_queue_init (&team->task_queue);
   team->task_count = 0;
   team->task_queued_count = 0;
   team->task_running_count = 0;
@@ -186,8 +210,12 @@ gomp_new_team (unsigned nthreads)
 static void
 free_team (struct gomp_team *team)
 {
+#ifndef HAVE_SYNC_BUILTINS
+  gomp_mutex_destroy (&team->work_share_list_free_lock);
+#endif
   gomp_barrier_destroy (&team->barrier);
   gomp_mutex_destroy (&team->task_lock);
+  priority_queue_free (&team->task_queue);
   free (team);
 }
 
@@ -258,6 +286,8 @@ gomp_free_thread (void *arg __attribute_
       free (pool);
       thr->thread_pool = NULL;
     }
+  if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))
+    gomp_team_end ();
   if (thr->task != NULL)
     {
       struct gomp_task *task = thr->task;
@@ -287,7 +317,7 @@ gomp_team_start (void (*fn) (void *), vo
   struct gomp_thread **affinity_thr = NULL;
 
   thr = gomp_thread ();
-  nested = thr->ts.team != NULL;
+  nested = thr->ts.level;
   if (__builtin_expect (thr->thread_pool == NULL, 0))
     {
       thr->thread_pool = gomp_new_thread_pool ();
@@ -894,9 +924,6 @@ gomp_team_end (void)
 	  while (ws != NULL);
 	}
       gomp_sem_destroy (&team->master_release);
-#ifndef HAVE_SYNC_BUILTINS
-  gomp_mutex_destroy (&team->work_share_list_free_lock);
-#endif
 
   if (__builtin_expect (thr->ts.team != NULL, 0)
       || __builtin_expect (team->nthreads == 1, 0))
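The get_last_team path above lets gomp_new_team hand back the pool's cached last_team instead of reallocating when a non-nested region restarts with the same thread count; barrier, task-lock, and nthreads initialization correspondingly move under the allocation branch so they run only for a freshly malloc'd team. A sketch of the pattern that benefits (purely illustrative):

    void
    repeated_regions (void)
    {
      for (int iter = 0; iter < 1000; iter++)
        {
    #pragma omp parallel num_threads (4)
          {
            /* Work.  After the first iteration the team structure for
               4 threads can be recycled from pool->last_team.  */
          }
        }
    }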
--- libgomp/target.c.jj	2014-05-15 10:56:38.313498020 +0200
+++ libgomp/target.c	2016-07-15 16:58:29.249328861 +0200
@@ -22,14 +22,22 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This file handles the maintainence of threads in response to team
-   creation and termination.  */
+/* This file contains the support for offloading.  */
 
+#include "config.h"
 #include "libgomp.h"
+#include "oacc-plugin.h"
+#include "oacc-int.h"
+#include "gomp-constants.h"
 #include <limits.h>
 #include <stdbool.h>
 #include <stdlib.h>
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>  /* For PRIu64.  */
+#endif
 #include <string.h>
+#include <assert.h>
+#include <errno.h>
 
 attribute_hidden int
 gomp_get_num_devices (void)
@@ -37,22 +45,87 @@ gomp_get_num_devices (void)
   return 0;
 }
 
-/* Called when encountering a target directive.  If DEVICE
-   is -1, it means use device-var ICV.  If it is -2 (or any other value
-   larger than last available hw device, use host fallback.
-   FN is address of host code, OPENMP_TARGET contains value of the
-   __OPENMP_TARGET__ symbol in the shared library or binary that invokes
-   GOMP_target.  HOSTADDRS, SIZES and KINDS are arrays
-   with MAPNUM entries, with addresses of the host objects,
-   sizes of the host objects (resp. for pointer kind pointer bias
-   and assumed sizeof (void *) size) and kinds.  */
+/* This function should be called from every offload image while loading.
+   It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of
+   the target, and TARGET_DATA needed by the target plugin.  */
 
 void
-GOMP_target (int device, void (*fn) (void *), const void *openmp_target,
-	     size_t mapnum, void **hostaddrs, size_t *sizes,
-	     unsigned char *kinds)
+GOMP_offload_register_ver (unsigned version, const void *host_table,
+                           int target_type, const void *target_data)
+{
+  (void) version;
+  (void) host_table;
+  (void) target_type;
+  (void) target_data;
+}
+
+void
+GOMP_offload_register (const void *host_table, int target_type,
+                       const void *target_data)
+{
+  (void) host_table;
+  (void) target_type;
+  (void) target_data;
+}
+
+/* This function should be called from every offload image while unloading.
+   It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of
+   the target, and TARGET_DATA needed by the target plugin.  */
+
+void
+GOMP_offload_unregister_ver (unsigned version, const void *host_table,
+                             int target_type, const void *target_data)
+{
+  (void) version;
+  (void) host_table;
+  (void) target_type;
+  (void) target_data;
+}
+
+void
+GOMP_offload_unregister (const void *host_table, int target_type,
+                         const void *target_data)
+{
+  (void) host_table;
+  (void) target_type;
+  (void) target_data;
+}
+
+/* This function initializes the target device, specified by DEVICEP.  DEVICEP
+   must be locked on entry, and remains locked on return.  */
+
+attribute_hidden void
+gomp_init_device (struct gomp_device_descr *devicep)
+{
+  devicep->state = GOMP_DEVICE_INITIALIZED;
+}
+
+attribute_hidden void
+gomp_unload_device (struct gomp_device_descr *devicep)
+{
+}
+
+/* Free address mapping tables.  MM must be locked on entry, and remains
+   locked on return.  */
+
+attribute_hidden void
+gomp_free_memmap (struct splay_tree_s *mem_map)
+{
+  while (mem_map->root)
+    {
+      struct target_mem_desc *tgt = mem_map->root->key.tgt;
+
+      splay_tree_remove (mem_map, &mem_map->root->key);
+      free (tgt->array);
+      free (tgt);
+    }
+}
+
+/* Host fallback for GOMP_target{,_ext} routines.  */
+
+static void
+gomp_target_fallback (void (*fn) (void *), void **hostaddrs)
 {
-  /* Host fallback.  */
   struct gomp_thread old_thr, *thr = gomp_thread ();
   old_thr = *thr;
   memset (thr, '\0', sizeof (*thr));
@@ -66,10 +139,167 @@ GOMP_target (int device, void (*fn) (voi
   *thr = old_thr;
 }
 
+/* Calculate alignment and size requirements of a private copy of data shared
+   as GOMP_MAP_FIRSTPRIVATE and store them to TGT_ALIGN and TGT_SIZE.  */
+
+static inline void
+calculate_firstprivate_requirements (size_t mapnum, size_t *sizes,
+                                     unsigned short *kinds, size_t *tgt_align,
+                                     size_t *tgt_size)
+{
+  size_t i;
+  for (i = 0; i < mapnum; i++)
+    if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+      {
+        size_t align = (size_t) 1 << (kinds[i] >> 8);
+        if (*tgt_align < align)
+          *tgt_align = align;
+        *tgt_size = (*tgt_size + align - 1) & ~(align - 1);
+        *tgt_size += sizes[i];
+      }
+}
+
+/* Copy data shared as GOMP_MAP_FIRSTPRIVATE to DST.  */
+
+static inline void
+copy_firstprivate_data (char *tgt, size_t mapnum, void **hostaddrs,
+                        size_t *sizes, unsigned short *kinds, size_t tgt_align,
+                        size_t tgt_size)
+{
+  uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
+  if (al)
+    tgt += tgt_align - al;
+  tgt_size = 0;
+  size_t i;
+  for (i = 0; i < mapnum; i++)
+    if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+      {
+        size_t align = (size_t) 1 << (kinds[i] >> 8);
+        tgt_size = (tgt_size + align - 1) & ~(align - 1);
+        memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
+        hostaddrs[i] = tgt + tgt_size;
+        tgt_size = tgt_size + sizes[i];
+      }
+}
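A worked example of the two firstprivate helpers above (values invented): for a char (alignment 1, size 1) followed by a double (alignment 8, size 8), calculate_firstprivate_requirements leaves tgt_align at 8 and computes tgt_size as 1, rounded up to 8 before adding 8, i.e. 16 bytes in total; copy_firstprivate_data then aligns the alloca'd block to 8, copies the char at offset 0 and the double at offset 8 using the same rounding, and rewrites the hostaddrs[] entries to point into the packed block.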
+
+/* Called when encountering a target directive.  If DEVICE
+   is GOMP_DEVICE_ICV, it means use device-var ICV.  If it is
+   GOMP_DEVICE_HOST_FALLBACK (or any value larger than the last available
+   hw device), use host fallback.
+   FN is address of host code, UNUSED is part of the current ABI, but
+   we're not actually using it.  HOSTADDRS, SIZES and KINDS are arrays
+   with MAPNUM entries, with addresses of the host objects,
+   sizes of the host objects (resp. for pointer kind pointer bias
+   and assumed sizeof (void *) size) and kinds.  */
+
+void
+GOMP_target (int device, void (*fn) (void *), const void *unused,
+             size_t mapnum, void **hostaddrs, size_t *sizes,
+             unsigned char *kinds)
+{
+  return gomp_target_fallback (fn, hostaddrs);
+}
+
+/* Like GOMP_target, but KINDS is 16-bit, UNUSED is no longer present,
+   and several arguments have been added:
+   FLAGS is a bitmask, see GOMP_TARGET_FLAG_* in gomp-constants.h.
+   DEPEND is array of dependencies, see GOMP_task for details.
+
+   ARGS is a pointer to an array consisting of a variable number of both
+   device-independent and device-specific arguments, which can take one or
+   two elements where the first specifies for which device it is intended,
+   the type and optionally also the value.  If the value is not present in
+   the first one, the whole second element is the actual value.  The last
+   element of the array is a single NULL.  Among the device-independent
+   ones are, for example, NUM_TEAMS and THREAD_LIMIT.
+
+   NUM_TEAMS is positive if GOMP_teams will be called in the body with
+   that value, or 1 if the teams construct is not present, or 0 if the
+   teams construct does not have a num_teams clause and so the choice is
+   implementation defined, and -1 if it can't be determined on the host
+   what value GOMP_teams will have on the device.
+   THREAD_LIMIT similarly is positive if GOMP_teams will be called in the
+   body with that value, or 0 if the teams construct does not have a
+   thread_limit clause or the teams construct is not present, or -1 if it
+   can't be determined on the host what value GOMP_teams will have on the
+   device.  */
+
+void
+GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
+                 void **hostaddrs, size_t *sizes, unsigned short *kinds,
+                 unsigned int flags, void **depend, void **args)
+{
+  size_t tgt_align = 0, tgt_size = 0;
+  bool fpc_done = false;
+
+  if (flags & GOMP_TARGET_FLAG_NOWAIT)
+    {
+      struct gomp_thread *thr = gomp_thread ();
+      if (thr->ts.team
+          && !thr->task->final_task)
+        {
+          gomp_create_target_task (NULL, fn, mapnum, hostaddrs,
+                                   sizes, kinds, flags, depend, args,
+                                   GOMP_TARGET_TASK_BEFORE_MAP);
+          return;
+        }
+    }
+
+  /* If there are depend clauses, but nowait is not present
+     (or we are in a final task), block the parent task until the
+     dependencies are resolved and then just continue with the rest
+     of the function as if it is a merged task.  */
+  if (depend != NULL)
+    {
+      struct gomp_thread *thr = gomp_thread ();
+      if (thr->task && thr->task->depend_hash)
+        {
+          /* If we might need to wait, copy firstprivate now.  */
+          calculate_firstprivate_requirements (mapnum, sizes, kinds,
+                                               &tgt_align, &tgt_size);
+          if (tgt_align)
+            {
+              char *tgt = gomp_alloca (tgt_size + tgt_align - 1);
+              copy_firstprivate_data (tgt, mapnum, hostaddrs, sizes, kinds,
+                                      tgt_align, tgt_size);
+            }
+          fpc_done = true;
+          gomp_task_maybe_wait_for_dependencies (depend);
+        }
+    }
+
+  if (!fpc_done)
+    {
+      calculate_firstprivate_requirements (mapnum, sizes, kinds,
+                                           &tgt_align, &tgt_size);
+      if (tgt_align)
+        {
+          char *tgt = gomp_alloca (tgt_size + tgt_align - 1);
+          copy_firstprivate_data (tgt, mapnum, hostaddrs, sizes, kinds,
+                                  tgt_align, tgt_size);
+        }
+    }
+  gomp_target_fallback (fn, hostaddrs);
+}
+
+/* Host fallback for GOMP_target_data{,_ext} routines.  */
+
+static void
+gomp_target_data_fallback (void)
+{
+}
+
 void
-GOMP_target_data (int device, const void *openmp_target, size_t mapnum,
+GOMP_target_data (int device, const void *unused, size_t mapnum,
 		  void **hostaddrs, size_t *sizes, unsigned char *kinds)
 {
+  return gomp_target_data_fallback ();
+}
+
+void
+GOMP_target_data_ext (int device, size_t mapnum, void **hostaddrs,
+                      size_t *sizes, unsigned short *kinds)
+{
+  return gomp_target_data_fallback ();
 }
 
 void
@@ -78,12 +308,112 @@ GOMP_target_end_data (void)
 {
 }
 
 void
-GOMP_target_update (int device, const void *openmp_target, size_t mapnum,
+GOMP_target_update (int device, const void *unused, size_t mapnum,
 		    void **hostaddrs, size_t *sizes, unsigned char *kinds)
 {
 }
 
 void
+GOMP_target_update_ext (int device, size_t mapnum, void **hostaddrs,
+                        size_t *sizes, unsigned short *kinds,
+                        unsigned int flags, void **depend)
+{
+  /* If there are depend clauses, but nowait is not present,
+     block the parent task until the dependencies are resolved
+     and then just continue with the rest of the function as if it
+     is a merged task.  Until we are able to schedule tasks during
+     variable mapping or unmapping, ignore nowait if depend clauses
+     are not present.  */
+  if (depend != NULL)
+    {
+      struct gomp_thread *thr = gomp_thread ();
+      if (thr->task && thr->task->depend_hash)
+        {
+          if ((flags & GOMP_TARGET_FLAG_NOWAIT)
+              && thr->ts.team
+              && !thr->task->final_task)
+            {
+              if (gomp_create_target_task (NULL, (void (*) (void *)) NULL,
+                                           mapnum, hostaddrs, sizes, kinds,
+                                           flags | GOMP_TARGET_FLAG_UPDATE,
+                                           depend, NULL,
+                                           GOMP_TARGET_TASK_DATA))
+                return;
+            }
+          else
+            {
+              struct gomp_team *team = thr->ts.team;
+              /* If parallel or taskgroup has been cancelled, don't start new
+                 tasks.  */
+              if (team
+                  && (gomp_team_barrier_cancelled (&team->barrier)
+                      || (thr->task->taskgroup
+                          && thr->task->taskgroup->cancelled)))
+                return;
+
+              gomp_task_maybe_wait_for_dependencies (depend);
+            }
+        }
+    }
+}
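GOMP_target_ext's flag and depend handling above corresponds to target constructs of roughly the following shape: with nowait a target task is created before any mapping, while with only depend present the firstprivate block is captured before waiting on the dependencies so the wait cannot invalidate the captured host values. A sketch; the names and sizes are illustrative, not from the patch:

    void
    async_offload (int *x, int n)
    {
    #pragma omp target map(tofrom: x[0:n]) depend(inout: x[0]) nowait
      {
        for (int i = 0; i < n; i++)
          x[i] += 1;
      }
    #pragma omp taskwait
    }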
+
+void
+GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs,
+                             size_t *sizes, unsigned short *kinds,
+                             unsigned int flags, void **depend)
+{
+  /* If there are depend clauses, but nowait is not present,
+     block the parent task until the dependencies are resolved
+     and then just continue with the rest of the function as if it
+     is a merged task.  Until we are able to schedule tasks during
+     variable mapping or unmapping, ignore nowait if depend clauses
+     are not present.  */
+  if (depend != NULL)
+    {
+      struct gomp_thread *thr = gomp_thread ();
+      if (thr->task && thr->task->depend_hash)
+        {
+          if ((flags & GOMP_TARGET_FLAG_NOWAIT)
+              && thr->ts.team
+              && !thr->task->final_task)
+            {
+              if (gomp_create_target_task (NULL, (void (*) (void *)) NULL,
+                                           mapnum, hostaddrs, sizes, kinds,
+                                           flags, depend, NULL,
+                                           GOMP_TARGET_TASK_DATA))
+                return;
+            }
+          else
+            {
+              struct gomp_team *team = thr->ts.team;
+              /* If parallel or taskgroup has been cancelled, don't start new
+                 tasks.  */
+              if (team
+                  && (gomp_team_barrier_cancelled (&team->barrier)
+                      || (thr->task->taskgroup
+                          && thr->task->taskgroup->cancelled)))
+                return;
+
+              gomp_task_maybe_wait_for_dependencies (depend);
+            }
+        }
+    }
+}
+
+bool
+gomp_target_task_fn (void *data)
+{
+  struct gomp_target_task *ttask = (struct gomp_target_task *) data;
+
+  if (ttask->fn != NULL)
+    {
+      ttask->state = GOMP_TARGET_TASK_FALLBACK;
+      gomp_target_fallback (ttask->fn, ttask->hostaddrs);
+      return false;
+    }
+  return false;
+}
+
+void
 GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
 {
   if (thread_limit)
@@ -94,3 +424,153 @@ GOMP_teams (unsigned int num_teams, unsi
     }
   (void) num_teams;
 }
+
+void *
+omp_target_alloc (size_t size, int device_num)
+{
+  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+    return malloc (size);
+
+  return NULL;
+}
+
+void
+omp_target_free (void *device_ptr, int device_num)
+{
+  if (device_ptr == NULL)
+    return;
+
+  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+    {
+      free (device_ptr);
+      return;
+    }
+}
+
+int
+omp_target_is_present (void *ptr, int device_num)
+{
+  if (ptr == NULL)
+    return 1;
+
+  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+    return 1;
+
+  return 0;
+}
+
+int
+omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset,
+                   size_t src_offset, int dst_device_num, int src_device_num)
+{
+  if (dst_device_num != GOMP_DEVICE_HOST_FALLBACK)
+    return EINVAL;
+  if (src_device_num != GOMP_DEVICE_HOST_FALLBACK)
+    return EINVAL;
+  memcpy ((char *) dst + dst_offset, (char *) src + src_offset, length);
+  return 0;
+}
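In this host-only fallback the device memory routines above degrade to plain malloc/free/memcpy when device_num is GOMP_DEVICE_HOST_FALLBACK and report EINVAL (or NULL) otherwise. A round-trip usage sketch, assuming — as the rest of this patch suggests but does not show here — that omp_get_initial_device designates the host-fallback device:

    #include <omp.h>

    int
    copy_roundtrip (void)
    {
      int host = omp_get_initial_device ();
      double src[4] = { 1.0, 2.0, 3.0, 4.0 };
      double *buf = omp_target_alloc (sizeof src, host);
      if (buf == NULL)
        return -1;
      /* dst, src, length, dst_offset, src_offset, dst_dev, src_dev.  */
      int err = omp_target_memcpy (buf, src, sizeof src, 0, 0, host, host);
      omp_target_free (buf, host);
      return err;
    }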
+
+#define HALF_SIZE_T (((size_t) 1) << (8 * sizeof (size_t) / 2))
+
+#define __builtin_mul_overflow(x, y, z) \
+  ({ bool retval = false; \
+     size_t xval = (x); \
+     size_t yval = (y); \
+     size_t zval = xval * yval; \
+     if (__builtin_expect ((xval | yval) >= HALF_SIZE_T, 0)) \
+       { \
+         if (xval && zval / xval != yval) \
+           retval = true; \
+       } \
+     *(z) = zval; \
+     retval; })
+
+static int
+omp_target_memcpy_rect_worker (void *dst, void *src, size_t element_size,
+                               int num_dims, const size_t *volume,
+                               const size_t *dst_offsets,
+                               const size_t *src_offsets,
+                               const size_t *dst_dimensions,
+                               const size_t *src_dimensions)
+{
+  size_t dst_slice = element_size;
+  size_t src_slice = element_size;
+  size_t j, dst_off, src_off, length;
+  int i, ret;
+
+  if (num_dims == 1)
+    {
+      if (__builtin_mul_overflow (element_size, volume[0], &length)
+          || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off)
+          || __builtin_mul_overflow (element_size, src_offsets[0], &src_off))
+        return EINVAL;
+      memcpy ((char *) dst + dst_off, (char *) src + src_off, length);
+      ret = 1;
+      return ret ? 0 : EINVAL;
+    }
+
+  /* FIXME: it would be nice to have some plugin function to handle
+     num_dims == 2 and num_dims == 3 more efficiently.  Larger ones can
+     be handled in the generic recursion below, and for host-host it
+     should be used even for any num_dims >= 2.  */
+
+  for (i = 1; i < num_dims; i++)
+    if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice)
+        || __builtin_mul_overflow (src_slice, src_dimensions[i], &src_slice))
+      return EINVAL;
+  if (__builtin_mul_overflow (dst_slice, dst_offsets[0], &dst_off)
+      || __builtin_mul_overflow (src_slice, src_offsets[0], &src_off))
+    return EINVAL;
+  for (j = 0; j < volume[0]; j++)
+    {
+      ret = omp_target_memcpy_rect_worker ((char *) dst + dst_off,
+                                           (char *) src + src_off,
+                                           element_size, num_dims - 1,
+                                           volume + 1, dst_offsets + 1,
+                                           src_offsets + 1, dst_dimensions + 1,
+                                           src_dimensions + 1);
+      if (ret)
+        return ret;
+      dst_off += dst_slice;
+      src_off += src_slice;
+    }
+  return 0;
+}
+
+int
+omp_target_memcpy_rect (void *dst, void *src, size_t element_size,
+                        int num_dims, const size_t *volume,
+                        const size_t *dst_offsets,
+                        const size_t *src_offsets,
+                        const size_t *dst_dimensions,
+                        const size_t *src_dimensions,
+                        int dst_device_num, int src_device_num)
+{
+  if (!dst && !src)
+    return INT_MAX;
+
+  if (dst_device_num != GOMP_DEVICE_HOST_FALLBACK)
+    return EINVAL;
+  if (src_device_num != GOMP_DEVICE_HOST_FALLBACK)
+    return EINVAL;
+
+  int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims,
+                                           volume, dst_offsets, src_offsets,
+                                           dst_dimensions, src_dimensions);
+  return ret;
+}
+
+int
+omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size,
+                          size_t device_offset, int device_num)
+{
+  return EINVAL;
+}
+
+int
+omp_target_disassociate_ptr (void *ptr, int device_num)
+{
+  return EINVAL;
+}
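omp_target_memcpy_rect above recurses one dimension at a time, multiplying out slice sizes with overflow checking (calling it with dst == src == NULL queries the maximum supported dimensionality, here INT_MAX). A sketch of a 2-D sub-rectangle copy between two host arrays; all geometry values are illustrative, and the host device number assumption is the same as in the previous example:

    #include <omp.h>

    int
    copy_tile (void)
    {
      int host = omp_get_initial_device ();
      double src[8][8] = { { 0 } }, dst[8][8] = { { 0 } };
      size_t volume[2] = { 4, 4 };    /* Copy a 4x4 tile...  */
      size_t src_off[2] = { 2, 2 };   /* ...starting at src[2][2]...  */
      size_t dst_off[2] = { 0, 0 };   /* ...into the top-left corner.  */
      size_t dims[2] = { 8, 8 };
      return omp_target_memcpy_rect (dst, src, sizeof (double), 2, volume,
                                     dst_off, src_off, dims, dims,
                                     host, host);
    }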
--- libgomp/fortran.c.jj	2014-05-15 10:56:31.593531223 +0200
+++ libgomp/fortran.c	2016-07-13 16:57:04.432535397 +0200
@@ -67,12 +67,20 @@ ialias_redirect (omp_get_active_level)
 ialias_redirect (omp_in_final)
 ialias_redirect (omp_get_cancellation)
 ialias_redirect (omp_get_proc_bind)
+ialias_redirect (omp_get_num_places)
+ialias_redirect (omp_get_place_num_procs)
+ialias_redirect (omp_get_place_proc_ids)
+ialias_redirect (omp_get_place_num)
+ialias_redirect (omp_get_partition_num_places)
+ialias_redirect (omp_get_partition_place_nums)
 ialias_redirect (omp_set_default_device)
 ialias_redirect (omp_get_default_device)
 ialias_redirect (omp_get_num_devices)
 ialias_redirect (omp_get_num_teams)
 ialias_redirect (omp_get_team_num)
 ialias_redirect (omp_is_initial_device)
+ialias_redirect (omp_get_initial_device)
+ialias_redirect (omp_get_max_task_priority)
 #endif
 
 #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING
@@ -342,35 +350,35 @@ omp_get_wtime_ (void)
 }
 
 void
-omp_set_schedule_ (const int32_t *kind, const int32_t *modifier)
+omp_set_schedule_ (const int32_t *kind, const int32_t *chunk_size)
 {
-  omp_set_schedule (*kind, *modifier);
+  omp_set_schedule (*kind, *chunk_size);
 }
 
 void
-omp_set_schedule_8_ (const int32_t *kind, const int64_t *modifier)
+omp_set_schedule_8_ (const int32_t *kind, const int64_t *chunk_size)
 {
-  omp_set_schedule (*kind, TO_INT (*modifier));
+  omp_set_schedule (*kind, TO_INT (*chunk_size));
 }
 
 void
-omp_get_schedule_ (int32_t *kind, int32_t *modifier)
+omp_get_schedule_ (int32_t *kind, int32_t *chunk_size)
 {
   omp_sched_t k;
-  int m;
-  omp_get_schedule (&k, &m);
+  int cs;
+  omp_get_schedule (&k, &cs);
   *kind = k;
-  *modifier = m;
+  *chunk_size = cs;
 }
 
 void
-omp_get_schedule_8_ (int32_t *kind, int64_t *modifier)
+omp_get_schedule_8_ (int32_t *kind, int64_t *chunk_size)
 {
   omp_sched_t k;
-  int m;
-  omp_get_schedule (&k, &m);
+  int cs;
+  omp_get_schedule (&k, &cs);
   *kind = k;
-  *modifier = m;
+  *chunk_size = cs;
 }
 
 int32_t
@@ -451,6 +459,69 @@ omp_get_proc_bind_ (void)
   return omp_get_proc_bind ();
 }
 
+int32_t
+omp_get_num_places_ (void)
+{
+  return omp_get_num_places ();
+}
+
+int32_t
+omp_get_place_num_procs_ (const int32_t *place_num)
+{
+  return omp_get_place_num_procs (*place_num);
+}
+
+int32_t
+omp_get_place_num_procs_8_ (const int64_t *place_num)
+{
+  return omp_get_place_num_procs (TO_INT (*place_num));
+}
+
+void
+omp_get_place_proc_ids_ (const int32_t *place_num, int32_t *ids)
+{
+  omp_get_place_proc_ids (*place_num, (int *) ids);
+}
+
+void
+omp_get_place_proc_ids_8_ (const int64_t *place_num, int64_t *ids)
+{
+  gomp_get_place_proc_ids_8 (TO_INT (*place_num), ids);
+}
+
+int32_t
+omp_get_place_num_ (void)
+{
+  return omp_get_place_num ();
+}
+
+int32_t
+omp_get_partition_num_places_ (void)
+{
+  return omp_get_partition_num_places ();
+}
+
+void
+omp_get_partition_place_nums_ (int32_t *place_nums)
+{
+  omp_get_partition_place_nums ((int *) place_nums);
+}
+
+void
+omp_get_partition_place_nums_8_ (int64_t *place_nums)
+{
+  if (gomp_places_list == NULL)
+    return;
+
+  struct gomp_thread *thr = gomp_thread ();
+  if (thr->place == 0)
+    gomp_init_affinity ();
+
+  unsigned int i;
+  for (i = 0; i < thr->ts.place_partition_len; i++)
+    *place_nums++ = (int64_t) thr->ts.place_partition_off + i;
+}
+
 void
 omp_set_default_device_ (const int32_t *device_num)
 {
@@ -492,3 +563,15 @@ omp_is_initial_device_ (void)
 {
   return omp_is_initial_device ();
 }
+
+int32_t
+omp_get_initial_device_ (void)
+{
+  return omp_get_initial_device ();
+}
+
+int32_t
+omp_get_max_task_priority_ (void)
+{
+  return omp_get_max_task_priority ();
+}
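The Fortran wrappers above track the OpenMP 4.5 rename of the second omp_set_schedule/omp_get_schedule argument from "modifier" to "chunk_size" (matching the run_sched_chunk_size ICV rename throughout this patch); the behavior is unchanged. For instance, from C:

    #include <omp.h>

    void
    pick_schedule (void)
    {
      omp_set_schedule (omp_sched_dynamic, 16);  /* chunk_size, not "modifier".  */

      omp_sched_t kind;
      int chunk_size;
      omp_get_schedule (&kind, &chunk_size);     /* omp_sched_dynamic, 16.  */
    }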
--- libgomp/libgomp.map.jj	2014-05-15 10:56:31.927533549 +0200
+++ libgomp/libgomp.map	2016-07-13 16:57:04.434535373 +0200
@@ -134,6 +134,36 @@ OMP_4.0 {
 	omp_is_initial_device_;
 } OMP_3.1;
 
+OMP_4.5 {
+  global:
+	omp_get_max_task_priority;
+	omp_get_max_task_priority_;
+	omp_get_num_places;
+	omp_get_num_places_;
+	omp_get_place_num_procs;
+	omp_get_place_num_procs_;
+	omp_get_place_num_procs_8_;
+	omp_get_place_proc_ids;
+	omp_get_place_proc_ids_;
+	omp_get_place_proc_ids_8_;
+	omp_get_place_num;
+	omp_get_place_num_;
+	omp_get_partition_num_places;
+	omp_get_partition_num_places_;
+	omp_get_partition_place_nums;
+	omp_get_partition_place_nums_;
+	omp_get_partition_place_nums_8_;
+	omp_get_initial_device;
+	omp_get_initial_device_;
+	omp_target_alloc;
+	omp_target_free;
+	omp_target_is_present;
+	omp_target_memcpy;
+	omp_target_memcpy_rect;
+	omp_target_associate_ptr;
+	omp_target_disassociate_ptr;
+} OMP_4.0;
+
 GOMP_1.0 {
   global:
 	GOMP_atomic_end;
@@ -227,3 +257,158 @@ GOMP_4.0 {
 	GOMP_target_update;
 	GOMP_teams;
 } GOMP_3.0;
+
+GOMP_4.0.1 {
+  global:
+	GOMP_offload_register;
+	GOMP_offload_unregister;
+} GOMP_4.0;
+
+GOMP_4.5 {
+  global:
+	GOMP_target_ext;
+	GOMP_target_data_ext;
+	GOMP_target_update_ext;
+	GOMP_target_enter_exit_data;
+	GOMP_taskloop;
+	GOMP_taskloop_ull;
+	GOMP_offload_register_ver;
+	GOMP_offload_unregister_ver;
+	GOMP_loop_doacross_dynamic_start;
+	GOMP_loop_doacross_guided_start;
+	GOMP_loop_doacross_runtime_start;
+	GOMP_loop_doacross_static_start;
+	GOMP_doacross_post;
+	GOMP_doacross_wait;
+	GOMP_loop_ull_doacross_dynamic_start;
+	GOMP_loop_ull_doacross_guided_start;
+	GOMP_loop_ull_doacross_runtime_start;
+	GOMP_loop_ull_doacross_static_start;
+	GOMP_doacross_ull_post;
+	GOMP_doacross_ull_wait;
+	GOMP_loop_nonmonotonic_dynamic_next;
+	GOMP_loop_nonmonotonic_dynamic_start;
+	GOMP_loop_nonmonotonic_guided_next;
+	GOMP_loop_nonmonotonic_guided_start;
+	GOMP_loop_ull_nonmonotonic_dynamic_next;
+	GOMP_loop_ull_nonmonotonic_dynamic_start;
+	GOMP_loop_ull_nonmonotonic_guided_next;
+	GOMP_loop_ull_nonmonotonic_guided_start;
+	GOMP_parallel_loop_nonmonotonic_dynamic;
+	GOMP_parallel_loop_nonmonotonic_guided;
+} GOMP_4.0.1;
+
+OACC_2.0 {
+  global:
+	acc_get_num_devices;
+	acc_get_num_devices_h_;
+	acc_set_device_type;
+	acc_set_device_type_h_;
+	acc_get_device_type;
+	acc_get_device_type_h_;
+	acc_set_device_num;
+	acc_set_device_num_h_;
+	acc_get_device_num;
+	acc_get_device_num_h_;
+	acc_async_test;
+	acc_async_test_h_;
+	acc_async_test_all;
+	acc_async_test_all_h_;
+	acc_wait;
+	acc_wait_h_;
+	acc_wait_async;
+	acc_wait_async_h_;
+	acc_wait_all;
+	acc_wait_all_h_;
+	acc_wait_all_async;
+	acc_wait_all_async_h_;
+	acc_init;
+	acc_init_h_;
+	acc_shutdown;
+	acc_shutdown_h_;
+	acc_on_device;
+	acc_on_device_h_;
+	acc_malloc;
+	acc_free;
+	acc_copyin;
+	acc_copyin_32_h_;
+	acc_copyin_64_h_;
+	acc_copyin_array_h_;
+	acc_present_or_copyin;
+	acc_present_or_copyin_32_h_;
+	acc_present_or_copyin_64_h_;
+	acc_present_or_copyin_array_h_;
+	acc_create;
+	acc_create_32_h_;
+	acc_create_64_h_;
+	acc_create_array_h_;
+	acc_present_or_create;
+	acc_present_or_create_32_h_;
+	acc_present_or_create_64_h_;
+	acc_present_or_create_array_h_;
+	acc_copyout;
+	acc_copyout_32_h_;
+	acc_copyout_64_h_;
+	acc_copyout_array_h_;
+	acc_delete;
+	acc_delete_32_h_;
+	acc_delete_64_h_;
+	acc_delete_array_h_;
+	acc_update_device;
+	acc_update_device_32_h_;
+	acc_update_device_64_h_;
+	acc_update_device_array_h_;
+	acc_update_self;
+	acc_update_self_32_h_;
+	acc_update_self_64_h_;
+	acc_update_self_array_h_;
+	acc_map_data;
+	acc_unmap_data;
+	acc_deviceptr;
+	acc_hostptr;
+	acc_is_present;
+	acc_is_present_32_h_;
+	acc_is_present_64_h_;
+	acc_is_present_array_h_;
+	acc_memcpy_to_device;
+	acc_memcpy_from_device;
+	acc_get_current_cuda_device;
+	acc_get_current_cuda_context;
+	acc_get_cuda_stream;
+	acc_set_cuda_stream;
+};
+
+GOACC_2.0 {
+  global:
+	GOACC_data_end;
+	GOACC_data_start;
+	GOACC_enter_exit_data;
+	GOACC_parallel;
+	GOACC_update;
+	GOACC_wait;
+	GOACC_get_thread_num;
+	GOACC_get_num_threads;
+};
+
+GOACC_2.0.1 {
+  global:
+	GOACC_declare;
+	GOACC_parallel_keyed;
+} GOACC_2.0;
+
+GOMP_PLUGIN_1.0 {
+  global:
+	GOMP_PLUGIN_malloc;
+	GOMP_PLUGIN_malloc_cleared;
+	GOMP_PLUGIN_realloc;
+	GOMP_PLUGIN_debug;
+	GOMP_PLUGIN_error;
+	GOMP_PLUGIN_fatal;
+	GOMP_PLUGIN_async_unmap_vars;
+	GOMP_PLUGIN_acc_thread;
+};
+
+GOMP_PLUGIN_1.1 {
+  global:
+	GOMP_PLUGIN_target_task_completion;
+} GOMP_PLUGIN_1.0;
--- libgomp/ordered.c.jj	2013-01-21 16:00:46.137873657 +0100
+++ libgomp/ordered.c	2016-07-13 16:57:18.918355780 +0200
@@ -25,6 +25,9 @@
 /* This file handles the ORDERED construct.  */
 
 #include "libgomp.h"
+#include <stdarg.h>
+#include <string.h>
+#include "doacross.h"
 
 /* This function is called when first allocating an iteration block.  That
@@ -249,3 +252,533 @@ void
 GOMP_ordered_end (void)
 {
 }
+
+/* DOACROSS initialization.  */
+
+#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
+
+void
+gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_work_share *ws = thr->ts.work_share;
+  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
+  unsigned long ent, num_ents, elt_sz, shift_sz;
+  struct gomp_doacross_work_share *doacross;
+
+  if (team == NULL || team->nthreads == 1)
+    return;
+
+  for (i = 0; i < ncounts; i++)
+    {
+      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
+      if (counts[i] == 0)
+        return;
+
+      if (num_bits <= MAX_COLLAPSED_BITS)
+        {
+          unsigned int this_bits;
+          if (counts[i] == 1)
+            this_bits = 1;
+          else
+            this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
+                        - __builtin_clzl (counts[i] - 1);
+          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
+            {
+              bits[i] = this_bits;
+              num_bits += this_bits;
+            }
+          else
+            num_bits = MAX_COLLAPSED_BITS + 1;
+        }
+    }
+
+  if (ws->sched == GFS_STATIC)
+    num_ents = team->nthreads;
+  else if (ws->sched == GFS_GUIDED)
+    num_ents = counts[0];
+  else
+    num_ents = (counts[0] - 1) / chunk_size + 1;
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      elt_sz = sizeof (unsigned long);
+      shift_sz = ncounts * sizeof (unsigned int);
+    }
+  else
+    {
+      elt_sz = sizeof (unsigned long) * ncounts;
+      shift_sz = 0;
+    }
+  elt_sz = (elt_sz + 63) & ~63UL;
+
+  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
+                          + shift_sz);
+  doacross->chunk_size = chunk_size;
+  doacross->elt_sz = elt_sz;
+  doacross->ncounts = ncounts;
+  doacross->flattened = false;
+  doacross->array = (unsigned char *)
+                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
+                     & ~(uintptr_t) 63);
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      unsigned int shift_count = 0;
+      doacross->flattened = true;
+      for (i = ncounts; i > 0; i--)
+        {
+          doacross->shift_counts[i - 1] = shift_count;
+          shift_count += bits[i - 1];
+        }
+      for (ent = 0; ent < num_ents; ent++)
+        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
+    }
+  else
+    for (ent = 0; ent < num_ents; ent++)
+      memset (doacross->array + ent * elt_sz, '\0',
+              sizeof (unsigned long) * ncounts);
+  if (ws->sched == GFS_STATIC && chunk_size == 0)
+    {
+      unsigned long q = counts[0] / num_ents;
+      unsigned long t = counts[0] % num_ents;
+      doacross->boundary = t * (q + 1);
+      doacross->q = q;
+      doacross->t = t;
+    }
+  ws->doacross = doacross;
+}
+
+/* DOACROSS POST operation.  */
+
+void
+GOMP_doacross_post (long *counts)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
+  unsigned long ent;
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    ent = thr->ts.team_id;
+  else if (ws->sched == GFS_GUIDED)
+    ent = counts[0];
+  else
+    ent = counts[0] / doacross->chunk_size;
+  unsigned long *array = (unsigned long *) (doacross->array
+                                            + ent * doacross->elt_sz);
+
+  if (__builtin_expect (doacross->flattened, 1))
+    {
+      unsigned long flattened
+        = (unsigned long) counts[0] << doacross->shift_counts[0];
+
+      for (i = 1; i < doacross->ncounts; i++)
+        flattened |= (unsigned long) counts[i]
+                     << doacross->shift_counts[i];
+      flattened++;
+      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
+        __atomic_thread_fence (MEMMODEL_RELEASE);
+      else
+        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
+      return;
+    }
+
+  __atomic_thread_fence (MEMMODEL_ACQUIRE);
+  for (i = doacross->ncounts; i-- > 0; )
+    {
+      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
+        __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
+    }
+}
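When all iteration counts fit in one unsigned long, gomp_doacross_init above packs the collapsed iteration vector into a single word: count i gets ceil(log2(counts[i])) bits (computed via __builtin_clzl) and shift_counts[] records each field's offset, so GOMP_doacross_post can publish a whole iteration vector with one atomic store. As a worked example with invented counts {1024, 64}: the first index needs 10 bits and the second 6, shift_counts becomes {6, 0}, so the flattened value is (i << 6) | j, incremented by 1 so that a stored 0 still means "nothing posted yet".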
+
+/* DOACROSS WAIT operation.  */
+
+void
+GOMP_doacross_wait (long first, ...)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
+  va_list ap;
+  unsigned long ent;
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    {
+      if (ws->chunk_size == 0)
+        {
+          if (first < doacross->boundary)
+            ent = first / (doacross->q + 1);
+          else
+            ent = (first - doacross->boundary) / doacross->q
+                  + doacross->t;
+        }
+      else
+        ent = first / ws->chunk_size % thr->ts.team->nthreads;
+    }
+  else if (ws->sched == GFS_GUIDED)
+    ent = first;
+  else
+    ent = first / doacross->chunk_size;
+  unsigned long *array = (unsigned long *) (doacross->array
+                                            + ent * doacross->elt_sz);
+
+  if (__builtin_expect (doacross->flattened, 1))
+    {
+      unsigned long flattened
+        = (unsigned long) first << doacross->shift_counts[0];
+      unsigned long cur;
+
+      va_start (ap, first);
+      for (i = 1; i < doacross->ncounts; i++)
+        flattened |= (unsigned long) va_arg (ap, long)
+                     << doacross->shift_counts[i];
+      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
+      if (flattened < cur)
+        {
+          __atomic_thread_fence (MEMMODEL_RELEASE);
+          va_end (ap);
+          return;
+        }
+      doacross_spin (array, flattened, cur);
+      __atomic_thread_fence (MEMMODEL_RELEASE);
+      va_end (ap);
+      return;
+    }
+
+  do
+    {
+      va_start (ap, first);
+      for (i = 0; i < doacross->ncounts; i++)
+        {
+          unsigned long thisv
+            = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
+          unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
+          if (thisv < cur)
+            {
+              i = doacross->ncounts;
+              break;
+            }
+          if (thisv > cur)
+            break;
+        }
+      va_end (ap);
+      if (i == doacross->ncounts)
+        break;
+      cpu_relax ();
+    }
+  while (1);
+  __sync_synchronize ();
+}
+
+typedef unsigned long long gomp_ull;
+
+void
+gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
+                        gomp_ull chunk_size)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_work_share *ws = thr->ts.work_share;
+  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
+  unsigned long ent, num_ents, elt_sz, shift_sz;
+  struct gomp_doacross_work_share *doacross;
+
+  if (team == NULL || team->nthreads == 1)
+    return;
+
+  for (i = 0; i < ncounts; i++)
+    {
+      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
+      if (counts[i] == 0)
+        return;
+
+      if (num_bits <= MAX_COLLAPSED_BITS)
+        {
+          unsigned int this_bits;
+          if (counts[i] == 1)
+            this_bits = 1;
+          else
+            this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
+                        - __builtin_clzll (counts[i] - 1);
+          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
+            {
+              bits[i] = this_bits;
+              num_bits += this_bits;
+            }
+          else
+            num_bits = MAX_COLLAPSED_BITS + 1;
+        }
+    }
+
+  if (ws->sched == GFS_STATIC)
+    num_ents = team->nthreads;
+  else if (ws->sched == GFS_GUIDED)
+    num_ents = counts[0];
+  else
+    num_ents = (counts[0] - 1) / chunk_size + 1;
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      elt_sz = sizeof (unsigned long);
+      shift_sz = ncounts * sizeof (unsigned int);
+    }
+  else
+    {
+      if (sizeof (gomp_ull) == sizeof (unsigned long))
+        elt_sz = sizeof (gomp_ull) * ncounts;
+      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
+        elt_sz = sizeof (unsigned long) * 2 * ncounts;
+      else
+        abort ();
+      shift_sz = 0;
+    }
+  elt_sz = (elt_sz + 63) & ~63UL;
+
+  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
+                          + shift_sz);
+  doacross->chunk_size_ull = chunk_size;
+  doacross->elt_sz = elt_sz;
+  doacross->ncounts = ncounts;
+  doacross->flattened = false;
+  doacross->boundary = 0;
+  doacross->array = (unsigned char *)
+                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
+                     & ~(uintptr_t) 63);
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      unsigned int shift_count = 0;
+      doacross->flattened = true;
+      for (i = ncounts; i > 0; i--)
+        {
+          doacross->shift_counts[i - 1] = shift_count;
+          shift_count += bits[i - 1];
+        }
+      for (ent = 0; ent < num_ents; ent++)
+        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
+    }
+  else
+    for (ent = 0; ent < num_ents; ent++)
+      memset (doacross->array + ent * elt_sz, '\0',
+              sizeof (unsigned long) * ncounts);
+  if (ws->sched == GFS_STATIC && chunk_size == 0)
+    {
+      gomp_ull q = counts[0] / num_ents;
+      gomp_ull t = counts[0] % num_ents;
+      doacross->boundary_ull = t * (q + 1);
+      doacross->q_ull = q;
+      doacross->t = t;
+    }
+  ws->doacross = doacross;
+}
+
+/* DOACROSS POST operation.  */
+
+void
+GOMP_doacross_ull_post (gomp_ull *counts)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
+  unsigned long ent;
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    ent = thr->ts.team_id;
+  else if (ws->sched == GFS_GUIDED)
+    ent = counts[0];
+  else
+    ent = counts[0] / doacross->chunk_size_ull;
+
+  if (__builtin_expect (doacross->flattened, 1))
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+                                                + ent * doacross->elt_sz);
+      gomp_ull flattened
+        = counts[0] << doacross->shift_counts[0];
+
+      for (i = 1; i < doacross->ncounts; i++)
+        flattened |= counts[i] << doacross->shift_counts[i];
+      flattened++;
+      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
+        __atomic_thread_fence (MEMMODEL_RELEASE);
+      else
+        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
+      return;
+    }
+
+  __atomic_thread_fence (MEMMODEL_ACQUIRE);
+  if (sizeof (gomp_ull) == sizeof (unsigned long))
+    {
+      gomp_ull *array = (gomp_ull *) (doacross->array
+                                      + ent * doacross->elt_sz);
+
+      for (i = doacross->ncounts; i-- > 0; )
+        {
+          if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
+            __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
+        }
+    }
+  else
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+                                                + ent * doacross->elt_sz);
+
+      for (i = doacross->ncounts; i-- > 0; )
+        {
+          gomp_ull cull = counts[i] + 1UL;
+          unsigned long c = (unsigned long) cull;
+          if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
+            __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
+          c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
+          if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
+            __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
+        }
+    }
+}
+
+/* DOACROSS WAIT operation.  */
+
+void
+GOMP_doacross_ull_wait (gomp_ull first, ...)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
+  va_list ap;
+  unsigned long ent;
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    {
+      if (ws->chunk_size_ull == 0)
+        {
+          if (first < doacross->boundary_ull)
+            ent = first / (doacross->q_ull + 1);
+          else
+            ent = (first - doacross->boundary_ull) / doacross->q_ull
+                  + doacross->t;
+        }
+      else
+        ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
+    }
+  else if (ws->sched == GFS_GUIDED)
+    ent = first;
+  else
+    ent = first / doacross->chunk_size_ull;
+
+  if (__builtin_expect (doacross->flattened, 1))
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+                                                + ent * doacross->elt_sz);
+      gomp_ull flattened = first << doacross->shift_counts[0];
+      unsigned long cur;
+
+      va_start (ap, first);
+      for (i = 1; i < doacross->ncounts; i++)
+        flattened |= va_arg (ap, gomp_ull)
+                     << doacross->shift_counts[i];
+      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
+      if (flattened < cur)
+        {
+          __atomic_thread_fence (MEMMODEL_RELEASE);
+          va_end (ap);
+          return;
+        }
+      doacross_spin (array, flattened, cur);
+      __atomic_thread_fence (MEMMODEL_RELEASE);
+      va_end (ap);
+      return;
+    }
+
+  if (sizeof (gomp_ull) == sizeof (unsigned long))
+    {
+      gomp_ull *array = (gomp_ull *) (doacross->array
+                                      + ent * doacross->elt_sz);
+      do
+        {
+          va_start (ap, first);
+          for (i = 0; i < doacross->ncounts; i++)
+            {
+              gomp_ull thisv
+                = (i ? va_arg (ap, gomp_ull) : first) + 1;
+              gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
+              if (thisv < cur)
+                {
+                  i = doacross->ncounts;
+                  break;
+                }
+              if (thisv > cur)
+                break;
+            }
+          va_end (ap);
+          if (i == doacross->ncounts)
+            break;
+          cpu_relax ();
+        }
+      while (1);
+    }
+  else
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+                                                + ent * doacross->elt_sz);
+      do
+        {
+          va_start (ap, first);
+          for (i = 0; i < doacross->ncounts; i++)
+            {
+              gomp_ull thisv
+                = (i ? va_arg (ap, gomp_ull) : first) + 1;
+              unsigned long t
+                = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
+              unsigned long cur
+                = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
+              if (t < cur)
+                {
+                  i = doacross->ncounts;
+                  break;
+                }
+              if (t > cur)
+                break;
+              t = thisv;
+              cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
+              if (t < cur)
+                {
+                  i = doacross->ncounts;
+                  break;
+                }
+              if (t > cur)
+                break;
+            }
+          va_end (ap);
+          if (i == doacross->ncounts)
+            break;
+          cpu_relax ();
+        }
+      while (1);
+    }
+  __sync_synchronize ();
+}
--- libgomp/loop.c.jj	2014-05-15 10:56:36.487505570 +0200
+++ libgomp/loop.c	2016-07-13 16:57:13.488423109 +0200
@@ -110,6 +110,11 @@ gomp_loop_static_start (long start, long
   return !gomp_iter_static_next (istart, iend);
 }
 
+/* The current dynamic implementation is always monotonic.  The
+   entry points without nonmonotonic in them have to remain monotonic,
+   but the nonmonotonic ones could be changed to use work stealing for
+   improved scalability.  */
+
 static bool
 gomp_loop_dynamic_start (long start, long end, long incr, long chunk_size,
                          long *istart, long *iend)
@@ -135,6 +140,9 @@ gomp_loop_dynamic_start (long start, lon
   return ret;
 }
 
+/* Similarly to dynamic, though the open question is how the chunk sizes
+   could be decreased without central locking or atomics.  */
+
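The nonmonotonic entry points added throughout this patch currently share code with the monotonic ones, as the comments above note; they exist so that a loop like the following sketch can later be retargeted to a work-stealing implementation without an ABI change (the loop body is illustrative):

    void
    scale (double *a, int n)
    {
      int i;
    #pragma omp parallel for schedule(nonmonotonic: dynamic, 8)
      for (i = 0; i < n; i++)
        a[i] *= 2.0;
    }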
 static bool
 gomp_loop_guided_start (long start, long end, long incr, long chunk_size,
                         long *istart, long *iend)
@@ -168,13 +176,16 @@ GOMP_loop_runtime_start (long start, lon
   switch (icv->run_sched_var)
     {
     case GFS_STATIC:
-      return gomp_loop_static_start (start, end, incr, icv->run_sched_modifier,
+      return gomp_loop_static_start (start, end, incr,
+                                     icv->run_sched_chunk_size,
                                      istart, iend);
     case GFS_DYNAMIC:
-      return gomp_loop_dynamic_start (start, end, incr, icv->run_sched_modifier,
+      return gomp_loop_dynamic_start (start, end, incr,
+                                      icv->run_sched_chunk_size,
                                       istart, iend);
     case GFS_GUIDED:
-      return gomp_loop_guided_start (start, end, incr, icv->run_sched_modifier,
+      return gomp_loop_guided_start (start, end, incr,
+                                     icv->run_sched_chunk_size,
                                      istart, iend);
     case GFS_AUTO:
       /* For now map to schedule(static), later on we could play with feedback
@@ -265,15 +276,15 @@ GOMP_loop_ordered_runtime_start (long st
     {
     case GFS_STATIC:
       return gomp_loop_ordered_static_start (start, end, incr,
-                                             icv->run_sched_modifier,
+                                             icv->run_sched_chunk_size,
                                              istart, iend);
     case GFS_DYNAMIC:
      return gomp_loop_ordered_dynamic_start (start, end, incr,
-                                              icv->run_sched_modifier,
+                                              icv->run_sched_chunk_size,
                                               istart, iend);
     case GFS_GUIDED:
       return gomp_loop_ordered_guided_start (start, end, incr,
-                                             icv->run_sched_modifier,
+                                             icv->run_sched_chunk_size,
                                              istart, iend);
     case GFS_AUTO:
       /* For now map to schedule(static), later on we could play with feedback
@@ -285,6 +296,111 @@ GOMP_loop_ordered_runtime_start (long st
     }
 }
 
+/* The *_doacross_*_start routines are similar.  The only difference is that
+   this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
+   section, the worksharing loop always iterates from 0 to COUNTS[0] - 1,
+   and the other COUNTS array elements tell the library the number of
+   iterations in the ordered inner loops.  */
+
+static bool
+gomp_loop_doacross_static_start (unsigned ncounts, long *counts,
+                                 long chunk_size, long *istart, long *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  thr->ts.static_trip = 0;
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
+                      GFS_STATIC, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+  return !gomp_iter_static_next (istart, iend);
+}
+
+static bool
+gomp_loop_doacross_dynamic_start (unsigned ncounts, long *counts,
+                                  long chunk_size, long *istart, long *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  bool ret;
+
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
+                      GFS_DYNAMIC, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+#ifdef HAVE_SYNC_BUILTINS
+  ret = gomp_iter_dynamic_next (istart, iend);
+#else
+  gomp_mutex_lock (&thr->ts.work_share->lock);
+  ret = gomp_iter_dynamic_next_locked (istart, iend);
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+  return ret;
+}
+
+static bool
+gomp_loop_doacross_guided_start (unsigned ncounts, long *counts,
+                                 long chunk_size, long *istart, long *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  bool ret;
+
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
+                      GFS_GUIDED, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+#ifdef HAVE_SYNC_BUILTINS
+  ret = gomp_iter_guided_next (istart, iend);
+#else
+  gomp_mutex_lock (&thr->ts.work_share->lock);
+  ret = gomp_iter_guided_next_locked (istart, iend);
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+  return ret;
+}
+
+bool
+GOMP_loop_doacross_runtime_start (unsigned ncounts, long *counts,
+                                  long *istart, long *iend)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  switch (icv->run_sched_var)
+    {
+    case GFS_STATIC:
+      return gomp_loop_doacross_static_start (ncounts, counts,
+                                              icv->run_sched_chunk_size,
+                                              istart, iend);
+    case GFS_DYNAMIC:
+      return gomp_loop_doacross_dynamic_start (ncounts, counts,
+                                               icv->run_sched_chunk_size,
+                                               istart, iend);
+    case GFS_GUIDED:
+      return gomp_loop_doacross_guided_start (ncounts, counts,
+                                              icv->run_sched_chunk_size,
+                                              istart, iend);
+    case GFS_AUTO:
+      /* For now map to schedule(static), later on we could play with feedback
+         driven choice.  */
+      return gomp_loop_doacross_static_start (ncounts, counts,
+                                              0, istart, iend);
+    default:
+      abort ();
+    }
+}
+
 /* The *_next routines are called when the thread completes processing of
    the iteration block currently assigned to it.
If the work-share construct is bound directly to a parallel construct, then the iteration @@ -483,7 +599,7 @@ GOMP_parallel_loop_runtime_start (void ( { struct gomp_task_icv *icv = gomp_icv (false); gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, - icv->run_sched_var, icv->run_sched_modifier, 0); + icv->run_sched_var, icv->run_sched_chunk_size, 0); } ialias_redirect (GOMP_parallel_end) @@ -521,6 +637,37 @@ GOMP_parallel_loop_guided (void (*fn) (v GOMP_parallel_end (); } +#ifdef HAVE_ATTRIBUTE_ALIAS +extern __typeof(GOMP_parallel_loop_dynamic) GOMP_parallel_loop_nonmonotonic_dynamic + __attribute__((alias ("GOMP_parallel_loop_dynamic"))); +extern __typeof(GOMP_parallel_loop_guided) GOMP_parallel_loop_nonmonotonic_guided + __attribute__((alias ("GOMP_parallel_loop_guided"))); +#else +void +GOMP_parallel_loop_nonmonotonic_dynamic (void (*fn) (void *), void *data, + unsigned num_threads, long start, + long end, long incr, long chunk_size, + unsigned flags) +{ + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, + GFS_DYNAMIC, chunk_size, flags); + fn (data); + GOMP_parallel_end (); +} + +void +GOMP_parallel_loop_nonmonotonic_guided (void (*fn) (void *), void *data, + unsigned num_threads, long start, + long end, long incr, long chunk_size, + unsigned flags) +{ + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, + GFS_GUIDED, chunk_size, flags); + fn (data); + GOMP_parallel_end (); +} +#endif + void GOMP_parallel_loop_runtime (void (*fn) (void *), void *data, unsigned num_threads, long start, long end, @@ -528,7 +675,7 @@ GOMP_parallel_loop_runtime (void (*fn) ( { struct gomp_task_icv *icv = gomp_icv (false); gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, - icv->run_sched_var, icv->run_sched_modifier, + icv->run_sched_var, icv->run_sched_chunk_size, flags); fn (data); GOMP_parallel_end (); @@ -569,6 +716,10 @@ extern __typeof(gomp_loop_dynamic_start) __attribute__((alias ("gomp_loop_dynamic_start"))); extern __typeof(gomp_loop_guided_start) GOMP_loop_guided_start __attribute__((alias ("gomp_loop_guided_start"))); +extern __typeof(gomp_loop_dynamic_start) GOMP_loop_nonmonotonic_dynamic_start + __attribute__((alias ("gomp_loop_dynamic_start"))); +extern __typeof(gomp_loop_guided_start) GOMP_loop_nonmonotonic_guided_start + __attribute__((alias ("gomp_loop_guided_start"))); extern __typeof(gomp_loop_ordered_static_start) GOMP_loop_ordered_static_start __attribute__((alias ("gomp_loop_ordered_static_start"))); @@ -577,12 +728,23 @@ extern __typeof(gomp_loop_ordered_dynami extern __typeof(gomp_loop_ordered_guided_start) GOMP_loop_ordered_guided_start __attribute__((alias ("gomp_loop_ordered_guided_start"))); +extern __typeof(gomp_loop_doacross_static_start) GOMP_loop_doacross_static_start + __attribute__((alias ("gomp_loop_doacross_static_start"))); +extern __typeof(gomp_loop_doacross_dynamic_start) GOMP_loop_doacross_dynamic_start + __attribute__((alias ("gomp_loop_doacross_dynamic_start"))); +extern __typeof(gomp_loop_doacross_guided_start) GOMP_loop_doacross_guided_start + __attribute__((alias ("gomp_loop_doacross_guided_start"))); + extern __typeof(gomp_loop_static_next) GOMP_loop_static_next __attribute__((alias ("gomp_loop_static_next"))); extern __typeof(gomp_loop_dynamic_next) GOMP_loop_dynamic_next __attribute__((alias ("gomp_loop_dynamic_next"))); extern __typeof(gomp_loop_guided_next) GOMP_loop_guided_next __attribute__((alias ("gomp_loop_guided_next"))); +extern __typeof(gomp_loop_dynamic_next) 
GOMP_loop_nonmonotonic_dynamic_next + __attribute__((alias ("gomp_loop_dynamic_next"))); +extern __typeof(gomp_loop_guided_next) GOMP_loop_nonmonotonic_guided_next + __attribute__((alias ("gomp_loop_guided_next"))); extern __typeof(gomp_loop_ordered_static_next) GOMP_loop_ordered_static_next __attribute__((alias ("gomp_loop_ordered_static_next"))); @@ -613,6 +775,21 @@ GOMP_loop_guided_start (long start, long } bool +GOMP_loop_nonmonotonic_dynamic_start (long start, long end, long incr, + long chunk_size, long *istart, + long *iend) +{ + return gomp_loop_dynamic_start (start, end, incr, chunk_size, istart, iend); +} + +bool +GOMP_loop_nonmonotonic_guided_start (long start, long end, long incr, + long chunk_size, long *istart, long *iend) +{ + return gomp_loop_guided_start (start, end, incr, chunk_size, istart, iend); +} + +bool GOMP_loop_ordered_static_start (long start, long end, long incr, long chunk_size, long *istart, long *iend) { @@ -637,6 +814,30 @@ GOMP_loop_ordered_guided_start (long sta } bool +GOMP_loop_doacross_static_start (unsigned ncounts, long *counts, + long chunk_size, long *istart, long *iend) +{ + return gomp_loop_doacross_static_start (ncounts, counts, chunk_size, + istart, iend); +} + +bool +GOMP_loop_doacross_dynamic_start (unsigned ncounts, long *counts, + long chunk_size, long *istart, long *iend) +{ + return gomp_loop_doacross_dynamic_start (ncounts, counts, chunk_size, + istart, iend); +} + +bool +GOMP_loop_doacross_guided_start (unsigned ncounts, long *counts, + long chunk_size, long *istart, long *iend) +{ + return gomp_loop_doacross_guided_start (ncounts, counts, chunk_size, + istart, iend); +} + +bool GOMP_loop_static_next (long *istart, long *iend) { return gomp_loop_static_next (istart, iend); @@ -653,6 +854,18 @@ GOMP_loop_guided_next (long *istart, lon { return gomp_loop_guided_next (istart, iend); } + +bool +GOMP_loop_nonmonotonic_dynamic_next (long *istart, long *iend) +{ + return gomp_loop_dynamic_next (istart, iend); +} + +bool +GOMP_loop_nonmonotonic_guided_next (long *istart, long *iend) +{ + return gomp_loop_guided_next (istart, iend); +} bool GOMP_loop_ordered_static_next (long *istart, long *iend) --- libgomp/error.c.jj 2013-01-21 16:00:31.834953566 +0100 +++ libgomp/error.c 2016-07-13 16:57:04.437535335 +0200 @@ -35,7 +35,26 @@ #include -static void +#undef gomp_vdebug +void +gomp_vdebug (int kind __attribute__ ((unused)), const char *msg, va_list list) +{ + if (gomp_debug_var) + vfprintf (stderr, msg, list); +} + +#undef gomp_debug +void +gomp_debug (int kind, const char *msg, ...) +{ + va_list list; + + va_start (list, msg); + gomp_vdebug (kind, msg, list); + va_end (list); +} + +void gomp_verror (const char *fmt, va_list list) { fputs ("\nlibgomp: ", stderr); @@ -54,13 +73,18 @@ gomp_error (const char *fmt, ...) } void +gomp_vfatal (const char *fmt, va_list list) +{ + gomp_verror (fmt, list); + exit (EXIT_FAILURE); +} + +void gomp_fatal (const char *fmt, ...) 
{ va_list list; va_start (list, fmt); - gomp_verror (fmt, list); + gomp_vfatal (fmt, list); va_end (list); - - exit (EXIT_FAILURE); } --- libgomp/Makefile.am.jj 2014-05-15 11:12:10.000000000 +0200 +++ libgomp/Makefile.am 2016-07-14 16:10:51.968202878 +0200 @@ -60,7 +60,13 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_L libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \ task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \ - time.c fortran.c affinity.c target.c + time.c fortran.c affinity.c target.c splay-tree.c libgomp-plugin.c \ + oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \ + oacc-plugin.c oacc-cuda.c priority_queue.c + +if USE_FORTRAN +libgomp_la_SOURCES += openacc.f90 +endif nodist_noinst_HEADERS = libgomp_f.h nodist_libsubinclude_HEADERS = omp.h --- libgomp/Makefile.in.jj 2014-05-15 11:12:10.000000000 +0200 +++ libgomp/Makefile.in 2016-07-14 16:11:10.981954087 +0200 @@ -36,6 +36,7 @@ POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ target_triplet = @target@ +@USE_FORTRAN_TRUE@am__append_1 = openacc.f90 subdir = . DIST_COMMON = ChangeLog $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ $(top_srcdir)/configure $(am__configure_deps) \ @@ -92,11 +93,15 @@ am__installdirs = "$(DESTDIR)$(toolexecl "$(DESTDIR)$(toolexeclibdir)" LTLIBRARIES = $(toolexeclib_LTLIBRARIES) libgomp_la_LIBADD = +@USE_FORTRAN_TRUE@am__objects_1 = openacc.lo am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \ error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \ parallel.lo sections.lo single.lo task.lo team.lo work.lo \ lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \ - fortran.lo affinity.lo target.lo + fortran.lo affinity.lo target.lo splay-tree.lo \ + libgomp-plugin.lo oacc-parallel.lo oacc-host.lo oacc-init.lo \ + oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \ + priority_queue.lo $(am__objects_1) libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) DEFAULT_INCLUDES = -I.@am__isrc@ depcomp = $(SHELL) $(top_srcdir)/../depcomp @@ -108,6 +113,13 @@ LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIB --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) CCLD = $(CC) +FCCOMPILE = $(FC) $(AM_FCFLAGS) $(FCFLAGS) +LTFCCOMPILE = $(LIBTOOL) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile $(FC) $(AM_FCFLAGS) $(FCFLAGS) +FCLD = $(FC) +FCLINK = $(LIBTOOL) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) $(AM_FCFLAGS) $(FCFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ SOURCES = $(libgomp_la_SOURCES) MULTISRCTOP = MULTIBUILDTOP = @@ -315,10 +327,12 @@ libgomp_la_LDFLAGS = $(libgomp_version_i libgomp_la_DEPENDENCIES = $(libgomp_version_dep) libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS) libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ - iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \ - task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \ - time.c fortran.c affinity.c target.c - + iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c \ + single.c task.c team.c work.c lock.c mutex.c proc.c sem.c \ + bar.c ptrlock.c time.c fortran.c affinity.c target.c \ + splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \ + oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \ + priority_queue.c $(am__append_1) nodist_noinst_HEADERS = libgomp_f.h nodist_libsubinclude_HEADERS = omp.h 
@USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod @@ -351,7 +365,7 @@ all: config.h $(MAKE) $(AM_MAKEFLAGS) all-recursive .SUFFIXES: -.SUFFIXES: .c .dvi .lo .o .obj .ps +.SUFFIXES: .c .dvi .f90 .lo .o .obj .ps am--refresh: @: $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) @@ -463,17 +477,27 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fortran.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop_ull.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mutex.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-async.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-cuda.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-host.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-init.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/proc.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ptrlock.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/splay-tree.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@ @@ -501,6 +525,15 @@ distclean-compile: @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< +.f90.o: + $(FCCOMPILE) -c -o $@ $< + +.f90.obj: + $(FCCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.f90.lo: + $(LTFCCOMPILE) -c -o $@ $< + mostlyclean-libtool: -rm -f *.lo --- libgomp/task.c.jj 2014-08-06 16:25:16.575091658 +0200 +++ libgomp/task.c 2016-07-13 17:47:58.722758497 +0200 @@ -28,6 +28,7 @@ #include "libgomp.h" #include #include +#include "gomp-constants.h" typedef struct gomp_task_depend_entry *hash_entry_type; @@ -63,6 +64,14 @@ void gomp_init_task (struct gomp_task *task, struct gomp_task *parent_task, struct gomp_task_icv *prev_icv) { + /* It would seem that using memset here would be a win, but it turns + out that partially filling gomp_task allows us to keep the + overhead of task creation low. In the nqueens-1.c test, for a + sufficiently large N, we drop the overhead from 5-6% to 1%. + + Note, the nqueens-1.c test in serial mode is a good test to + benchmark the overhead of creating tasks as there are millions of + tiny tasks created that all run undeferred. 
*/ task->parent = parent_task; task->icv = *prev_icv; task->kind = GOMP_TASK_IMPLICIT; @@ -71,7 +80,7 @@ gomp_init_task (struct gomp_task *task, task->final_task = false; task->copy_ctors_done = false; task->parent_depends_on = false; - task->children = NULL; + priority_queue_init (&task->children_queue); task->taskgroup = NULL; task->dependers = NULL; task->depend_hash = NULL; @@ -90,30 +99,194 @@ gomp_end_task (void) thr->task = task->parent; } +/* Clear the parent field of every task in LIST. */ + static inline void -gomp_clear_parent (struct gomp_task *children) +gomp_clear_parent_in_list (struct priority_list *list) { - struct gomp_task *task = children; - - if (task) + struct priority_node *p = list->tasks; + if (p) do { - task->parent = NULL; - task = task->next_child; + priority_node_to_task (PQ_CHILDREN, p)->parent = NULL; + p = p->next; } - while (task != children); + while (p != list->tasks); +} + +/* Splay tree version of gomp_clear_parent_in_list. + + Clear the parent field of every task in NODE within SP, and free + the node when done. */ + +static void +gomp_clear_parent_in_tree (prio_splay_tree sp, prio_splay_tree_node node) +{ + if (!node) + return; + prio_splay_tree_node left = node->left, right = node->right; + gomp_clear_parent_in_list (&node->key.l); +#if _LIBGOMP_CHECKING_ + memset (node, 0xaf, sizeof (*node)); +#endif + /* No need to remove the node from the tree. We're nuking + everything, so just free the nodes and our caller can clear the + entire splay tree. */ + free (node); + gomp_clear_parent_in_tree (sp, left); + gomp_clear_parent_in_tree (sp, right); +} + +/* Clear the parent field of every task in Q and remove every task + from Q. */ + +static inline void +gomp_clear_parent (struct priority_queue *q) +{ + if (priority_queue_multi_p (q)) + { + gomp_clear_parent_in_tree (&q->t, q->t.root); + /* All the nodes have been cleared in gomp_clear_parent_in_tree. + No need to remove anything. We can just nuke everything. */ + q->t.root = NULL; + } + else + gomp_clear_parent_in_list (&q->l); } -static void gomp_task_maybe_wait_for_dependencies (void **depend); +/* Helper function for GOMP_task and gomp_create_target_task. + + For a TASK with in/out dependencies, fill in the various dependency + queues. PARENT is the parent of said task. DEPEND is as in + GOMP_task. */ + +static void +gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent, + void **depend) +{ + size_t ndepend = (uintptr_t) depend[0]; + size_t nout = (uintptr_t) depend[1]; + size_t i; + hash_entry_type ent; + + task->depend_count = ndepend; + task->num_dependees = 0; + if (parent->depend_hash == NULL) + parent->depend_hash = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12); + for (i = 0; i < ndepend; i++) + { + task->depend[i].addr = depend[2 + i]; + task->depend[i].next = NULL; + task->depend[i].prev = NULL; + task->depend[i].task = task; + task->depend[i].is_in = i >= nout; + task->depend[i].redundant = false; + task->depend[i].redundant_out = false; + + hash_entry_type *slot = htab_find_slot (&parent->depend_hash, + &task->depend[i], INSERT); + hash_entry_type out = NULL, last = NULL; + if (*slot) + { + /* If multiple depends on the same task are the same, all but the + first one are redundant. As inout/out come first, if any of them + is inout/out, it will win, which is the right semantics. 
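+
+		For example (illustrative only), a single task created as
+
+		  #pragma omp task depend(out: x) depend(in: x)
+
+		carries two depend entries for the address of X; the
+		depend(out:) entry is hashed first, so the later
+		depend(in:) entry is merely marked redundant below.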
*/ + if ((*slot)->task == task) + { + task->depend[i].redundant = true; + continue; + } + for (ent = *slot; ent; ent = ent->next) + { + if (ent->redundant_out) + break; + + last = ent; + + /* depend(in:...) doesn't depend on earlier depend(in:...). */ + if (i >= nout && ent->is_in) + continue; + + if (!ent->is_in) + out = ent; + + struct gomp_task *tsk = ent->task; + if (tsk->dependers == NULL) + { + tsk->dependers + = gomp_malloc (sizeof (struct gomp_dependers_vec) + + 6 * sizeof (struct gomp_task *)); + tsk->dependers->n_elem = 1; + tsk->dependers->allocated = 6; + tsk->dependers->elem[0] = task; + task->num_dependees++; + continue; + } + /* We already have some other dependency on tsk from earlier + depend clause. */ + else if (tsk->dependers->n_elem + && (tsk->dependers->elem[tsk->dependers->n_elem - 1] + == task)) + continue; + else if (tsk->dependers->n_elem == tsk->dependers->allocated) + { + tsk->dependers->allocated + = tsk->dependers->allocated * 2 + 2; + tsk->dependers + = gomp_realloc (tsk->dependers, + sizeof (struct gomp_dependers_vec) + + (tsk->dependers->allocated + * sizeof (struct gomp_task *))); + } + tsk->dependers->elem[tsk->dependers->n_elem++] = task; + task->num_dependees++; + } + task->depend[i].next = *slot; + (*slot)->prev = &task->depend[i]; + } + *slot = &task->depend[i]; + + /* There is no need to store more than one depend({,in}out:) task per + address in the hash table chain for the purpose of creation of + deferred tasks, because each out depends on all earlier outs, thus it + is enough to record just the last depend({,in}out:). For depend(in:), + we need to keep all of the previous ones not terminated yet, because + a later depend({,in}out:) might need to depend on all of them. So, if + the new task's clause is depend({,in}out:), we know there is at most + one other depend({,in}out:) clause in the list (out). For + non-deferred tasks we want to see all outs, so they are moved to the + end of the chain, after first redundant_out entry all following + entries should be redundant_out. */ + if (!task->depend[i].is_in && out) + { + if (out != last) + { + out->next->prev = out->prev; + out->prev->next = out->next; + out->next = last->next; + out->prev = last; + last->next = out; + if (out->next) + out->next->prev = out; + } + out->redundant_out = true; + } + } +} /* Called when encountering an explicit task directive. If IF_CLAUSE is false, then we must not delay in executing the task. If UNTIED is true, - then the task may be executed by any member of the team. */ + then the task may be executed by any member of the team. + + DEPEND is an array containing: + depend[0]: number of depend elements. + depend[1]: number of depend elements of type "out". + depend[2..N+1]: address of [1..N]th depend element. */ void GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), long arg_size, long arg_align, bool if_clause, unsigned flags, - void **depend) + void **depend, int priority) { struct gomp_thread *thr = gomp_thread (); struct gomp_team *team = thr->ts.team; @@ -125,8 +298,7 @@ GOMP_task (void (*fn) (void *), void *da might be running on different thread than FN. */ if (cpyfn) if_clause = false; - if (flags & 1) - flags &= ~1; + flags &= ~GOMP_TASK_FLAG_UNTIED; #endif /* If parallel or taskgroup has been cancelled, don't start new tasks. 
*/ @@ -135,6 +307,11 @@ GOMP_task (void (*fn) (void *), void *da || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) return; + if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0) + priority = 0; + else if (priority > gomp_max_task_priority_var) + priority = gomp_max_task_priority_var; + if (!if_clause || team == NULL || (thr->task && thr->task->final_task) || team->task_count > 64 * team->nthreads) @@ -147,12 +324,15 @@ GOMP_task (void (*fn) (void *), void *da depend clauses for non-deferred tasks other than this, because the parent task is suspended until the child task finishes and thus it can't start further child tasks. */ - if ((flags & 8) && thr->task && thr->task->depend_hash) + if ((flags & GOMP_TASK_FLAG_DEPEND) + && thr->task && thr->task->depend_hash) gomp_task_maybe_wait_for_dependencies (depend); gomp_init_task (&task, thr->task, gomp_icv (false)); - task.kind = GOMP_TASK_IFFALSE; - task.final_task = (thr->task && thr->task->final_task) || (flags & 2); + task.kind = GOMP_TASK_UNDEFERRED; + task.final_task = (thr->task && thr->task->final_task) + || (flags & GOMP_TASK_FLAG_FINAL); + task.priority = priority; if (thr->task) { task.in_tied_task = thr->task->in_tied_task; @@ -178,10 +358,10 @@ GOMP_task (void (*fn) (void *), void *da child thread, but seeing a stale non-NULL value is not a problem. Once past the task_lock acquisition, this thread will see the real value of task.children. */ - if (task.children != NULL) + if (!priority_queue_empty_p (&task.children_queue, MEMMODEL_RELAXED)) { gomp_mutex_lock (&team->task_lock); - gomp_clear_parent (task.children); + gomp_clear_parent (&task.children_queue); gomp_mutex_unlock (&team->task_lock); } gomp_end_task (); @@ -195,7 +375,7 @@ GOMP_task (void (*fn) (void *), void *da bool do_wake; size_t depend_size = 0; - if (flags & 8) + if (flags & GOMP_TASK_FLAG_DEPEND) depend_size = ((uintptr_t) depend[0] * sizeof (struct gomp_task_depend_entry)); task = gomp_malloc (sizeof (*task) + depend_size @@ -203,7 +383,8 @@ GOMP_task (void (*fn) (void *), void *da arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1) & ~(uintptr_t) (arg_align - 1)); gomp_init_task (task, parent, gomp_icv (false)); - task->kind = GOMP_TASK_IFFALSE; + task->priority = priority; + task->kind = GOMP_TASK_UNDEFERRED; task->in_tied_task = parent->in_tied_task; task->taskgroup = taskgroup; thr->task = task; @@ -218,7 +399,7 @@ GOMP_task (void (*fn) (void *), void *da task->kind = GOMP_TASK_WAITING; task->fn = fn; task->fn_data = arg; - task->final_task = (flags & 2) >> 1; + task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1; gomp_mutex_lock (&team->task_lock); /* If parallel or taskgroup has been cancelled, don't start new tasks. */ @@ -235,171 +416,39 @@ GOMP_task (void (*fn) (void *), void *da taskgroup->num_children++; if (depend_size) { - size_t ndepend = (uintptr_t) depend[0]; - size_t nout = (uintptr_t) depend[1]; - size_t i; - hash_entry_type ent; - - task->depend_count = ndepend; - task->num_dependees = 0; - if (parent->depend_hash == NULL) - parent->depend_hash - = htab_create (2 * ndepend > 12 ? 
2 * ndepend : 12); - for (i = 0; i < ndepend; i++) - { - task->depend[i].addr = depend[2 + i]; - task->depend[i].next = NULL; - task->depend[i].prev = NULL; - task->depend[i].task = task; - task->depend[i].is_in = i >= nout; - task->depend[i].redundant = false; - task->depend[i].redundant_out = false; - - hash_entry_type *slot - = htab_find_slot (&parent->depend_hash, &task->depend[i], - INSERT); - hash_entry_type out = NULL, last = NULL; - if (*slot) - { - /* If multiple depends on the same task are the - same, all but the first one are redundant. - As inout/out come first, if any of them is - inout/out, it will win, which is the right - semantics. */ - if ((*slot)->task == task) - { - task->depend[i].redundant = true; - continue; - } - for (ent = *slot; ent; ent = ent->next) - { - if (ent->redundant_out) - break; - - last = ent; - - /* depend(in:...) doesn't depend on earlier - depend(in:...). */ - if (i >= nout && ent->is_in) - continue; - - if (!ent->is_in) - out = ent; - - struct gomp_task *tsk = ent->task; - if (tsk->dependers == NULL) - { - tsk->dependers - = gomp_malloc (sizeof (struct gomp_dependers_vec) - + 6 * sizeof (struct gomp_task *)); - tsk->dependers->n_elem = 1; - tsk->dependers->allocated = 6; - tsk->dependers->elem[0] = task; - task->num_dependees++; - continue; - } - /* We already have some other dependency on tsk - from earlier depend clause. */ - else if (tsk->dependers->n_elem - && (tsk->dependers->elem[tsk->dependers->n_elem - - 1] - == task)) - continue; - else if (tsk->dependers->n_elem - == tsk->dependers->allocated) - { - tsk->dependers->allocated - = tsk->dependers->allocated * 2 + 2; - tsk->dependers - = gomp_realloc (tsk->dependers, - sizeof (struct gomp_dependers_vec) - + (tsk->dependers->allocated - * sizeof (struct gomp_task *))); - } - tsk->dependers->elem[tsk->dependers->n_elem++] = task; - task->num_dependees++; - } - task->depend[i].next = *slot; - (*slot)->prev = &task->depend[i]; - } - *slot = &task->depend[i]; - - /* There is no need to store more than one depend({,in}out:) - task per address in the hash table chain for the purpose - of creation of deferred tasks, because each out - depends on all earlier outs, thus it is enough to record - just the last depend({,in}out:). For depend(in:), we need - to keep all of the previous ones not terminated yet, because - a later depend({,in}out:) might need to depend on all of - them. So, if the new task's clause is depend({,in}out:), - we know there is at most one other depend({,in}out:) clause - in the list (out). For non-deferred tasks we want to see - all outs, so they are moved to the end of the chain, - after first redundant_out entry all following entries - should be redundant_out. */ - if (!task->depend[i].is_in && out) - { - if (out != last) - { - out->next->prev = out->prev; - out->prev->next = out->next; - out->next = last->next; - out->prev = last; - last->next = out; - if (out->next) - out->next->prev = out; - } - out->redundant_out = true; - } - } + gomp_task_handle_depend (task, parent, depend); if (task->num_dependees) { + /* Tasks that depend on other tasks are not put into the + various waiting queues, so we are done for now. Said + tasks are instead put into the queues via + gomp_task_run_post_handle_dependers() after their + dependencies have been satisfied. After which, they + can be picked up by the various scheduling + points. 
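+
+	     An illustrative sketch: given
+
+	       #pragma omp task depend(out: x)	/* T1 */
+	       #pragma omp task depend(in: x)	/* T2 */
+
+	     T2 is created with num_dependees == 1 and is recorded only
+	     in T1's dependers vector until T1 completes.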
*/ gomp_mutex_unlock (&team->task_lock); return; } } - if (parent->children) - { - task->next_child = parent->children; - task->prev_child = parent->children->prev_child; - task->next_child->prev_child = task; - task->prev_child->next_child = task; - } - else - { - task->next_child = task; - task->prev_child = task; - } - parent->children = task; + + priority_queue_insert (PQ_CHILDREN, &parent->children_queue, + task, priority, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); if (taskgroup) - { - if (taskgroup->children) - { - task->next_taskgroup = taskgroup->children; - task->prev_taskgroup = taskgroup->children->prev_taskgroup; - task->next_taskgroup->prev_taskgroup = task; - task->prev_taskgroup->next_taskgroup = task; - } - else - { - task->next_taskgroup = task; - task->prev_taskgroup = task; - } - taskgroup->children = task; - } - if (team->task_queue) - { - task->next_queue = team->task_queue; - task->prev_queue = team->task_queue->prev_queue; - task->next_queue->prev_queue = task; - task->prev_queue->next_queue = task; - } - else - { - task->next_queue = task; - task->prev_queue = task; - team->task_queue = task; - } + priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + task, priority, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + + priority_queue_insert (PQ_TEAM, &team->task_queue, + task, priority, + PRIORITY_INSERT_END, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + ++team->task_count; ++team->task_queued_count; gomp_team_barrier_set_task_pending (&team->barrier); @@ -411,36 +460,529 @@ GOMP_task (void (*fn) (void *), void *da } } -static inline bool -gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent, - struct gomp_taskgroup *taskgroup, struct gomp_team *team) +ialias (GOMP_taskgroup_start) +ialias (GOMP_taskgroup_end) + +#define TYPE long +#define UTYPE unsigned long +#define TYPE_is_long 1 +#include "taskloop.c" +#undef TYPE +#undef UTYPE +#undef TYPE_is_long + +#define TYPE unsigned long long +#define UTYPE TYPE +#define GOMP_taskloop GOMP_taskloop_ull +#include "taskloop.c" +#undef TYPE +#undef UTYPE +#undef GOMP_taskloop + +static void inline +priority_queue_move_task_first (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task) { +#if _LIBGOMP_CHECKING_ + if (!priority_queue_task_in_queue_p (type, head, task)) + gomp_fatal ("Attempt to move first missing task %p", task); +#endif + struct priority_list *list; + if (priority_queue_multi_p (head)) + { + list = priority_queue_lookup_priority (head, task->priority); +#if _LIBGOMP_CHECKING_ + if (!list) + gomp_fatal ("Unable to find priority %d", task->priority); +#endif + } + else + list = &head->l; + priority_list_remove (list, task_to_priority_node (type, task), 0); + priority_list_insert (type, list, task, task->priority, + PRIORITY_INSERT_BEGIN, type == PQ_CHILDREN, + task->parent_depends_on); +} + +/* Actual body of GOMP_PLUGIN_target_task_completion that is executed + with team->task_lock held, or is executed in the thread that called + gomp_target_task_fn if GOMP_PLUGIN_target_task_completion has been + run before it acquires team->task_lock. 
*/ + +static void +gomp_target_task_completion (struct gomp_team *team, struct gomp_task *task) +{ + struct gomp_task *parent = task->parent; if (parent) + priority_queue_move_task_first (PQ_CHILDREN, &parent->children_queue, + task); + + struct gomp_taskgroup *taskgroup = task->taskgroup; + if (taskgroup) + priority_queue_move_task_first (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + task); + + priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority, + PRIORITY_INSERT_BEGIN, false, + task->parent_depends_on); + task->kind = GOMP_TASK_WAITING; + if (parent && parent->taskwait) { - if (parent->children == child_task) - parent->children = child_task->next_child; - if (__builtin_expect (child_task->parent_depends_on, 0) - && parent->taskwait->last_parent_depends_on == child_task) - { - if (child_task->prev_child->kind == GOMP_TASK_WAITING - && child_task->prev_child->parent_depends_on) - parent->taskwait->last_parent_depends_on = child_task->prev_child; - else - parent->taskwait->last_parent_depends_on = NULL; + if (parent->taskwait->in_taskwait) + { + /* One more task has had its dependencies met. + Inform any waiters. */ + parent->taskwait->in_taskwait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); } + else if (parent->taskwait->in_depend_wait) + { + /* One more task has had its dependencies met. + Inform any waiters. */ + parent->taskwait->in_depend_wait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); + } + } + if (taskgroup && taskgroup->in_taskgroup_wait) + { + /* One more task has had its dependencies met. + Inform any waiters. */ + taskgroup->in_taskgroup_wait = false; + gomp_sem_post (&taskgroup->taskgroup_sem); } - if (taskgroup && taskgroup->children == child_task) - taskgroup->children = child_task->next_taskgroup; - child_task->prev_queue->next_queue = child_task->next_queue; - child_task->next_queue->prev_queue = child_task->prev_queue; - if (team->task_queue == child_task) + + ++team->task_queued_count; + gomp_team_barrier_set_task_pending (&team->barrier); + /* I'm afraid this can't be done after releasing team->task_lock, + as gomp_target_task_completion is run from unrelated thread and + therefore in between gomp_mutex_unlock and gomp_team_barrier_wake + the team could be gone already. */ + if (team->nthreads > team->task_running_count) + gomp_team_barrier_wake (&team->barrier, 1); +} + +/* Signal that a target task TTASK has completed the asynchronously + running phase and should be requeued as a task to handle the + variable unmapping. */ + +void +GOMP_PLUGIN_target_task_completion (void *data) +{ + struct gomp_target_task *ttask = (struct gomp_target_task *) data; + struct gomp_task *task = ttask->task; + struct gomp_team *team = ttask->team; + + gomp_mutex_lock (&team->task_lock); + if (ttask->state == GOMP_TARGET_TASK_READY_TO_RUN) { - if (child_task->next_queue != child_task) - team->task_queue = child_task->next_queue; + ttask->state = GOMP_TARGET_TASK_FINISHED; + gomp_mutex_unlock (&team->task_lock); + return; + } + ttask->state = GOMP_TARGET_TASK_FINISHED; + gomp_target_task_completion (team, task); + gomp_mutex_unlock (&team->task_lock); +} + +static void gomp_task_run_post_handle_depend_hash (struct gomp_task *); + +/* Called for nowait target tasks. 
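+
+   For instance (illustrative), a construct such as
+
+     #pragma omp target nowait map(tofrom: x)
+
+   reaches this function so that the target region can be turned
+   into a deferrable task rather than being run synchronously.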
*/ + +bool +gomp_create_target_task (struct gomp_device_descr *devicep, + void (*fn) (void *), size_t mapnum, void **hostaddrs, + size_t *sizes, unsigned short *kinds, + unsigned int flags, void **depend, void **args, + enum gomp_target_task_state state) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ + if (team + && (gomp_team_barrier_cancelled (&team->barrier) + || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) + return true; + + struct gomp_target_task *ttask; + struct gomp_task *task; + struct gomp_task *parent = thr->task; + struct gomp_taskgroup *taskgroup = parent->taskgroup; + bool do_wake; + size_t depend_size = 0; + uintptr_t depend_cnt = 0; + size_t tgt_align = 0, tgt_size = 0; + + if (depend != NULL) + { + depend_cnt = (uintptr_t) depend[0]; + depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry); + } + if (fn) + { + /* GOMP_MAP_FIRSTPRIVATE need to be copied first, as they are + firstprivate on the target task. */ + size_t i; + for (i = 0; i < mapnum; i++) + if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE) + { + size_t align = (size_t) 1 << (kinds[i] >> 8); + if (tgt_align < align) + tgt_align = align; + tgt_size = (tgt_size + align - 1) & ~(align - 1); + tgt_size += sizes[i]; + } + if (tgt_align) + tgt_size += tgt_align - 1; else - team->task_queue = NULL; + tgt_size = 0; } + + task = gomp_malloc (sizeof (*task) + depend_size + + sizeof (*ttask) + + mapnum * (sizeof (void *) + sizeof (size_t) + + sizeof (unsigned short)) + + tgt_size); + gomp_init_task (task, parent, gomp_icv (false)); + task->priority = 0; + task->kind = GOMP_TASK_WAITING; + task->in_tied_task = parent->in_tied_task; + task->taskgroup = taskgroup; + ttask = (struct gomp_target_task *) &task->depend[depend_cnt]; + ttask->devicep = devicep; + ttask->fn = fn; + ttask->mapnum = mapnum; + ttask->args = args; + memcpy (ttask->hostaddrs, hostaddrs, mapnum * sizeof (void *)); + ttask->sizes = (size_t *) &ttask->hostaddrs[mapnum]; + memcpy (ttask->sizes, sizes, mapnum * sizeof (size_t)); + ttask->kinds = (unsigned short *) &ttask->sizes[mapnum]; + memcpy (ttask->kinds, kinds, mapnum * sizeof (unsigned short)); + if (tgt_align) + { + char *tgt = (char *) &ttask->kinds[mapnum]; + size_t i; + uintptr_t al = (uintptr_t) tgt & (tgt_align - 1); + if (al) + tgt += tgt_align - al; + tgt_size = 0; + for (i = 0; i < mapnum; i++) + if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE) + { + size_t align = (size_t) 1 << (kinds[i] >> 8); + tgt_size = (tgt_size + align - 1) & ~(align - 1); + memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]); + ttask->hostaddrs[i] = tgt + tgt_size; + tgt_size = tgt_size + sizes[i]; + } + } + ttask->flags = flags; + ttask->state = state; + ttask->task = task; + ttask->team = team; + task->fn = NULL; + task->fn_data = ttask; + task->final_task = 0; + gomp_mutex_lock (&team->task_lock); + /* If parallel or taskgroup has been cancelled, don't start new tasks. 
*/ + if (__builtin_expect (gomp_team_barrier_cancelled (&team->barrier) + || (taskgroup && taskgroup->cancelled), 0)) + { + gomp_mutex_unlock (&team->task_lock); + gomp_finish_task (task); + free (task); + return true; + } + if (depend_size) + { + gomp_task_handle_depend (task, parent, depend); + if (task->num_dependees) + { + if (taskgroup) + taskgroup->num_children++; + gomp_mutex_unlock (&team->task_lock); + return true; + } + } + if (state == GOMP_TARGET_TASK_DATA) + { + gomp_task_run_post_handle_depend_hash (task); + gomp_mutex_unlock (&team->task_lock); + gomp_finish_task (task); + free (task); + return false; + } + if (taskgroup) + taskgroup->num_children++; + /* For async offloading, if we don't need to wait for dependencies, + run the gomp_target_task_fn right away, essentially schedule the + mapping part of the task in the current thread. */ + if (devicep != NULL + && (devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + { + priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0, + PRIORITY_INSERT_END, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + if (taskgroup) + priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + task, 0, PRIORITY_INSERT_END, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + task->pnode[PQ_TEAM].next = NULL; + task->pnode[PQ_TEAM].prev = NULL; + task->kind = GOMP_TASK_TIED; + ++team->task_count; + gomp_mutex_unlock (&team->task_lock); + + thr->task = task; + gomp_target_task_fn (task->fn_data); + thr->task = parent; + + gomp_mutex_lock (&team->task_lock); + task->kind = GOMP_TASK_ASYNC_RUNNING; + /* If GOMP_PLUGIN_target_task_completion has run already + in between gomp_target_task_fn and the mutex lock, + perform the requeuing here. */ + if (ttask->state == GOMP_TARGET_TASK_FINISHED) + gomp_target_task_completion (team, task); + else + ttask->state = GOMP_TARGET_TASK_RUNNING; + gomp_mutex_unlock (&team->task_lock); + return true; + } + priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + if (taskgroup) + priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, task, 0, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + priority_queue_insert (PQ_TEAM, &team->task_queue, task, 0, + PRIORITY_INSERT_END, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + ++team->task_count; + ++team->task_queued_count; + gomp_team_barrier_set_task_pending (&team->barrier); + do_wake = team->task_running_count + !parent->in_tied_task + < team->nthreads; + gomp_mutex_unlock (&team->task_lock); + if (do_wake) + gomp_team_barrier_wake (&team->barrier, 1); + return true; +} + +/* Given a parent_depends_on task in LIST, move it to the front of its + priority so it is run as soon as possible. + + Care is taken to update the list's LAST_PARENT_DEPENDS_ON field. + + We rearrange the queue such that all parent_depends_on tasks are + first, and last_parent_depends_on points to the last such task we + rearranged. For example, given the following tasks in a queue + where PD[123] are the parent_depends_on tasks: + + task->children + | + V + C1 -> C2 -> C3 -> PD1 -> PD2 -> PD3 -> C4 + + We rearrange such that: + + task->children + | +--- last_parent_depends_on + | | + V V + PD1 -> PD2 -> PD3 -> C1 -> C2 -> C3 -> C4. 
*/ + +static void inline +priority_list_upgrade_task (struct priority_list *list, + struct priority_node *node) +{ + struct priority_node *last_parent_depends_on + = list->last_parent_depends_on; + if (last_parent_depends_on) + { + node->prev->next = node->next; + node->next->prev = node->prev; + node->prev = last_parent_depends_on; + node->next = last_parent_depends_on->next; + node->prev->next = node; + node->next->prev = node; + } + else if (node != list->tasks) + { + node->prev->next = node->next; + node->next->prev = node->prev; + node->prev = list->tasks->prev; + node->next = list->tasks; + list->tasks = node; + node->prev->next = node; + node->next->prev = node; + } + list->last_parent_depends_on = node; +} + +/* Given a parent_depends_on TASK in its parent's children_queue, move + it to the front of its priority so it is run as soon as possible. + + PARENT is passed as an optimization. + + (This function could be defined in priority_queue.c, but we want it + inlined, and putting it in priority_queue.h is not an option, given + that gomp_task has not been properly defined at that point). */ + +static void inline +priority_queue_upgrade_task (struct gomp_task *task, + struct gomp_task *parent) +{ + struct priority_queue *head = &parent->children_queue; + struct priority_node *node = &task->pnode[PQ_CHILDREN]; +#if _LIBGOMP_CHECKING_ + if (!task->parent_depends_on) + gomp_fatal ("priority_queue_upgrade_task: task must be a " + "parent_depends_on task"); + if (!priority_queue_task_in_queue_p (PQ_CHILDREN, head, task)) + gomp_fatal ("priority_queue_upgrade_task: cannot find task=%p", task); +#endif + if (priority_queue_multi_p (head)) + { + struct priority_list *list + = priority_queue_lookup_priority (head, task->priority); + priority_list_upgrade_task (list, node); + } + else + priority_list_upgrade_task (&head->l, node); +} + +/* Given a CHILD_TASK in LIST that is about to be executed, move it out of + the way in LIST so that other tasks can be considered for + execution. LIST contains tasks of type TYPE. + + Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field + if applicable. */ + +static void inline +priority_list_downgrade_task (enum priority_queue_type type, + struct priority_list *list, + struct gomp_task *child_task) +{ + struct priority_node *node = task_to_priority_node (type, child_task); + if (list->tasks == node) + list->tasks = node->next; + else if (node->next != list->tasks) + { + /* The task in NODE is about to become TIED and TIED tasks + cannot come before WAITING tasks. If we're about to + leave the queue in such an indeterminate state, rewire + things appropriately. However, a TIED task at the end is + perfectly fine. */ + struct gomp_task *next_task = priority_node_to_task (type, node->next); + if (next_task->kind == GOMP_TASK_WAITING) + { + /* Remove from list. */ + node->prev->next = node->next; + node->next->prev = node->prev; + /* Rewire at the end. */ + node->next = list->tasks; + node->prev = list->tasks->prev; + list->tasks->prev->next = node; + list->tasks->prev = node; + } + } + + /* If the current task is the last_parent_depends_on for its + priority, adjust last_parent_depends_on appropriately. 
*/ + if (__builtin_expect (child_task->parent_depends_on, 0) + && list->last_parent_depends_on == node) + { + struct gomp_task *prev_child = priority_node_to_task (type, node->prev); + if (node->prev != node + && prev_child->kind == GOMP_TASK_WAITING + && prev_child->parent_depends_on) + list->last_parent_depends_on = node->prev; + else + { + /* There are no more parent_depends_on entries waiting + to run; clear the list. */ + list->last_parent_depends_on = NULL; + } + } +} + +/* Given a TASK in HEAD that is about to be executed, move it out of + the way so that other tasks can be considered for execution. HEAD + contains tasks of type TYPE. + + Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field + if applicable. + + (This function could be defined in priority_queue.c, but we want it + inlined, and putting it in priority_queue.h is not an option, given + that gomp_task has not been properly defined at that point). */ + +static void inline +priority_queue_downgrade_task (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task) +{ +#if _LIBGOMP_CHECKING_ + if (!priority_queue_task_in_queue_p (type, head, task)) + gomp_fatal ("Attempt to downgrade missing task %p", task); +#endif + if (priority_queue_multi_p (head)) + { + struct priority_list *list + = priority_queue_lookup_priority (head, task->priority); + priority_list_downgrade_task (type, list, task); + } + else + priority_list_downgrade_task (type, &head->l, task); +} + +/* Set up CHILD_TASK to execute. This is done by setting the task to + TIED, and updating all relevant queues so that CHILD_TASK is no + longer chosen for scheduling. Also, remove CHILD_TASK from the + overall team task queue entirely. + + Return TRUE if task or its containing taskgroup has been + cancelled. */ + +static inline bool +gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent, + struct gomp_team *team) +{ +#if _LIBGOMP_CHECKING_ + if (child_task->parent) + priority_queue_verify (PQ_CHILDREN, + &child_task->parent->children_queue, true); + if (child_task->taskgroup) + priority_queue_verify (PQ_TASKGROUP, + &child_task->taskgroup->taskgroup_queue, false); + priority_queue_verify (PQ_TEAM, &team->task_queue, false); +#endif + + /* Task is about to go tied, move it out of the way. */ + if (parent) + priority_queue_downgrade_task (PQ_CHILDREN, &parent->children_queue, + child_task); + + /* Task is about to go tied, move it out of the way. */ + struct gomp_taskgroup *taskgroup = child_task->taskgroup; + if (taskgroup) + priority_queue_downgrade_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + child_task); + + priority_queue_remove (PQ_TEAM, &team->task_queue, child_task, + MEMMODEL_RELAXED); + child_task->pnode[PQ_TEAM].next = NULL; + child_task->pnode[PQ_TEAM].prev = NULL; child_task->kind = GOMP_TASK_TIED; + if (--team->task_queued_count == 0) gomp_team_barrier_clear_task_pending (&team->barrier); if ((gomp_team_barrier_cancelled (&team->barrier) @@ -478,6 +1020,14 @@ gomp_task_run_post_handle_depend_hash (s } } +/* After a CHILD_TASK has been run, adjust the dependency queue for + each task that depends on CHILD_TASK, to record the fact that there + is one less dependency to worry about. If a task that depended on + CHILD_TASK now has no dependencies, place it in the various queues + so it gets scheduled to run. + + TEAM is the team to which CHILD_TASK belongs.
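+
+   Illustrative example: if tasks T2 and T3 each carried a single
+   depend(in: x) on a value produced by CHILD_TASK via
+   depend(out: x), their num_dependees drops to zero here and both
+   are inserted into the children, taskgroup and team queues so any
+   scheduling point can pick them up.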
*/ + static size_t gomp_task_run_post_handle_dependers (struct gomp_task *child_task, struct gomp_team *team) @@ -487,91 +1037,60 @@ gomp_task_run_post_handle_dependers (str for (i = 0; i < count; i++) { struct gomp_task *task = child_task->dependers->elem[i]; + + /* CHILD_TASK satisfies a dependency for TASK. Keep track of + TASK's remaining dependencies. Once TASK has no other + dependencies, put it into the various queues so it will get + scheduled for execution. */ if (--task->num_dependees != 0) continue; struct gomp_taskgroup *taskgroup = task->taskgroup; if (parent) { - if (parent->children) - { - /* If parent is in gomp_task_maybe_wait_for_dependencies - and it doesn't need to wait for this task, put it after - all ready to run tasks it needs to wait for. */ - if (parent->taskwait && parent->taskwait->last_parent_depends_on - && !task->parent_depends_on) - { - struct gomp_task *last_parent_depends_on - = parent->taskwait->last_parent_depends_on; - task->next_child = last_parent_depends_on->next_child; - task->prev_child = last_parent_depends_on; - } - else - { - task->next_child = parent->children; - task->prev_child = parent->children->prev_child; - parent->children = task; - } - task->next_child->prev_child = task; - task->prev_child->next_child = task; - } - else - { - task->next_child = task; - task->prev_child = task; - parent->children = task; - } + priority_queue_insert (PQ_CHILDREN, &parent->children_queue, + task, task->priority, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/true, + task->parent_depends_on); if (parent->taskwait) { if (parent->taskwait->in_taskwait) { + /* One more task has had its dependencies met. + Inform any waiters. */ parent->taskwait->in_taskwait = false; gomp_sem_post (&parent->taskwait->taskwait_sem); } else if (parent->taskwait->in_depend_wait) { + /* One more task has had its dependencies met. + Inform any waiters. */ parent->taskwait->in_depend_wait = false; gomp_sem_post (&parent->taskwait->taskwait_sem); } - if (parent->taskwait->last_parent_depends_on == NULL - && task->parent_depends_on) - parent->taskwait->last_parent_depends_on = task; } } if (taskgroup) { - if (taskgroup->children) - { - task->next_taskgroup = taskgroup->children; - task->prev_taskgroup = taskgroup->children->prev_taskgroup; - task->next_taskgroup->prev_taskgroup = task; - task->prev_taskgroup->next_taskgroup = task; - } - else - { - task->next_taskgroup = task; - task->prev_taskgroup = task; - } - taskgroup->children = task; + priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + task, task->priority, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); if (taskgroup->in_taskgroup_wait) { + /* One more task has had its dependencies met. + Inform any waiters. 
*/ taskgroup->in_taskgroup_wait = false; gomp_sem_post (&taskgroup->taskgroup_sem); } } - if (team->task_queue) - { - task->next_queue = team->task_queue; - task->prev_queue = team->task_queue->prev_queue; - task->next_queue->prev_queue = task; - task->prev_queue->next_queue = task; - } - else - { - task->next_queue = task; - task->prev_queue = task; - team->task_queue = task; - } + priority_queue_insert (PQ_TEAM, &team->task_queue, + task, task->priority, + PRIORITY_INSERT_END, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); ++team->task_count; ++team->task_queued_count; ++ret; @@ -601,12 +1120,18 @@ gomp_task_run_post_handle_depend (struct return gomp_task_run_post_handle_dependers (child_task, team); } +/* Remove CHILD_TASK from its parent. */ + static inline void gomp_task_run_post_remove_parent (struct gomp_task *child_task) { struct gomp_task *parent = child_task->parent; if (parent == NULL) return; + + /* If this was the last task the parent was depending on, + synchronize with gomp_task_maybe_wait_for_dependencies so it can + clean up and return. */ if (__builtin_expect (child_task->parent_depends_on, 0) && --parent->taskwait->n_depend == 0 && parent->taskwait->in_depend_wait) @@ -614,36 +1139,31 @@ gomp_task_run_post_remove_parent (struct parent->taskwait->in_depend_wait = false; gomp_sem_post (&parent->taskwait->taskwait_sem); } - child_task->prev_child->next_child = child_task->next_child; - child_task->next_child->prev_child = child_task->prev_child; - if (parent->children != child_task) - return; - if (child_task->next_child != child_task) - parent->children = child_task->next_child; - else + + if (priority_queue_remove (PQ_CHILDREN, &parent->children_queue, + child_task, MEMMODEL_RELEASE) + && parent->taskwait && parent->taskwait->in_taskwait) { - /* We access task->children in GOMP_taskwait - outside of the task lock mutex region, so - need a release barrier here to ensure memory - written by child_task->fn above is flushed - before the NULL is written. */ - __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE); - if (parent->taskwait && parent->taskwait->in_taskwait) - { - parent->taskwait->in_taskwait = false; - gomp_sem_post (&parent->taskwait->taskwait_sem); - } + parent->taskwait->in_taskwait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); } + child_task->pnode[PQ_CHILDREN].next = NULL; + child_task->pnode[PQ_CHILDREN].prev = NULL; } +/* Remove CHILD_TASK from its taskgroup. */ + static inline void gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task) { struct gomp_taskgroup *taskgroup = child_task->taskgroup; if (taskgroup == NULL) return; - child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup; - child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup; + bool empty = priority_queue_remove (PQ_TASKGROUP, + &taskgroup->taskgroup_queue, + child_task, MEMMODEL_RELAXED); + child_task->pnode[PQ_TASKGROUP].next = NULL; + child_task->pnode[PQ_TASKGROUP].prev = NULL; if (taskgroup->num_children > 1) --taskgroup->num_children; else @@ -655,18 +1175,10 @@ gomp_task_run_post_remove_taskgroup (str before the NULL is written. 
*/ __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE); } - if (taskgroup->children != child_task) - return; - if (child_task->next_taskgroup != child_task) - taskgroup->children = child_task->next_taskgroup; - else + if (empty && taskgroup->in_taskgroup_wait) { - taskgroup->children = NULL; - if (taskgroup->in_taskgroup_wait) - { - taskgroup->in_taskgroup_wait = false; - gomp_sem_post (&taskgroup->taskgroup_sem); - } + taskgroup->in_taskgroup_wait = false; + gomp_sem_post (&taskgroup->taskgroup_sem); } } @@ -696,11 +1208,15 @@ gomp_barrier_handle_tasks (gomp_barrier_ while (1) { bool cancelled = false; - if (team->task_queue != NULL) + if (!priority_queue_empty_p (&team->task_queue, MEMMODEL_RELAXED)) { - child_task = team->task_queue; + bool ignored; + child_task + = priority_queue_next_task (PQ_TEAM, &team->task_queue, + PQ_IGNORED, NULL, + &ignored); cancelled = gomp_task_run_pre (child_task, child_task->parent, - child_task->taskgroup, team); + team); if (__builtin_expect (cancelled, 0)) { if (to_free) @@ -729,7 +1245,29 @@ gomp_barrier_handle_tasks (gomp_barrier_ if (child_task) { thr->task = child_task; - child_task->fn (child_task->fn_data); + if (__builtin_expect (child_task->fn == NULL, 0)) + { + if (gomp_target_task_fn (child_task->fn_data)) + { + thr->task = task; + gomp_mutex_lock (&team->task_lock); + child_task->kind = GOMP_TASK_ASYNC_RUNNING; + team->task_running_count--; + struct gomp_target_task *ttask + = (struct gomp_target_task *) child_task->fn_data; + /* If GOMP_PLUGIN_target_task_completion has run already + in between gomp_target_task_fn and the mutex lock, + perform the requeuing here. */ + if (ttask->state == GOMP_TARGET_TASK_FINISHED) + gomp_target_task_completion (team, child_task); + else + ttask->state = GOMP_TARGET_TASK_RUNNING; + child_task = NULL; + continue; + } + } + else + child_task->fn (child_task->fn_data); thr->task = task; } else @@ -741,7 +1279,7 @@ gomp_barrier_handle_tasks (gomp_barrier_ size_t new_tasks = gomp_task_run_post_handle_depend (child_task, team); gomp_task_run_post_remove_parent (child_task); - gomp_clear_parent (child_task->children); + gomp_clear_parent (&child_task->children_queue); gomp_task_run_post_remove_taskgroup (child_task); to_free = child_task; child_task = NULL; @@ -765,7 +1303,9 @@ gomp_barrier_handle_tasks (gomp_barrier_ } } -/* Called when encountering a taskwait directive. */ +/* Called when encountering a taskwait directive. + + Wait for all children of the current task. */ void GOMP_taskwait (void) @@ -785,15 +1325,16 @@ GOMP_taskwait (void) child thread task work function are seen before we exit from GOMP_taskwait. 
*/ if (task == NULL - || __atomic_load_n (&task->children, MEMMODEL_ACQUIRE) == NULL) + || priority_queue_empty_p (&task->children_queue, MEMMODEL_ACQUIRE)) return; memset (&taskwait, 0, sizeof (taskwait)); + bool child_q = false; gomp_mutex_lock (&team->task_lock); while (1) { bool cancelled = false; - if (task->children == NULL) + if (priority_queue_empty_p (&task->children_queue, MEMMODEL_RELAXED)) { bool destroy_taskwait = task->taskwait != NULL; task->taskwait = NULL; @@ -807,12 +1348,14 @@ GOMP_taskwait (void) gomp_sem_destroy (&taskwait.taskwait_sem); return; } - if (task->children->kind == GOMP_TASK_WAITING) + struct gomp_task *next_task + = priority_queue_next_task (PQ_CHILDREN, &task->children_queue, + PQ_TEAM, &team->task_queue, &child_q); + if (next_task->kind == GOMP_TASK_WAITING) { - child_task = task->children; + child_task = next_task; cancelled - = gomp_task_run_pre (child_task, task, child_task->taskgroup, - team); + = gomp_task_run_pre (child_task, task, team); if (__builtin_expect (cancelled, 0)) { if (to_free) @@ -826,8 +1369,10 @@ GOMP_taskwait (void) } else { - /* All tasks we are waiting for are already running - in other threads. Wait for them. */ + /* All tasks we are waiting for are either running in other + threads, or they are tasks that have not had their + dependencies met (so they're not even in the queue). Wait + for them. */ if (task->taskwait == NULL) { taskwait.in_depend_wait = false; @@ -851,7 +1396,28 @@ GOMP_taskwait (void) if (child_task) { thr->task = child_task; - child_task->fn (child_task->fn_data); + if (__builtin_expect (child_task->fn == NULL, 0)) + { + if (gomp_target_task_fn (child_task->fn_data)) + { + thr->task = task; + gomp_mutex_lock (&team->task_lock); + child_task->kind = GOMP_TASK_ASYNC_RUNNING; + struct gomp_target_task *ttask + = (struct gomp_target_task *) child_task->fn_data; + /* If GOMP_PLUGIN_target_task_completion has run already + in between gomp_target_task_fn and the mutex lock, + perform the requeuing here. */ + if (ttask->state == GOMP_TARGET_TASK_FINISHED) + gomp_target_task_completion (team, child_task); + else + ttask->state = GOMP_TARGET_TASK_RUNNING; + child_task = NULL; + continue; + } + } + else + child_task->fn (child_task->fn_data); thr->task = task; } else @@ -862,17 +1428,19 @@ GOMP_taskwait (void) finish_cancelled:; size_t new_tasks = gomp_task_run_post_handle_depend (child_task, team); - child_task->prev_child->next_child = child_task->next_child; - child_task->next_child->prev_child = child_task->prev_child; - if (task->children == child_task) - { - if (child_task->next_child != child_task) - task->children = child_task->next_child; - else - task->children = NULL; + + if (child_q) + { + priority_queue_remove (PQ_CHILDREN, &task->children_queue, + child_task, MEMMODEL_RELAXED); + child_task->pnode[PQ_CHILDREN].next = NULL; + child_task->pnode[PQ_CHILDREN].prev = NULL; } - gomp_clear_parent (child_task->children); + + gomp_clear_parent (&child_task->children_queue); + gomp_task_run_post_remove_taskgroup (child_task); + to_free = child_task; child_task = NULL; team->task_count--; @@ -887,10 +1455,20 @@ GOMP_taskwait (void) } } -/* This is like GOMP_taskwait, but we only wait for tasks that the - upcoming task depends on. */ +/* An undeferred task is about to run. Wait for all tasks that this + undeferred task depends on. -static void + This is done by first putting all known ready dependencies + (dependencies that have their own dependencies met) at the top of + the scheduling queues. 
Then we iterate through these imminently + ready tasks (and possibly other high priority tasks), and run them. + If we run out of ready dependencies to execute, we either wait for + the remaining dependencies to finish, or wait for them to get + scheduled so we can run them. + + DEPEND is as in GOMP_task. */ + +void gomp_task_maybe_wait_for_dependencies (void **depend) { struct gomp_thread *thr = gomp_thread (); @@ -898,7 +1476,6 @@ gomp_task_maybe_wait_for_dependencies (v struct gomp_team *team = thr->ts.team; struct gomp_task_depend_entry elem, *ent = NULL; struct gomp_taskwait taskwait; - struct gomp_task *last_parent_depends_on = NULL; size_t ndepend = (uintptr_t) depend[0]; size_t nout = (uintptr_t) depend[1]; size_t i; @@ -922,32 +1499,11 @@ gomp_task_maybe_wait_for_dependencies (v { tsk->parent_depends_on = true; ++num_awaited; + /* If dependency TSK itself has no dependencies and is + ready to run, move it up front so that we run it as + soon as possible. */ if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING) - { - /* If a task we need to wait for is not already - running and is ready to be scheduled, move it - to front, so that we run it as soon as possible. */ - if (last_parent_depends_on) - { - tsk->prev_child->next_child = tsk->next_child; - tsk->next_child->prev_child = tsk->prev_child; - tsk->prev_child = last_parent_depends_on; - tsk->next_child = last_parent_depends_on->next_child; - tsk->prev_child->next_child = tsk; - tsk->next_child->prev_child = tsk; - } - else if (tsk != task->children) - { - tsk->prev_child->next_child = tsk->next_child; - tsk->next_child->prev_child = tsk->prev_child; - tsk->prev_child = task->children; - tsk->next_child = task->children->next_child; - task->children = tsk; - tsk->prev_child->next_child = tsk; - tsk->next_child->prev_child = tsk; - } - last_parent_depends_on = tsk; - } + priority_queue_upgrade_task (tsk, task); } } } @@ -959,7 +1515,6 @@ gomp_task_maybe_wait_for_dependencies (v memset (&taskwait, 0, sizeof (taskwait)); taskwait.n_depend = num_awaited; - taskwait.last_parent_depends_on = last_parent_depends_on; gomp_sem_init (&taskwait.taskwait_sem, 0); task->taskwait = &taskwait; @@ -978,12 +1533,30 @@ gomp_task_maybe_wait_for_dependencies (v gomp_sem_destroy (&taskwait.taskwait_sem); return; } - if (task->children->kind == GOMP_TASK_WAITING) + + /* Theoretically when we have multiple priorities, we should + choose between the highest priority item in + task->children_queue and team->task_queue here, so we should + use priority_queue_next_task(). However, since we are + running an undeferred task, perhaps that makes all tasks it + depends on undeferred, thus a priority of INF? This would + make it unnecessary to take anything into account here, + but the dependencies. + + On the other hand, if we want to use priority_queue_next_task(), + care should be taken to only use priority_queue_remove() + below if the task was actually removed from the children + queue.
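For reference, the DEPEND vector decoded at the top of this function uses the encoding GOMP_task receives: element 0 holds the number of depend entries, element 1 the number of out/inout entries, and the addresses follow with the out/inout ones first. A hand-built illustration of just the layout (hypothetical variables):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  int x, y, z;
  /* Encoding of depend(out: x) depend(inout: y) depend(in: z).  */
  void *depend[] = {
    (void *) (uintptr_t) 3,	/* Total number of depend entries.  */
    (void *) (uintptr_t) 2,	/* out/inout entries, listed first.  */
    &x, &y,			/* out/inout addresses.  */
    &z				/* in addresses follow.  */
  };
  size_t ndepend = (uintptr_t) depend[0];
  size_t nout = (uintptr_t) depend[1];
  printf ("ndepend=%zu nout=%zu in[0]=%p\n", ndepend, nout,
	  depend[2 + nout]);
  return 0;
}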
*/ + bool ignored; + struct gomp_task *next_task + = priority_queue_next_task (PQ_CHILDREN, &task->children_queue, + PQ_IGNORED, NULL, &ignored); + + if (next_task->kind == GOMP_TASK_WAITING) { - child_task = task->children; + child_task = next_task; cancelled - = gomp_task_run_pre (child_task, task, child_task->taskgroup, - team); + = gomp_task_run_pre (child_task, task, team); if (__builtin_expect (cancelled, 0)) { if (to_free) @@ -996,8 +1569,10 @@ gomp_task_maybe_wait_for_dependencies (v } } else - /* All tasks we are waiting for are already running - in other threads. Wait for them. */ + /* All tasks we are waiting for are either running in other + threads, or they are tasks that have not had their + dependencies met (so they're not even in the queue). Wait + for them. */ taskwait.in_depend_wait = true; gomp_mutex_unlock (&team->task_lock); if (do_wake) @@ -1014,7 +1589,28 @@ gomp_task_maybe_wait_for_dependencies (v if (child_task) { thr->task = child_task; - child_task->fn (child_task->fn_data); + if (__builtin_expect (child_task->fn == NULL, 0)) + { + if (gomp_target_task_fn (child_task->fn_data)) + { + thr->task = task; + gomp_mutex_lock (&team->task_lock); + child_task->kind = GOMP_TASK_ASYNC_RUNNING; + struct gomp_target_task *ttask + = (struct gomp_target_task *) child_task->fn_data; + /* If GOMP_PLUGIN_target_task_completion has run already + in between gomp_target_task_fn and the mutex lock, + perform the requeuing here. */ + if (ttask->state == GOMP_TARGET_TASK_FINISHED) + gomp_target_task_completion (team, child_task); + else + ttask->state = GOMP_TARGET_TASK_RUNNING; + child_task = NULL; + continue; + } + } + else + child_task->fn (child_task->fn_data); thr->task = task; } else @@ -1027,16 +1623,13 @@ gomp_task_maybe_wait_for_dependencies (v = gomp_task_run_post_handle_depend (child_task, team); if (child_task->parent_depends_on) --taskwait.n_depend; - child_task->prev_child->next_child = child_task->next_child; - child_task->next_child->prev_child = child_task->prev_child; - if (task->children == child_task) - { - if (child_task->next_child != child_task) - task->children = child_task->next_child; - else - task->children = NULL; - } - gomp_clear_parent (child_task->children); + + priority_queue_remove (PQ_CHILDREN, &task->children_queue, + child_task, MEMMODEL_RELAXED); + child_task->pnode[PQ_CHILDREN].next = NULL; + child_task->pnode[PQ_CHILDREN].prev = NULL; + + gomp_clear_parent (&child_task->children_queue); gomp_task_run_post_remove_taskgroup (child_task); to_free = child_task; child_task = NULL; @@ -1069,14 +1662,14 @@ GOMP_taskgroup_start (void) struct gomp_taskgroup *taskgroup; /* If team is NULL, all tasks are executed as - GOMP_TASK_IFFALSE tasks and thus all children tasks of + GOMP_TASK_UNDEFERRED tasks and thus all children tasks of taskgroup and their descendant tasks will be finished by the time GOMP_taskgroup_end is called. 
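For context, the taskgroup machinery reworked below serves this user-visible construct; the compiler brackets the region with GOMP_taskgroup_start ()/GOMP_taskgroup_end (), and the end call returns only after all tasks generated in the region, descendants included, have finished. Roughly (compile with -fopenmp):

#include <stdio.h>

int
main (void)
{
  int a = 0, b = 0;
#pragma omp parallel
#pragma omp single
  {
    /* The compiler brackets the taskgroup block with
       GOMP_taskgroup_start () and GOMP_taskgroup_end (); the latter
       returns only after both child tasks have finished.  */
#pragma omp taskgroup
    {
#pragma omp task shared(a)
      a = 1;
#pragma omp task shared(b)
      b = 2;
    }
    printf ("a=%d b=%d\n", a, b);	/* Always 1 and 2 here.  */
  }
  return 0;
}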
*/ if (team == NULL) return; taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup)); taskgroup->prev = task->taskgroup; - taskgroup->children = NULL; + priority_queue_init (&taskgroup->taskgroup_queue); taskgroup->in_taskgroup_wait = false; taskgroup->cancelled = false; taskgroup->num_children = 0; @@ -1098,6 +1691,17 @@ GOMP_taskgroup_end (void) if (team == NULL) return; taskgroup = task->taskgroup; + if (__builtin_expect (taskgroup == NULL, 0) + && thr->ts.level == 0) + { + /* This can happen if GOMP_taskgroup_start is called when + thr->ts.team == NULL, but inside of the taskgroup there + is #pragma omp target nowait that creates an implicit + team with a single thread. In this case, we want to wait + for all outstanding tasks in this team. */ + gomp_team_barrier_wait (&team->barrier); + return; + } /* The acquire barrier on load of taskgroup->num_children here synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup. @@ -1108,19 +1712,25 @@ GOMP_taskgroup_end (void) if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0) goto finish; + bool unused; gomp_mutex_lock (&team->task_lock); while (1) { bool cancelled = false; - if (taskgroup->children == NULL) + if (priority_queue_empty_p (&taskgroup->taskgroup_queue, + MEMMODEL_RELAXED)) { if (taskgroup->num_children) { - if (task->children == NULL) + if (priority_queue_empty_p (&task->children_queue, + MEMMODEL_RELAXED)) goto do_wait; - child_task = task->children; - } - else + child_task + = priority_queue_next_task (PQ_CHILDREN, &task->children_queue, + PQ_TEAM, &team->task_queue, + &unused); + } + else { gomp_mutex_unlock (&team->task_lock); if (to_free) @@ -1132,12 +1742,13 @@ GOMP_taskgroup_end (void) } } else - child_task = taskgroup->children; + child_task + = priority_queue_next_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + PQ_TEAM, &team->task_queue, &unused); if (child_task->kind == GOMP_TASK_WAITING) { cancelled - = gomp_task_run_pre (child_task, child_task->parent, taskgroup, - team); + = gomp_task_run_pre (child_task, child_task->parent, team); if (__builtin_expect (cancelled, 0)) { if (to_free) @@ -1153,8 +1764,10 @@ GOMP_taskgroup_end (void) { child_task = NULL; do_wait: - /* All tasks we are waiting for are already running - in other threads. Wait for them. */ + /* All tasks we are waiting for are either running in other + threads, or they are tasks that have not had their + dependencies met (so they're not even in the queue). Wait + for them. */ taskgroup->in_taskgroup_wait = true; } gomp_mutex_unlock (&team->task_lock); @@ -1172,7 +1785,28 @@ GOMP_taskgroup_end (void) if (child_task) { thr->task = child_task; - child_task->fn (child_task->fn_data); + if (__builtin_expect (child_task->fn == NULL, 0)) + { + if (gomp_target_task_fn (child_task->fn_data)) + { + thr->task = task; + gomp_mutex_lock (&team->task_lock); + child_task->kind = GOMP_TASK_ASYNC_RUNNING; + struct gomp_target_task *ttask + = (struct gomp_target_task *) child_task->fn_data; + /* If GOMP_PLUGIN_target_task_completion has run already + in between gomp_target_task_fn and the mutex lock, + perform the requeuing here. 
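The MEMMODEL_ACQUIRE load of num_children in the fast path pairs with the MEMMODEL_RELEASE store of 0 done when the last child finishes, so the parent observes the children's writes before skipping the lock. The same pairing in a trivial standalone form (single-threaded here; the barriers are what matter across threads):

#include <stdio.h>

static int payload;
static unsigned num_children = 1;

static void
last_child_finishes (void)
{
  payload = 42;		/* Work done by the child.  */
  /* Release store: everything above happens-before a matching
     acquire load that observes the 0.  */
  __atomic_store_n (&num_children, 0, __ATOMIC_RELEASE);
}

int
main (void)
{
  last_child_finishes ();
  /* Acquire load, as in the fast path above: seeing 0 guarantees the
     child's writes (PAYLOAD) are visible.  */
  if (__atomic_load_n (&num_children, __ATOMIC_ACQUIRE) == 0)
    printf ("payload = %d\n", payload);
  return 0;
}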
*/ + if (ttask->state == GOMP_TARGET_TASK_FINISHED) + gomp_target_task_completion (team, child_task); + else + ttask->state = GOMP_TARGET_TASK_RUNNING; + child_task = NULL; + continue; + } + } + else + child_task->fn (child_task->fn_data); thr->task = task; } else @@ -1184,7 +1818,7 @@ GOMP_taskgroup_end (void) size_t new_tasks = gomp_task_run_post_handle_depend (child_task, team); gomp_task_run_post_remove_parent (child_task); - gomp_clear_parent (child_task->children); + gomp_clear_parent (&child_task->children_queue); gomp_task_run_post_remove_taskgroup (child_task); to_free = child_task; child_task = NULL; --- libgomp/libgomp_g.h.jj 2014-05-15 10:56:31.429532978 +0200 +++ libgomp/libgomp_g.h 2016-07-13 16:57:04.422535521 +0200 @@ -29,6 +29,7 @@ #define LIBGOMP_G_H 1 #include +#include /* barrier.c */ @@ -50,6 +51,10 @@ extern bool GOMP_loop_static_start (long extern bool GOMP_loop_dynamic_start (long, long, long, long, long *, long *); extern bool GOMP_loop_guided_start (long, long, long, long, long *, long *); extern bool GOMP_loop_runtime_start (long, long, long, long *, long *); +extern bool GOMP_loop_nonmonotonic_dynamic_start (long, long, long, long, + long *, long *); +extern bool GOMP_loop_nonmonotonic_guided_start (long, long, long, long, + long *, long *); extern bool GOMP_loop_ordered_static_start (long, long, long, long, long *, long *); @@ -63,12 +68,23 @@ extern bool GOMP_loop_static_next (long extern bool GOMP_loop_dynamic_next (long *, long *); extern bool GOMP_loop_guided_next (long *, long *); extern bool GOMP_loop_runtime_next (long *, long *); +extern bool GOMP_loop_nonmonotonic_dynamic_next (long *, long *); +extern bool GOMP_loop_nonmonotonic_guided_next (long *, long *); extern bool GOMP_loop_ordered_static_next (long *, long *); extern bool GOMP_loop_ordered_dynamic_next (long *, long *); extern bool GOMP_loop_ordered_guided_next (long *, long *); extern bool GOMP_loop_ordered_runtime_next (long *, long *); +extern bool GOMP_loop_doacross_static_start (unsigned, long *, long, long *, + long *); +extern bool GOMP_loop_doacross_dynamic_start (unsigned, long *, long, long *, + long *); +extern bool GOMP_loop_doacross_guided_start (unsigned, long *, long, long *, + long *); +extern bool GOMP_loop_doacross_runtime_start (unsigned, long *, long *, + long *); + extern void GOMP_parallel_loop_static_start (void (*)(void *), void *, unsigned, long, long, long, long); extern void GOMP_parallel_loop_dynamic_start (void (*)(void *), void *, @@ -89,6 +105,12 @@ extern void GOMP_parallel_loop_guided (v extern void GOMP_parallel_loop_runtime (void (*)(void *), void *, unsigned, long, long, long, unsigned); +extern void GOMP_parallel_loop_nonmonotonic_dynamic (void (*)(void *), void *, + unsigned, long, long, + long, long, unsigned); +extern void GOMP_parallel_loop_nonmonotonic_guided (void (*)(void *), void *, + unsigned, long, long, + long, long, unsigned); extern void GOMP_loop_end (void); extern void GOMP_loop_end_nowait (void); @@ -119,6 +141,18 @@ extern bool GOMP_loop_ull_runtime_start unsigned long long, unsigned long long *, unsigned long long *); +extern bool GOMP_loop_ull_nonmonotonic_dynamic_start (bool, unsigned long long, + unsigned long long, + unsigned long long, + unsigned long long, + unsigned long long *, + unsigned long long *); +extern bool GOMP_loop_ull_nonmonotonic_guided_start (bool, unsigned long long, + unsigned long long, + unsigned long long, + unsigned long long, + unsigned long long *, + unsigned long long *); extern bool 
GOMP_loop_ull_ordered_static_start (bool, unsigned long long, unsigned long long, @@ -152,6 +186,10 @@ extern bool GOMP_loop_ull_guided_next (u unsigned long long *); extern bool GOMP_loop_ull_runtime_next (unsigned long long *, unsigned long long *); +extern bool GOMP_loop_ull_nonmonotonic_dynamic_next (unsigned long long *, + unsigned long long *); +extern bool GOMP_loop_ull_nonmonotonic_guided_next (unsigned long long *, + unsigned long long *); extern bool GOMP_loop_ull_ordered_static_next (unsigned long long *, unsigned long long *); @@ -162,10 +200,34 @@ extern bool GOMP_loop_ull_ordered_guided extern bool GOMP_loop_ull_ordered_runtime_next (unsigned long long *, unsigned long long *); +extern bool GOMP_loop_ull_doacross_static_start (unsigned, + unsigned long long *, + unsigned long long, + unsigned long long *, + unsigned long long *); +extern bool GOMP_loop_ull_doacross_dynamic_start (unsigned, + unsigned long long *, + unsigned long long, + unsigned long long *, + unsigned long long *); +extern bool GOMP_loop_ull_doacross_guided_start (unsigned, + unsigned long long *, + unsigned long long, + unsigned long long *, + unsigned long long *); +extern bool GOMP_loop_ull_doacross_runtime_start (unsigned, + unsigned long long *, + unsigned long long *, + unsigned long long *); + /* ordered.c */ extern void GOMP_ordered_start (void); extern void GOMP_ordered_end (void); +extern void GOMP_doacross_post (long *); +extern void GOMP_doacross_wait (long, ...); +extern void GOMP_doacross_ull_post (unsigned long long *); +extern void GOMP_doacross_ull_wait (unsigned long long, ...); /* parallel.c */ @@ -178,7 +240,15 @@ extern bool GOMP_cancellation_point (int /* task.c */ extern void GOMP_task (void (*) (void *), void *, void (*) (void *, void *), - long, long, bool, unsigned, void **); + long, long, bool, unsigned, void **, int); +extern void GOMP_taskloop (void (*) (void *), void *, + void (*) (void *, void *), long, long, unsigned, + unsigned long, int, long, long, long); +extern void GOMP_taskloop_ull (void (*) (void *), void *, + void (*) (void *, void *), long, long, + unsigned, unsigned long, int, + unsigned long long, unsigned long long, + unsigned long long); extern void GOMP_taskwait (void); extern void GOMP_taskyield (void); extern void GOMP_taskgroup_start (void); @@ -206,11 +276,38 @@ extern void GOMP_single_copy_end (void * extern void GOMP_target (int, void (*) (void *), const void *, size_t, void **, size_t *, unsigned char *); +extern void GOMP_target_ext (int, void (*) (void *), size_t, void **, size_t *, + unsigned short *, unsigned int, void **, void **); extern void GOMP_target_data (int, const void *, size_t, void **, size_t *, unsigned char *); +extern void GOMP_target_data_ext (int, size_t, void **, size_t *, + unsigned short *); extern void GOMP_target_end_data (void); extern void GOMP_target_update (int, const void *, size_t, void **, size_t *, unsigned char *); +extern void GOMP_target_update_ext (int, size_t, void **, size_t *, + unsigned short *, unsigned int, void **); +extern void GOMP_target_enter_exit_data (int, size_t, void **, size_t *, + unsigned short *, unsigned int, + void **); extern void GOMP_teams (unsigned int, unsigned int); +/* oacc-parallel.c */ + +extern void GOACC_parallel_keyed (int, void (*) (void *), size_t, + void **, size_t *, unsigned short *, ...); +extern void GOACC_parallel (int, void (*) (void *), size_t, void **, size_t *, + unsigned short *, int, int, int, int, int, ...); +extern void GOACC_data_start (int, size_t, void **, size_t *, 
+ unsigned short *); +extern void GOACC_data_end (void); +extern void GOACC_enter_exit_data (int, size_t, void **, + size_t *, unsigned short *, int, int, ...); +extern void GOACC_update (int, size_t, void **, size_t *, + unsigned short *, int, int, ...); +extern void GOACC_wait (int, int, ...); +extern int GOACC_get_num_threads (void); +extern int GOACC_get_thread_num (void); +extern void GOACC_declare (int, size_t, void **, size_t *, unsigned short *); + #endif /* LIBGOMP_G_H */ --- libgomp/libgomp.h.jj 2014-08-01 15:59:49.145188127 +0200 +++ libgomp/libgomp.h 2016-07-14 17:40:24.038243456 +0200 @@ -34,12 +34,35 @@ #ifndef LIBGOMP_H #define LIBGOMP_H 1 +#ifndef _LIBGOMP_CHECKING_ +/* Define to 1 to perform internal sanity checks. */ +#define _LIBGOMP_CHECKING_ 0 +#endif + #include "config.h" #include "gstdint.h" +#include "libgomp-plugin.h" #include <pthread.h> #include <stdbool.h> #include <stdlib.h> +#include <stdarg.h> + +/* Needed for memset in priority_queue.c. */ +#if _LIBGOMP_CHECKING_ +# ifdef STRING_WITH_STRINGS +# include <string.h> +# include <strings.h> +# else +# ifdef HAVE_STRING_H +# include <string.h> +# else +# ifdef HAVE_STRINGS_H +# include <strings.h> +# endif +# endif +# endif +#endif #ifdef HAVE_ATTRIBUTE_VISIBILITY # pragma GCC visibility push(hidden) @@ -56,6 +79,44 @@ enum memmodel MEMMODEL_SEQ_CST = 5 }; +/* alloc.c */ + +extern void *gomp_malloc (size_t) __attribute__((malloc)); +extern void *gomp_malloc_cleared (size_t) __attribute__((malloc)); +extern void *gomp_realloc (void *, size_t); + +/* Avoid conflicting prototypes of alloca() in system headers by using + GCC's builtin alloca(). */ +#define gomp_alloca(x) __builtin_alloca(x) + +/* error.c */ + +extern void gomp_vdebug (int, const char *, va_list); +extern void gomp_debug (int, const char *, ...) + __attribute__ ((format (printf, 2, 3))); +#define gomp_vdebug(KIND, FMT, VALIST) \ + do { \ + if (__builtin_expect (gomp_debug_var, 0)) \ + (gomp_vdebug) ((KIND), (FMT), (VALIST)); \ + } while (0) +#define gomp_debug(KIND, ...) \ + do { \ + if (__builtin_expect (gomp_debug_var, 0)) \ + (gomp_debug) ((KIND), __VA_ARGS__); \ + } while (0) +extern void gomp_verror (const char *, va_list); +extern void gomp_error (const char *, ...) + __attribute__ ((format (printf, 1, 2))); +extern void gomp_vfatal (const char *, va_list) + __attribute__ ((noreturn)); +extern void gomp_fatal (const char *, ...) + __attribute__ ((noreturn, format (printf, 1, 2))); + +struct gomp_task; +struct gomp_taskgroup; +struct htab; + +#include "priority_queue.h" #include "sem.h" #include "mutex.h" #include "bar.h" @@ -74,6 +135,44 @@ enum gomp_schedule_type GFS_AUTO }; +struct gomp_doacross_work_share +{ + union { + /* chunk_size copy, as ws->chunk_size is multiplied by incr for + GFS_DYNAMIC. */ + long chunk_size; + /* Likewise, but for ull implementation. */ + unsigned long long chunk_size_ull; + /* For schedule(static,0) this is the number + of iterations assigned to the last thread, i.e. number of + iterations / number of threads. */ + long q; + /* Likewise, but for ull implementation. */ + unsigned long long q_ull; + }; + /* Size of each array entry (padded to cache line size). */ + unsigned long elt_sz; + /* Number of dimensions in sink vectors. */ + unsigned int ncounts; + /* True if the iterations can be flattened. */ + bool flattened; + /* Actual array (of elt_sz sized units), aligned to cache line size. + This is indexed by team_id for GFS_STATIC and outermost iteration + / chunk_size for other schedules. */ + unsigned char *array; + /* These two are only used for schedule(static,0). 
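The gomp_debug/gomp_vdebug pair above deliberately shadows the functions with same-named macros, so when gomp_debug_var is unset neither the arguments nor the call get evaluated; parenthesizing the name defeats macro expansion so the macro body can still reach the real function. The trick in isolation:

#include <stdio.h>

static int debug_enabled;	/* Stand-in for gomp_debug_var.  */

static void
debug_print (const char *fmt, int v)
{
  fprintf (stderr, fmt, v);
}

/* Shadow the function: unless enabled, neither the arguments nor the
   call are evaluated.  (debug_print) defeats macro expansion so the
   body can reach the real function without recursing.  */
#define debug_print(FMT, V) \
  do { \
    if (__builtin_expect (debug_enabled, 0)) \
      (debug_print) ((FMT), (V)); \
  } while (0)

int
main (void)
{
  debug_print ("dropped: %d\n", 1);	/* Disabled: nothing happens.  */
  debug_enabled = 1;
  debug_print ("printed: %d\n", 2);
  return 0;
}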
*/ + /* This one is number of iterations % number of threads. */ + long t; + union { + /* And this one is cached t * (q + 1). */ + long boundary; + /* Likewise, but for the ull implementation. */ + unsigned long long boundary_ull; + }; + /* Array of shift counts for each dimension if they can be flattened. */ + unsigned int shift_counts[]; +}; + struct gomp_work_share { /* This member records the SCHEDULE clause to be used for this construct. @@ -105,13 +204,18 @@ struct gomp_work_share }; }; - /* This is a circular queue that details which threads will be allowed - into the ordered region and in which order. When a thread allocates - iterations on which it is going to work, it also registers itself at - the end of the array. When a thread reaches the ordered region, it - checks to see if it is the one at the head of the queue. If not, it - blocks on its RELEASE semaphore. */ - unsigned *ordered_team_ids; + union { + /* This is a circular queue that details which threads will be allowed + into the ordered region and in which order. When a thread allocates + iterations on which it is going to work, it also registers itself at + the end of the array. When a thread reaches the ordered region, it + checks to see if it is the one at the head of the queue. If not, it + blocks on its RELEASE semaphore. */ + unsigned *ordered_team_ids; + + /* This is a pointer to DOACROSS work share data. */ + struct gomp_doacross_work_share *doacross; + }; /* This is the number of threads that have registered themselves in the circular queue ordered_team_ids. */ @@ -230,7 +334,7 @@ struct gomp_task_icv { unsigned long nthreads_var; enum gomp_schedule_type run_sched_var; - int run_sched_modifier; + int run_sched_chunk_size; int default_device_var; unsigned int thread_limit_var; bool dyn_var; @@ -246,6 +350,7 @@ extern gomp_mutex_t gomp_managed_threads #endif extern unsigned long gomp_max_active_levels_var; extern bool gomp_cancel_var; +extern int gomp_max_task_priority_var; extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var; extern unsigned long gomp_available_cpus, gomp_managed_threads; extern unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len; @@ -253,25 +358,36 @@ extern char *gomp_bind_var_list; extern unsigned long gomp_bind_var_list_len; extern void **gomp_places_list; extern unsigned long gomp_places_list_len; +extern int gomp_debug_var; +extern int goacc_device_num; +extern char *goacc_device_type; enum gomp_task_kind { + /* Implicit task. */ GOMP_TASK_IMPLICIT, - GOMP_TASK_IFFALSE, + /* Undeferred task. */ + GOMP_TASK_UNDEFERRED, + /* Task created by GOMP_task and waiting to be run. */ GOMP_TASK_WAITING, - GOMP_TASK_TIED + /* Task currently executing or scheduled and about to execute. */ + GOMP_TASK_TIED, + /* Used for target tasks that have vars mapped and async run started, + but not yet completed. Once that completes, they will be readded + into the queues as GOMP_TASK_WAITING in order to perform the var + unmapping. */ + GOMP_TASK_ASYNC_RUNNING }; -struct gomp_task; -struct gomp_taskgroup; -struct htab; - struct gomp_task_depend_entry { + /* Address of dependency. */ void *addr; struct gomp_task_depend_entry *next; struct gomp_task_depend_entry *prev; + /* Task that provides the dependency in ADDR. */ struct gomp_task *task; + /* Depend entry is of type "IN". */ bool is_in; bool redundant; bool redundant_out; @@ -290,8 +406,8 @@ struct gomp_taskwait { bool in_taskwait; bool in_depend_wait; + /* Number of tasks we are waiting for. 
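This structure backs the new OpenMP 4.5 doacross loops: posted iteration vectors land in ARRAY and waiters consult the slot a sink vector maps to. At the user level the construct looks as follows, with the sink and source points lowered to the GOMP_doacross_wait and GOMP_doacross_post entry points declared earlier (compile with -fopenmp):

#include <stdio.h>

#define N 8

int
main (void)
{
  static int a[N][N];
  int i, j;
#pragma omp parallel for ordered(2)
  for (i = 1; i < N; i++)
    for (j = 1; j < N; j++)
      {
	/* Lowered to GOMP_doacross_wait, one sink vector per clause.  */
#pragma omp ordered depend(sink: i - 1, j) depend(sink: i, j - 1)
	a[i][j] = a[i - 1][j] + a[i][j - 1] + 1;
	/* Lowered to GOMP_doacross_post.  */
#pragma omp ordered depend(source)
      }
  printf ("a[%d][%d] = %d\n", N - 1, N - 1, a[N - 1][N - 1]);
  return 0;
}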
*/ size_t n_depend; - struct gomp_task *last_parent_depends_on; gomp_sem_t taskwait_sem; }; @@ -299,20 +415,31 @@ struct gomp_taskwait struct gomp_task { + /* Parent of this task. */ struct gomp_task *parent; - struct gomp_task *children; - struct gomp_task *next_child; - struct gomp_task *prev_child; - struct gomp_task *next_queue; - struct gomp_task *prev_queue; - struct gomp_task *next_taskgroup; - struct gomp_task *prev_taskgroup; + /* Children of this task. */ + struct priority_queue children_queue; + /* Taskgroup this task belongs in. */ struct gomp_taskgroup *taskgroup; + /* Tasks that depend on this task. */ struct gomp_dependers_vec *dependers; struct htab *depend_hash; struct gomp_taskwait *taskwait; + /* Number of items in DEPEND. */ size_t depend_count; + /* Number of tasks this task depends on. Once this counter reaches + 0, we have no unsatisfied dependencies, and this task can be put + into the various queues to be scheduled. */ size_t num_dependees; + + /* Priority of this task. */ + int priority; + /* The priority node for this task in each of the different queues. + We put this here to avoid allocating space for each priority + node. Then we play offsetof() games to convert between pnode[] + entries and the gomp_task in which they reside. */ + struct priority_node pnode[3]; + struct gomp_task_icv icv; void (*fn) (void *); void *fn_data; @@ -320,20 +447,58 @@ struct gomp_task bool in_tied_task; bool final_task; bool copy_ctors_done; + /* Set for undeferred tasks with unsatisfied dependencies which + block further execution of their parent until the dependencies + are satisfied. */ bool parent_depends_on; + /* Dependencies provided and/or needed for this task. DEPEND_COUNT + is the number of items available. */ struct gomp_task_depend_entry depend[]; }; +/* This structure describes a single #pragma omp taskgroup. */ + struct gomp_taskgroup { struct gomp_taskgroup *prev; - struct gomp_task *children; + /* Queue of tasks that belong in this taskgroup. */ + struct priority_queue taskgroup_queue; bool in_taskgroup_wait; bool cancelled; gomp_sem_t taskgroup_sem; size_t num_children; }; +/* Various state of OpenMP async offloading tasks. */ +enum gomp_target_task_state +{ + GOMP_TARGET_TASK_DATA, + GOMP_TARGET_TASK_BEFORE_MAP, + GOMP_TARGET_TASK_FALLBACK, + GOMP_TARGET_TASK_READY_TO_RUN, + GOMP_TARGET_TASK_RUNNING, + GOMP_TARGET_TASK_FINISHED +}; + +/* This structure describes a target task. */ + +struct gomp_target_task +{ + struct gomp_device_descr *devicep; + void (*fn) (void *); + size_t mapnum; + size_t *sizes; + unsigned short *kinds; + unsigned int flags; + enum gomp_target_task_state state; + struct target_mem_desc *tgt; + struct gomp_task *task; + struct gomp_team *team; + /* Device-specific target arguments. */ + void **args; + void *hostaddrs[]; +}; + /* This structure describes a "team" of threads. These are the threads that are spawned by a PARALLEL constructs, as well as the work sharing constructs that the team encounters. */ @@ -396,7 +561,8 @@ struct gomp_team struct gomp_work_share work_shares[8]; gomp_mutex_t task_lock; - struct gomp_task *task_queue; + /* Scheduled tasks. */ + struct priority_queue task_queue; /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team. */ unsigned int task_count; /* Number of GOMP_TASK_WAITING tasks currently waiting to be scheduled. 
*/ @@ -451,6 +617,9 @@ struct gomp_thread_pool struct gomp_thread **threads; unsigned threads_size; unsigned threads_used; + /* The last team is used for non-nested teams to delay their destruction to + make sure all the threads in the team move on to the pool's barrier before + the team's barrier is destroyed. */ struct gomp_team *last_team; /* Number of threads running in this contention group. */ unsigned long threads_busy; @@ -519,23 +688,7 @@ extern bool gomp_affinity_same_place (vo extern bool gomp_affinity_finalize_place_list (bool); extern bool gomp_affinity_init_level (int, unsigned long, bool); extern void gomp_affinity_print_place (void *); - -/* alloc.c */ - -extern void *gomp_malloc (size_t) __attribute__((malloc)); -extern void *gomp_malloc_cleared (size_t) __attribute__((malloc)); -extern void *gomp_realloc (void *, size_t); - -/* Avoid conflicting prototypes of alloca() in system headers by using - GCC's builtin alloca(). */ -#define gomp_alloca(x) __builtin_alloca(x) - -/* error.c */ - -extern void gomp_error (const char *, ...) - __attribute__((format (printf, 1, 2))); -extern void gomp_fatal (const char *, ...) - __attribute__((noreturn, format (printf, 1, 2))); +extern void gomp_get_place_proc_ids_8 (int, int64_t *); /* iter.c */ @@ -572,6 +725,9 @@ extern void gomp_ordered_next (void); extern void gomp_ordered_static_init (void); extern void gomp_ordered_static_next (void); extern void gomp_ordered_sync (void); +extern void gomp_doacross_init (unsigned, long *, long); +extern void gomp_doacross_ull_init (unsigned, unsigned long long *, + unsigned long long); /* parallel.c */ @@ -588,6 +744,12 @@ extern void gomp_init_task (struct gomp_ struct gomp_task_icv *); extern void gomp_end_task (void); extern void gomp_barrier_handle_tasks (gomp_barrier_state_t); +extern void gomp_task_maybe_wait_for_dependencies (void **); +extern bool gomp_create_target_task (struct gomp_device_descr *, + void (*) (void *), size_t, void **, + size_t *, unsigned short *, unsigned int, + void **, void **, + enum gomp_target_task_state); static void inline gomp_finish_task (struct gomp_task *task) @@ -606,7 +768,213 @@ extern void gomp_free_thread (void *); /* target.c */ +extern void gomp_init_targets_once (void); extern int gomp_get_num_devices (void); +extern bool gomp_target_task_fn (void *); + +/* Splay tree definitions. */ +typedef struct splay_tree_node_s *splay_tree_node; +typedef struct splay_tree_s *splay_tree; +typedef struct splay_tree_key_s *splay_tree_key; + +struct target_var_desc { + /* Splay key. */ + splay_tree_key key; + /* True if data should be copied from device to host at the end. */ + bool copy_from; + /* True if data always should be copied from device to host at the end. */ + bool always_copy_from; + /* Relative offset against key host_start. */ + uintptr_t offset; + /* Actual length. */ + uintptr_t length; +}; + +struct target_mem_desc { + /* Reference count. */ + uintptr_t refcount; + /* All the splay nodes allocated together. */ + splay_tree_node array; + /* Start of the target region. */ + uintptr_t tgt_start; + /* End of the target region. */ + uintptr_t tgt_end; + /* Handle to free. */ + void *to_free; + /* Previous target_mem_desc. */ + struct target_mem_desc *prev; + /* Number of items in following list. */ + size_t list_count; + + /* Corresponding target device descriptor. */ + struct gomp_device_descr *device_descr; + + /* List of target items to remove (or decrease refcount) + at the end of region. 
*/ + struct target_var_desc list[]; +}; + +/* Special value for refcount - infinity. */ +#define REFCOUNT_INFINITY (~(uintptr_t) 0) +/* Special value for refcount - tgt_offset contains target address of the + artificial pointer to "omp declare target link" object. */ +#define REFCOUNT_LINK (~(uintptr_t) 1) + +struct splay_tree_key_s { + /* Address of the host object. */ + uintptr_t host_start; + /* Address immediately after the host object. */ + uintptr_t host_end; + /* Descriptor of the target memory. */ + struct target_mem_desc *tgt; + /* Offset from tgt->tgt_start to the start of the target object. */ + uintptr_t tgt_offset; + /* Reference count. */ + uintptr_t refcount; + /* Pointer to the original mapping of "omp declare target link" object. */ + splay_tree_key link_key; +}; + +/* The comparison function. */ + +static inline int +splay_compare (splay_tree_key x, splay_tree_key y) +{ + if (x->host_start == x->host_end + && y->host_start == y->host_end) + return 0; + if (x->host_end <= y->host_start) + return -1; + if (x->host_start >= y->host_end) + return 1; + return 0; +} + +#include "splay-tree.h" + +typedef struct acc_dispatch_t +{ + /* This is a linked list of data mapped using the + acc_map_data/acc_unmap_data or "acc enter data"/"acc exit data" pragmas. + Unlike mapped_data in the goacc_thread struct, unmapping can + happen out-of-order with respect to mapping. */ + /* This is guarded by the lock in the "outer" struct gomp_device_descr. */ + struct target_mem_desc *data_environ; + + /* Execute. */ + void (*exec_func) (void (*) (void *), size_t, void **, void **, int, + unsigned *, void *); + + /* Async cleanup callback registration. */ + void (*register_async_cleanup_func) (void *, int); + + /* Asynchronous routines. */ + int (*async_test_func) (int); + int (*async_test_all_func) (void); + void (*async_wait_func) (int); + void (*async_wait_async_func) (int, int); + void (*async_wait_all_func) (void); + void (*async_wait_all_async_func) (int); + void (*async_set_async_func) (int); + + /* Create/destroy TLS data. */ + void *(*create_thread_data_func) (int); + void (*destroy_thread_data_func) (void *); + + /* NVIDIA target specific routines. */ + struct { + void *(*get_current_device_func) (void); + void *(*get_current_context_func) (void); + void *(*get_stream_func) (int); + int (*set_stream_func) (int, void *); + } cuda; +} acc_dispatch_t; + +/* Various state of the accelerator device. */ +enum gomp_device_state +{ + GOMP_DEVICE_UNINITIALIZED, + GOMP_DEVICE_INITIALIZED, + GOMP_DEVICE_FINALIZED +}; + +/* This structure describes accelerator device. + It contains name of the corresponding libgomp plugin, function handlers for + interaction with the device, ID-number of the device, and information about + mapped memory. */ +struct gomp_device_descr +{ + /* Immutable data, which is only set during initialization, and which is not + guarded by the lock. */ + + /* The name of the device. */ + const char *name; + + /* Capabilities of device (supports OpenACC, OpenMP). */ + unsigned int capabilities; + + /* This is the ID number of device among devices of the same type. */ + int target_id; + + /* This is the TYPE of device. */ + enum offload_target_type type; + + /* Function handlers. 
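Note the ordering rule splay_compare implements: disjoint [host_start, host_end) ranges order normally, any overlap compares equal, and a zero-length key inside an existing mapping therefore also compares equal, which is what makes plain address lookups find their enclosing region. A standalone check of the same rule:

#include <stdio.h>
#include <stdint.h>

struct key { uintptr_t host_start, host_end; };

/* Same ordering rule as splay_compare above.  */
static int
compare (const struct key *x, const struct key *y)
{
  if (x->host_start == x->host_end && y->host_start == y->host_end)
    return 0;
  if (x->host_end <= y->host_start)
    return -1;
  if (x->host_start >= y->host_end)
    return 1;
  return 0;	/* Any overlap counts as a match.  */
}

int
main (void)
{
  struct key mapping = { 0x1000, 0x2000 };	/* An existing mapping.  */
  struct key probe = { 0x1800, 0x1800 };	/* Zero-length lookup key.  */
  struct key below = { 0x0800, 0x0900 };
  printf ("probe vs mapping: %d\n", compare (&probe, &mapping));	/* 0 */
  printf ("below vs mapping: %d\n", compare (&below, &mapping));	/* -1 */
  return 0;
}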
*/ + const char *(*get_name_func) (void); + unsigned int (*get_caps_func) (void); + int (*get_type_func) (void); + int (*get_num_devices_func) (void); + bool (*init_device_func) (int); + bool (*fini_device_func) (int); + unsigned (*version_func) (void); + int (*load_image_func) (int, unsigned, const void *, struct addr_pair **); + bool (*unload_image_func) (int, unsigned, const void *); + void *(*alloc_func) (int, size_t); + bool (*free_func) (int, void *); + bool (*dev2host_func) (int, void *, const void *, size_t); + bool (*host2dev_func) (int, void *, const void *, size_t); + bool (*dev2dev_func) (int, void *, const void *, size_t); + bool (*can_run_func) (void *); + void (*run_func) (int, void *, void *, void **); + void (*async_run_func) (int, void *, void *, void **, void *); + + /* Splay tree containing information about mapped memory regions. */ + struct splay_tree_s mem_map; + + /* Mutex for the mutable data. */ + gomp_mutex_t lock; + + /* Current state of the device. OpenACC allows to move from INITIALIZED state + back to UNINITIALIZED state. OpenMP allows only to move from INITIALIZED + to FINALIZED state (at program shutdown). */ + enum gomp_device_state state; + + /* OpenACC-specific data and functions. */ + /* This is mutable because of its mutable data_environ and target_data + members. */ + acc_dispatch_t openacc; +}; + +/* Kind of the pragma, for which gomp_map_vars () is called. */ +enum gomp_map_vars_kind +{ + GOMP_MAP_VARS_OPENACC, + GOMP_MAP_VARS_TARGET, + GOMP_MAP_VARS_DATA, + GOMP_MAP_VARS_ENTER_DATA +}; + +extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *); +extern void gomp_acc_remove_pointer (void *, bool, int, int); + +extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *, + size_t, void **, void **, + size_t *, void *, bool, + enum gomp_map_vars_kind); +extern void gomp_unmap_vars (struct target_mem_desc *, bool); +extern void gomp_init_device (struct gomp_device_descr *); +extern void gomp_free_memmap (struct splay_tree_s *); +extern void gomp_unload_device (struct gomp_device_descr *); /* work.c */ @@ -646,8 +1014,28 @@ typedef enum omp_proc_bind_t omp_proc_bind_spread = 4 } omp_proc_bind_t; +typedef enum omp_lock_hint_t +{ + omp_lock_hint_none = 0, + omp_lock_hint_uncontended = 1, + omp_lock_hint_contended = 2, + omp_lock_hint_nonspeculative = 4, + omp_lock_hint_speculative = 8, +} omp_lock_hint_t; + +extern void omp_init_lock_with_hint (omp_lock_t *, omp_lock_hint_t) + __GOMP_NOTHROW; +extern void omp_init_nest_lock_with_hint (omp_lock_t *, omp_lock_hint_t) + __GOMP_NOTHROW; + extern int omp_get_cancellation (void) __GOMP_NOTHROW; extern omp_proc_bind_t omp_get_proc_bind (void) __GOMP_NOTHROW; +extern int omp_get_num_places (void) __GOMP_NOTHROW; +extern int omp_get_place_num_procs (int) __GOMP_NOTHROW; +extern void omp_get_place_proc_ids (int, int *) __GOMP_NOTHROW; +extern int omp_get_place_num (void) __GOMP_NOTHROW; +extern int omp_get_partition_num_places (void) __GOMP_NOTHROW; +extern void omp_get_partition_place_nums (int *) __GOMP_NOTHROW; extern void omp_set_default_device (int) __GOMP_NOTHROW; extern int omp_get_default_device (void) __GOMP_NOTHROW; @@ -656,6 +1044,24 @@ extern int omp_get_num_teams (void) __GO extern int omp_get_team_num (void) __GOMP_NOTHROW; extern int omp_is_initial_device (void) __GOMP_NOTHROW; +extern int omp_get_initial_device (void) __GOMP_NOTHROW; +extern int omp_get_max_task_priority (void) __GOMP_NOTHROW; + +extern void *omp_target_alloc (__SIZE_TYPE__, int) __GOMP_NOTHROW; +extern 
void omp_target_free (void *, int) __GOMP_NOTHROW; +extern int omp_target_is_present (void *, int) __GOMP_NOTHROW; +extern int omp_target_memcpy (void *, void *, __SIZE_TYPE__, __SIZE_TYPE__, + __SIZE_TYPE__, int, int) __GOMP_NOTHROW; +extern int omp_target_memcpy_rect (void *, void *, __SIZE_TYPE__, int, + const __SIZE_TYPE__ *, + const __SIZE_TYPE__ *, + const __SIZE_TYPE__ *, + const __SIZE_TYPE__ *, + const __SIZE_TYPE__ *, int, int) + __GOMP_NOTHROW; +extern int omp_target_associate_ptr (void *, void *, __SIZE_TYPE__, + __SIZE_TYPE__, int) __GOMP_NOTHROW; +extern int omp_target_disassociate_ptr (void *, int) __GOMP_NOTHROW; #if !defined (HAVE_ATTRIBUTE_VISIBILITY) \ || !defined (HAVE_ATTRIBUTE_ALIAS) \ @@ -728,4 +1134,34 @@ extern int gomp_test_nest_lock_25 (omp_n # define ialias_call(fn) fn #endif +/* Helper function for priority_node_to_task() and + task_to_priority_node(). + + Return the offset from a task to its priority_node entry. The + priority_node entry has a type of TYPE. */ + +static inline size_t +priority_queue_offset (enum priority_queue_type type) +{ + return offsetof (struct gomp_task, pnode[(int) type]); +} + +/* Return the task associated with a priority NODE of type TYPE. */ + +static inline struct gomp_task * +priority_node_to_task (enum priority_queue_type type, + struct priority_node *node) +{ + return (struct gomp_task *) ((char *) node - priority_queue_offset (type)); +} + +/* Return the priority node of type TYPE for a given TASK. */ + +static inline struct priority_node * +task_to_priority_node (enum priority_queue_type type, + struct gomp_task *task) +{ + return (struct priority_node *) ((char *) task + + priority_queue_offset (type)); +} #endif /* LIBGOMP_H */ --- libgomp/env.c.jj 2014-05-15 10:56:32.420522486 +0200 +++ libgomp/env.c 2016-07-13 16:57:04.437535335 +0200 @@ -27,6 +27,8 @@ #include "libgomp.h" #include "libgomp_f.h" +#include "oacc-int.h" +#include "gomp-constants.h" #include #include #include @@ -56,7 +58,7 @@ struct gomp_task_icv gomp_global_icv = { .nthreads_var = 1, .thread_limit_var = UINT_MAX, .run_sched_var = GFS_DYNAMIC, - .run_sched_modifier = 1, + .run_sched_chunk_size = 1, .default_device_var = 0, .dyn_var = false, .nest_var = false, @@ -66,6 +68,7 @@ struct gomp_task_icv gomp_global_icv = { unsigned long gomp_max_active_levels_var = INT_MAX; bool gomp_cancel_var = false; +int gomp_max_task_priority_var = 0; #ifndef HAVE_SYNC_BUILTINS gomp_mutex_t gomp_managed_threads_lock; #endif @@ -76,6 +79,9 @@ char *gomp_bind_var_list; unsigned long gomp_bind_var_list_len; void **gomp_places_list; unsigned long gomp_places_list_len; +int gomp_debug_var; +char *goacc_device_type; +int goacc_device_num; /* Parse the OMP_SCHEDULE environment variable. 
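The priority_queue_offset/priority_node_to_task helpers above are the classic container-of idiom: a task embeds one priority_node per queue it may sit on, and pointer arithmetic recovers the owner from a node. In miniature (GCC's __builtin_offsetof accepts the variable array index used here):

#include <stddef.h>
#include <stdio.h>

struct node { struct node *next, *prev; };

struct task
{
  int priority;
  struct node pnode[2];		/* One embedded node per queue.  */
};

/* Recover the owning task from a pointer to its pnode[Q] member.  */
static struct task *
node_to_task (int q, struct node *n)
{
  return (struct task *) ((char *) n - offsetof (struct task, pnode[q]));
}

int
main (void)
{
  struct task t = { .priority = 7 };
  printf ("%d\n", node_to_task (1, &t.pnode[1])->priority);	/* 7 */
  return 0;
}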
*/ @@ -118,7 +124,7 @@ parse_schedule (void) ++env; if (*env == '\0') { - gomp_global_icv.run_sched_modifier + gomp_global_icv.run_sched_chunk_size = gomp_global_icv.run_sched_var != GFS_STATIC; return; } @@ -144,7 +150,7 @@ parse_schedule (void) if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC) value = 1; - gomp_global_icv.run_sched_modifier = value; + gomp_global_icv.run_sched_chunk_size = value; return; unknown: @@ -1011,6 +1017,16 @@ parse_affinity (bool ignore) return false; } +static void +parse_acc_device_type (void) +{ + const char *env = getenv ("ACC_DEVICE_TYPE"); + + if (env && *env != '\0') + goacc_device_type = strdup (env); + else + goacc_device_type = NULL; +} static void handle_omp_display_env (unsigned long stacksize, int wait_policy) @@ -1054,7 +1070,7 @@ handle_omp_display_env (unsigned long st fputs ("\nOPENMP DISPLAY ENVIRONMENT BEGIN\n", stderr); - fputs (" _OPENMP = '201307'\n", stderr); + fputs (" _OPENMP = '201511'\n", stderr); fprintf (stderr, " OMP_DYNAMIC = '%s'\n", gomp_global_icv.dyn_var ? "TRUE" : "FALSE"); fprintf (stderr, " OMP_NESTED = '%s'\n", @@ -1142,6 +1158,8 @@ handle_omp_display_env (unsigned long st gomp_cancel_var ? "TRUE" : "FALSE"); fprintf (stderr, " OMP_DEFAULT_DEVICE = '%d'\n", gomp_global_icv.default_device_var); + fprintf (stderr, " OMP_MAX_TASK_PRIORITY = '%d'\n", + gomp_max_task_priority_var); if (verbose) { @@ -1174,6 +1192,7 @@ initialize_env (void) parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var); parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var); parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true); + parse_int ("OMP_MAX_TASK_PRIORITY", &gomp_max_task_priority_var, true); parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var, true); if (parse_unsigned_long ("OMP_THREAD_LIMIT", &thread_limit_var, false)) @@ -1181,6 +1200,7 @@ initialize_env (void) gomp_global_icv.thread_limit_var = thread_limit_var > INT_MAX ? UINT_MAX : thread_limit_var; } + parse_int ("GOMP_DEBUG", &gomp_debug_var, true); #ifndef HAVE_SYNC_BUILTINS gomp_mutex_init (&gomp_managed_threads_lock); #endif @@ -1271,6 +1291,15 @@ initialize_env (void) } handle_omp_display_env (stacksize, wait_policy); + + /* OpenACC. 
*/ + + if (!parse_int ("ACC_DEVICE_NUM", &goacc_device_num, true)) + goacc_device_num = 0; + + parse_acc_device_type (); + + goacc_runtime_initialize (); } @@ -1312,21 +1341,21 @@ omp_get_nested (void) } void -omp_set_schedule (omp_sched_t kind, int modifier) +omp_set_schedule (omp_sched_t kind, int chunk_size) { struct gomp_task_icv *icv = gomp_icv (true); switch (kind) { case omp_sched_static: - if (modifier < 1) - modifier = 0; - icv->run_sched_modifier = modifier; + if (chunk_size < 1) + chunk_size = 0; + icv->run_sched_chunk_size = chunk_size; break; case omp_sched_dynamic: case omp_sched_guided: - if (modifier < 1) - modifier = 1; - icv->run_sched_modifier = modifier; + if (chunk_size < 1) + chunk_size = 1; + icv->run_sched_chunk_size = chunk_size; break; case omp_sched_auto: break; @@ -1337,11 +1366,11 @@ omp_set_schedule (omp_sched_t kind, int } void -omp_get_schedule (omp_sched_t *kind, int *modifier) +omp_get_schedule (omp_sched_t *kind, int *chunk_size) { struct gomp_task_icv *icv = gomp_icv (false); *kind = icv->run_sched_var; - *modifier = icv->run_sched_modifier; + *chunk_size = icv->run_sched_chunk_size; } int @@ -1377,6 +1406,12 @@ omp_get_cancellation (void) return gomp_cancel_var; } +int +omp_get_max_task_priority (void) +{ + return gomp_max_task_priority_var; +} + omp_proc_bind_t omp_get_proc_bind (void) { @@ -1425,6 +1460,59 @@ omp_is_initial_device (void) return 1; } +int +omp_get_initial_device (void) +{ + return GOMP_DEVICE_HOST_FALLBACK; +} + +int +omp_get_num_places (void) +{ + return gomp_places_list_len; +} + +int +omp_get_place_num (void) +{ + if (gomp_places_list == NULL) + return -1; + + struct gomp_thread *thr = gomp_thread (); + if (thr->place == 0) + gomp_init_affinity (); + + return (int) thr->place - 1; +} + +int +omp_get_partition_num_places (void) +{ + if (gomp_places_list == NULL) + return 0; + + struct gomp_thread *thr = gomp_thread (); + if (thr->place == 0) + gomp_init_affinity (); + + return thr->ts.place_partition_len; +} + +void +omp_get_partition_place_nums (int *place_nums) +{ + if (gomp_places_list == NULL) + return; + + struct gomp_thread *thr = gomp_thread (); + if (thr->place == 0) + gomp_init_affinity (); + + unsigned int i; + for (i = 0; i < thr->ts.place_partition_len; i++) + *place_nums++ = thr->ts.place_partition_off + i; +} + ialias (omp_set_dynamic) ialias (omp_set_nested) ialias (omp_set_num_threads) @@ -1444,3 +1532,9 @@ ialias (omp_get_num_devices) ialias (omp_get_num_teams) ialias (omp_get_team_num) ialias (omp_is_initial_device) +ialias (omp_get_initial_device) +ialias (omp_get_max_task_priority) +ialias (omp_get_num_places) +ialias (omp_get_place_num) +ialias (omp_get_partition_num_places) +ialias (omp_get_partition_place_nums) --- libgomp/openacc.h.jj 2016-07-13 16:57:04.432535397 +0200 +++ libgomp/openacc.h 2016-07-13 16:57:04.432535397 +0200 @@ -0,0 +1,131 @@ +/* OpenACC Runtime Library User-facing Declarations + + Copyright (C) 2013-2016 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. 
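The place routines defined above let a program inspect the OMP_PLACES partition of its implicit task, for instance (compile with -fopenmp; prints no per-place lines unless OMP_PLACES is set):

#include <omp.h>
#include <stdio.h>

int
main (void)
{
  int n = omp_get_num_places ();
  printf ("%d places, executing in place %d\n", n, omp_get_place_num ());
  for (int p = 0; p < n; p++)
    {
      int nprocs = omp_get_place_num_procs (p);
      if (nprocs == 0)
	continue;
      int ids[nprocs];
      omp_get_place_proc_ids (p, ids);
      printf ("place %d:", p);
      for (int i = 0; i < nprocs; i++)
	printf (" %d", ids[i]);
      printf ("\n");
    }
  return 0;
}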
See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _OPENACC_H +#define _OPENACC_H 1 + +/* The OpenACC standard is silent on whether or not including <openacc.h> + might or must not include other header files. We chose to include + some. */ +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#if __cplusplus >= 201103 +# define __GOACC_NOTHROW noexcept +#elif __cplusplus +# define __GOACC_NOTHROW throw () +#else /* Not C++ */ +# define __GOACC_NOTHROW __attribute__ ((__nothrow__)) +#endif + +/* Types */ +typedef enum acc_device_t { + /* Keep in sync with include/gomp-constants.h. */ + acc_device_none = 0, + acc_device_default = 1, + acc_device_host = 2, + /* acc_device_host_nonshm = 3 removed. */ + acc_device_not_host = 4, + acc_device_nvidia = 5, + _ACC_device_hwm, + /* Ensure enumeration is layout compatible with int. */ + _ACC_highest = __INT_MAX__, + _ACC_neg = -1 +} acc_device_t; + +typedef enum acc_async_t { + /* Keep in sync with include/gomp-constants.h. */ + acc_async_noval = -1, + acc_async_sync = -2 +} acc_async_t; + +int acc_get_num_devices (acc_device_t) __GOACC_NOTHROW; +void acc_set_device_type (acc_device_t) __GOACC_NOTHROW; +acc_device_t acc_get_device_type (void) __GOACC_NOTHROW; +void acc_set_device_num (int, acc_device_t) __GOACC_NOTHROW; +int acc_get_device_num (acc_device_t) __GOACC_NOTHROW; +int acc_async_test (int) __GOACC_NOTHROW; +int acc_async_test_all (void) __GOACC_NOTHROW; +void acc_wait (int) __GOACC_NOTHROW; +void acc_wait_async (int, int) __GOACC_NOTHROW; +void acc_wait_all (void) __GOACC_NOTHROW; +void acc_wait_all_async (int) __GOACC_NOTHROW; +void acc_init (acc_device_t) __GOACC_NOTHROW; +void acc_shutdown (acc_device_t) __GOACC_NOTHROW; +#ifdef __cplusplus +int acc_on_device (int __arg) __GOACC_NOTHROW; +#else +int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW; +#endif +void *acc_malloc (size_t) __GOACC_NOTHROW; +void acc_free (void *) __GOACC_NOTHROW; +/* Some of these would be more correct with const qualifiers, but + the standard specifies otherwise. */ +void *acc_copyin (void *, size_t) __GOACC_NOTHROW; +void *acc_present_or_copyin (void *, size_t) __GOACC_NOTHROW; +void *acc_create (void *, size_t) __GOACC_NOTHROW; +void *acc_present_or_create (void *, size_t) __GOACC_NOTHROW; +void acc_copyout (void *, size_t) __GOACC_NOTHROW; +void acc_delete (void *, size_t) __GOACC_NOTHROW; +void acc_update_device (void *, size_t) __GOACC_NOTHROW; +void acc_update_self (void *, size_t) __GOACC_NOTHROW; +void acc_map_data (void *, void *, size_t) __GOACC_NOTHROW; +void acc_unmap_data (void *) __GOACC_NOTHROW; +void *acc_deviceptr (void *) __GOACC_NOTHROW; +void *acc_hostptr (void *) __GOACC_NOTHROW; +int acc_is_present (void *, size_t) __GOACC_NOTHROW; +void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW; +void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW; + +/* Old names. OpenACC does not specify whether these can or must + not be macros, inlines or aliases for the new names. */ +#define acc_pcreate acc_present_or_create +#define acc_pcopyin acc_present_or_copyin + +/* CUDA-specific routines. 
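A short consumer of the data routines declared above; acc_device_host is always available, and on it the mapping calls reduce to bookkeeping over shared memory (link against an OpenACC-enabled libgomp):

#include <openacc.h>
#include <stdio.h>

#define N 256

int
main (void)
{
  float a[N];
  int i;

  acc_init (acc_device_host);
  for (i = 0; i < N; i++)
    a[i] = i;
  /* Map A and copy it in; on a real device this allocates device
     memory and transfers, on the host device it is nearly a no-op.  */
  float *d = (float *) acc_copyin (a, sizeof a);
  printf ("present=%d deviceptr=%p\n", acc_is_present (a, sizeof a),
	  (void *) d);
  acc_copyout (a, sizeof a);	/* Copy back and unmap.  */
  acc_shutdown (acc_device_host);
  return 0;
}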
*/ +void *acc_get_current_cuda_device (void) __GOACC_NOTHROW; +void *acc_get_current_cuda_context (void) __GOACC_NOTHROW; +void *acc_get_cuda_stream (int) __GOACC_NOTHROW; +int acc_set_cuda_stream (int, void *) __GOACC_NOTHROW; + +#ifdef __cplusplus +} + +/* Forwarding function with correctly typed arg. */ + +#pragma acc routine seq +inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW +{ + return acc_on_device ((int) __arg); +} +#endif + +#endif /* _OPENACC_H */ --- libgomp/config/linux/doacross.h.jj 2016-07-13 16:57:18.902355979 +0200 +++ libgomp/config/linux/doacross.h 2016-07-13 16:57:18.902355979 +0200 @@ -0,0 +1,57 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Jakub Jelinek . + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* This is a Linux specific implementation of doacross spinning. */ + +#ifndef GOMP_DOACROSS_H +#define GOMP_DOACROSS_H 1 + +#include "libgomp.h" +#include +#include "wait.h" + +#ifdef HAVE_ATTRIBUTE_VISIBILITY +# pragma GCC visibility push(hidden) +#endif + +static inline void doacross_spin (unsigned long *addr, unsigned long expected, + unsigned long cur) +{ + /* FIXME: back off depending on how large expected - cur is. */ + do + { + cpu_relax (); + cur = __atomic_load_n (addr, MEMMODEL_RELAXED); + if (expected < cur) + return; + } + while (1); +} + +#ifdef HAVE_ATTRIBUTE_VISIBILITY +# pragma GCC visibility pop +#endif + +#endif /* GOMP_DOACROSS_H */ --- libgomp/config/posix/doacross.h.jj 2016-07-13 16:57:18.903355966 +0200 +++ libgomp/config/posix/doacross.h 2016-07-13 16:57:18.903355966 +0200 @@ -0,0 +1,62 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Jakub Jelinek . + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
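doacross_spin above polls until the value at ADDR passes EXPECTED; in the runtime the posting side is GOMP_doacross_post and the waiting side GOMP_doacross_wait. The shape of that handshake reduced to a standalone program (compile with -pthread; the release/acquire pair stands in for the runtime's publication of finished iterations):

#include <pthread.h>
#include <stdio.h>

static unsigned long posted;	/* Highest iteration published so far.  */

static void *
producer (void *arg)
{
  for (unsigned long i = 1; i <= 5; i++)
    __atomic_store_n (&posted, i, __ATOMIC_RELEASE);	/* "post" i.  */
  return arg;
}

int
main (void)
{
  pthread_t th;
  unsigned long expected = 4, cur;

  pthread_create (&th, NULL, producer, NULL);
  /* "wait": resume once an iteration beyond EXPECTED is posted.  */
  do
    cur = __atomic_load_n (&posted, __ATOMIC_ACQUIRE);
  while (cur <= expected);
  printf ("resumed after iteration %lu was posted\n", cur);
  pthread_join (th, NULL);
  return 0;
}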
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* This is a generic implementation of doacross spinning. */ + +#ifndef GOMP_DOACROSS_H +#define GOMP_DOACROSS_H 1 + +#include "libgomp.h" +#include + +#ifdef HAVE_ATTRIBUTE_VISIBILITY +# pragma GCC visibility push(hidden) +#endif + +static inline void +cpu_relax (void) +{ + __asm volatile ("" : : : "memory"); +} + +static inline void doacross_spin (unsigned long *addr, unsigned long expected, + unsigned long cur) +{ + /* FIXME: back off depending on how large expected - cur is. */ + do + { + cpu_relax (); + cur = __atomic_load_n (addr, MEMMODEL_RELAXED); + if (expected < cur) + return; + } + while (1); +} + +#ifdef HAVE_ATTRIBUTE_VISIBILITY +# pragma GCC visibility pop +#endif + +#endif /* GOMP_DOACROSS_H */ --- libgomp/splay-tree.c.jj 2016-07-13 16:57:18.919355768 +0200 +++ libgomp/splay-tree.c 2016-07-13 16:57:18.919355768 +0200 @@ -0,0 +1,238 @@ +/* A splay-tree datatype. + Copyright (C) 1998-2016 Free Software Foundation, Inc. + Contributed by Mark Mitchell (mark@markmitchell.com). + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* The splay tree code copied from include/splay-tree.h and adjusted, + so that all the data lives directly in splay_tree_node_s structure + and no extra allocations are needed. */ + +/* For an easily readable description of splay-trees, see: + + Lewis, Harry R. and Denenberg, Larry. Data Structures and Their + Algorithms. Harper-Collins, Inc. 1991. + + The major feature of splay trees is that all basic tree operations + are amortized O(log n) time for a tree with n nodes. */ + +#include "libgomp.h" + +/* Rotate the edge joining the left child N with its parent P. PP is the + grandparents' pointer to P. */ + +static inline void +rotate_left (splay_tree_node *pp, splay_tree_node p, splay_tree_node n) +{ + splay_tree_node tmp; + tmp = n->right; + n->right = p; + p->left = tmp; + *pp = n; +} + +/* Rotate the edge joining the right child N with its parent P. PP is the + grandparents' pointer to P. */ + +static inline void +rotate_right (splay_tree_node *pp, splay_tree_node p, splay_tree_node n) +{ + splay_tree_node tmp; + tmp = n->left; + n->left = p; + p->right = tmp; + *pp = n; +} + +/* Bottom up splay of KEY. */ + +static void +splay_tree_splay (splay_tree sp, splay_tree_key key) +{ + if (sp->root == NULL) + return; + + do { + int cmp1, cmp2; + splay_tree_node n, c; + + n = sp->root; + cmp1 = splay_compare (key, &n->key); + + /* Found. 
*/ + if (cmp1 == 0) + return; + + /* Left or right? If no child, then we're done. */ + if (cmp1 < 0) + c = n->left; + else + c = n->right; + if (!c) + return; + + /* Next one left or right? If found or no child, we're done + after one rotation. */ + cmp2 = splay_compare (key, &c->key); + if (cmp2 == 0 + || (cmp2 < 0 && !c->left) + || (cmp2 > 0 && !c->right)) + { + if (cmp1 < 0) + rotate_left (&sp->root, n, c); + else + rotate_right (&sp->root, n, c); + return; + } + + /* Now we have the four cases of double-rotation. */ + if (cmp1 < 0 && cmp2 < 0) + { + rotate_left (&n->left, c, c->left); + rotate_left (&sp->root, n, n->left); + } + else if (cmp1 > 0 && cmp2 > 0) + { + rotate_right (&n->right, c, c->right); + rotate_right (&sp->root, n, n->right); + } + else if (cmp1 < 0 && cmp2 > 0) + { + rotate_right (&n->left, c, c->right); + rotate_left (&sp->root, n, n->left); + } + else if (cmp1 > 0 && cmp2 < 0) + { + rotate_left (&n->right, c, c->left); + rotate_right (&sp->root, n, n->right); + } + } while (1); +} + +/* Insert a new NODE into SP. The NODE shouldn't exist in the tree. */ + +attribute_hidden void +splay_tree_insert (splay_tree sp, splay_tree_node node) +{ + int comparison = 0; + + splay_tree_splay (sp, &node->key); + + if (sp->root) + comparison = splay_compare (&sp->root->key, &node->key); + + if (sp->root && comparison == 0) + gomp_fatal ("Duplicate node"); + else + { + /* Insert it at the root. */ + if (sp->root == NULL) + node->left = node->right = NULL; + else if (comparison < 0) + { + node->left = sp->root; + node->right = node->left->right; + node->left->right = NULL; + } + else + { + node->right = sp->root; + node->left = node->right->left; + node->right->left = NULL; + } + + sp->root = node; + } +} + +/* Remove node with KEY from SP. It is not an error if it did not exist. */ + +attribute_hidden void +splay_tree_remove (splay_tree sp, splay_tree_key key) +{ + splay_tree_splay (sp, key); + + if (sp->root && splay_compare (&sp->root->key, key) == 0) + { + splay_tree_node left, right; + + left = sp->root->left; + right = sp->root->right; + + /* One of the children is now the root. Doesn't matter much + which, so long as we preserve the properties of the tree. */ + if (left) + { + sp->root = left; + + /* If there was a right child as well, hang it off the + right-most leaf of the left child. */ + if (right) + { + while (left->right) + left = left->right; + left->right = right; + } + } + else + sp->root = right; + } +} + +/* Lookup KEY in SP, returning NODE if present, and NULL + otherwise. */ + +attribute_hidden splay_tree_key +splay_tree_lookup (splay_tree sp, splay_tree_key key) +{ + splay_tree_splay (sp, key); + + if (sp->root && splay_compare (&sp->root->key, key) == 0) + return &sp->root->key; + else + return NULL; +} + +/* Helper function for splay_tree_foreach. + + Run FUNC on every node in KEY. */ + +static void +splay_tree_foreach_internal (splay_tree_node node, splay_tree_callback func, + void *data) +{ + if (!node) + return; + func (&node->key, data); + splay_tree_foreach_internal (node->left, func, data); + /* Yeah, whatever. GCC can fix my tail recursion. */ + splay_tree_foreach_internal (node->right, func, data); +} + +/* Run FUNC on each of the nodes in SP. 
*/ + +attribute_hidden void +splay_tree_foreach (splay_tree sp, splay_tree_callback func, void *data) +{ + splay_tree_foreach_internal (sp->root, func, data); +} --- libgomp/libgomp-plugin.c.jj 2016-07-13 16:57:04.435535360 +0200 +++ libgomp/libgomp-plugin.c 2016-07-13 16:57:04.435535360 +0200 @@ -0,0 +1,80 @@ +/* Copyright (C) 2014-2016 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Exported (non-hidden) functions exposing libgomp interface for plugins. */ + +#include + +#include "libgomp.h" +#include "libgomp-plugin.h" + +void * +GOMP_PLUGIN_malloc (size_t size) +{ + return gomp_malloc (size); +} + +void * +GOMP_PLUGIN_malloc_cleared (size_t size) +{ + return gomp_malloc_cleared (size); +} + +void * +GOMP_PLUGIN_realloc (void *ptr, size_t size) +{ + return gomp_realloc (ptr, size); +} + +void +GOMP_PLUGIN_debug (int kind, const char *msg, ...) +{ + va_list ap; + + va_start (ap, msg); + gomp_vdebug (kind, msg, ap); + va_end (ap); +} + +void +GOMP_PLUGIN_error (const char *msg, ...) +{ + va_list ap; + + va_start (ap, msg); + gomp_verror (msg, ap); + va_end (ap); +} + +void +GOMP_PLUGIN_fatal (const char *msg, ...) +{ + va_list ap; + + va_start (ap, msg); + gomp_vfatal (msg, ap); + va_end (ap); +} --- libgomp/libgomp-plugin.h.jj 2016-07-13 16:57:04.438535323 +0200 +++ libgomp/libgomp-plugin.h 2016-07-13 16:57:04.438535323 +0200 @@ -0,0 +1,80 @@ +/* Copyright (C) 2014-2016 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* An interface to various libgomp-internal functions for use by plugins. 
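[Reviewer note] The wrappers above exist so that plugins never call malloc or abort directly: allocation failures and fatal errors are funnelled through libgomp's own reporting. A usage sketch (not part of the patch); the device_state type and its field are hypothetical:

  struct device_state { void *queue; };

  static struct device_state *
  plugin_new_state (void)
  {
    /* GOMP_PLUGIN_malloc_cleared reports failure through libgomp and
       does not return NULL, so no check is needed here.  */
    struct device_state *s = GOMP_PLUGIN_malloc_cleared (sizeof (*s));
    if (s->queue == NULL)
      GOMP_PLUGIN_debug (0, "plugin: state %p starts with no queue\n",
                         (void *) s);
    return s;
  }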
*/ + +#ifndef LIBGOMP_PLUGIN_H +#define LIBGOMP_PLUGIN_H 1 + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Capabilities of offloading devices. */ +#define GOMP_OFFLOAD_CAP_SHARED_MEM (1 << 0) +#define GOMP_OFFLOAD_CAP_NATIVE_EXEC (1 << 1) +#define GOMP_OFFLOAD_CAP_OPENMP_400 (1 << 2) +#define GOMP_OFFLOAD_CAP_OPENACC_200 (1 << 3) + +/* Type of offload target device. Keep in sync with include/gomp-constants.h. */ +enum offload_target_type +{ + OFFLOAD_TARGET_TYPE_HOST = 2, + /* OFFLOAD_TARGET_TYPE_HOST_NONSHM = 3 removed. */ + OFFLOAD_TARGET_TYPE_NVIDIA_PTX = 5, + OFFLOAD_TARGET_TYPE_INTEL_MIC = 6, + OFFLOAD_TARGET_TYPE_HSA = 7 +}; + +/* Auxiliary struct, used for transferring pairs of addresses from plugin + to libgomp. */ +struct addr_pair +{ + uintptr_t start; + uintptr_t end; +}; + +/* Miscellaneous functions. */ +extern void *GOMP_PLUGIN_malloc (size_t) __attribute__ ((malloc)); +extern void *GOMP_PLUGIN_malloc_cleared (size_t) __attribute__ ((malloc)); +extern void *GOMP_PLUGIN_realloc (void *, size_t); +void GOMP_PLUGIN_target_task_completion (void *); + +extern void GOMP_PLUGIN_debug (int, const char *, ...) + __attribute__ ((format (printf, 2, 3))); +extern void GOMP_PLUGIN_error (const char *, ...) + __attribute__ ((format (printf, 1, 2))); +extern void GOMP_PLUGIN_fatal (const char *, ...) + __attribute__ ((noreturn, format (printf, 1, 2))); + +#ifdef __cplusplus +} +#endif + +#endif --- libgomp/oacc-async.c.jj 2016-07-13 16:57:13.488423109 +0200 +++ libgomp/oacc-async.c 2016-07-13 16:57:13.488423109 +0200 @@ -0,0 +1,107 @@ +/* OpenACC Runtime Library Definitions. + + Copyright (C) 2013-2016 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
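[Reviewer note] The GOMP_OFFLOAD_CAP_* values above are independent bits meant to be OR'ed into a single capability word (the host device later in this patch combines three of them). A decoding sketch, where CAPS stands for the word a device's capabilities query returns:

  /* True iff a device advertises OpenACC 2.0 support.  */
  static int
  supports_openacc (unsigned int caps)
  {
    return (caps & GOMP_OFFLOAD_CAP_OPENACC_200) != 0;
  }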
*/ + +#include +#include "openacc.h" +#include "libgomp.h" +#include "oacc-int.h" + +int +acc_async_test (int async) +{ + if (async < acc_async_sync) + gomp_fatal ("invalid async argument: %d", async); + + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + return thr->dev->openacc.async_test_func (async); +} + +int +acc_async_test_all (void) +{ + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + return thr->dev->openacc.async_test_all_func (); +} + +void +acc_wait (int async) +{ + if (async < acc_async_sync) + gomp_fatal ("invalid async argument: %d", async); + + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + thr->dev->openacc.async_wait_func (async); +} + +void +acc_wait_async (int async1, int async2) +{ + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + thr->dev->openacc.async_wait_async_func (async1, async2); +} + +void +acc_wait_all (void) +{ + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + thr->dev->openacc.async_wait_all_func (); +} + +void +acc_wait_all_async (int async) +{ + if (async < acc_async_sync) + gomp_fatal ("invalid async argument: %d", async); + + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + thr->dev->openacc.async_wait_all_async_func (async); +} --- libgomp/splay-tree.h.jj 2016-07-13 16:57:18.934355582 +0200 +++ libgomp/splay-tree.h 2016-07-13 16:57:18.934355582 +0200 @@ -0,0 +1,130 @@ +/* A splay-tree datatype. + Copyright (C) 1998-2016 Free Software Foundation, Inc. + Contributed by Mark Mitchell (mark@markmitchell.com). + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* The splay tree code copied from include/splay-tree.h and adjusted, + so that all the data lives directly in splay_tree_node_s structure + and no extra allocations are needed. + + Files including this header should before including it add: +typedef struct splay_tree_node_s *splay_tree_node; +typedef struct splay_tree_s *splay_tree; +typedef struct splay_tree_key_s *splay_tree_key; + define splay_tree_key_s structure, and define + splay_compare inline function. + + Alternatively, they can define splay_tree_prefix macro before + including this header and then all the above types, the + splay_compare function and the splay_tree_{lookup,insert_remove} + function will be prefixed by that prefix. 
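[Reviewer note] A usage sketch for the queue routines above, assuming some work has already been enqueued on queue 1 by compiler-generated code: acc_wait_async orders one queue after another without blocking the host, while acc_wait_all blocks the host itself.

  #include "openacc.h"

  void
  drain_queues (void)
  {
    /* Queue 2 will not start new work until queue 1's work is done.  */
    acc_wait_async (1, 2);

    /* Cheap poll first, then block the host if anything is pending.  */
    if (!acc_async_test_all ())
      acc_wait_all ();
  }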
If splay_tree_prefix + macro is defined, this header must be included twice: once where + you need the header file definitions, and once where you need the + .c implementation routines. In the latter case, you must also + define the macro splay_tree_c. See the include of splay-tree.h in + priority_queue.[hc] for an example. */ + +/* For an easily readable description of splay-trees, see: + + Lewis, Harry R. and Denenberg, Larry. Data Structures and Their + Algorithms. Harper-Collins, Inc. 1991. + + The major feature of splay trees is that all basic tree operations + are amortized O(log n) time for a tree with n nodes. */ + +#ifdef splay_tree_prefix +# define splay_tree_name_1(prefix, name) prefix ## _ ## name +# define splay_tree_name(prefix, name) splay_tree_name_1 (prefix, name) +# define splay_tree_node_s \ + splay_tree_name (splay_tree_prefix, splay_tree_node_s) +# define splay_tree_s \ + splay_tree_name (splay_tree_prefix, splay_tree_s) +# define splay_tree_key_s \ + splay_tree_name (splay_tree_prefix, splay_tree_key_s) +# define splay_tree_node \ + splay_tree_name (splay_tree_prefix, splay_tree_node) +# define splay_tree \ + splay_tree_name (splay_tree_prefix, splay_tree) +# define splay_tree_key \ + splay_tree_name (splay_tree_prefix, splay_tree_key) +# define splay_compare \ + splay_tree_name (splay_tree_prefix, splay_compare) +# define splay_tree_lookup \ + splay_tree_name (splay_tree_prefix, splay_tree_lookup) +# define splay_tree_insert \ + splay_tree_name (splay_tree_prefix, splay_tree_insert) +# define splay_tree_remove \ + splay_tree_name (splay_tree_prefix, splay_tree_remove) +# define splay_tree_foreach \ + splay_tree_name (splay_tree_prefix, splay_tree_foreach) +# define splay_tree_callback \ + splay_tree_name (splay_tree_prefix, splay_tree_callback) +#endif + +#ifndef splay_tree_c +/* Header file definitions and prototypes. */ + +/* The nodes in the splay tree. */ +struct splay_tree_node_s { + struct splay_tree_key_s key; + /* The left and right children, respectively. */ + splay_tree_node left; + splay_tree_node right; +}; + +/* The splay tree. */ +struct splay_tree_s { + splay_tree_node root; +}; + +typedef void (*splay_tree_callback) (splay_tree_key, void *); + +extern splay_tree_key splay_tree_lookup (splay_tree, splay_tree_key); +extern void splay_tree_insert (splay_tree, splay_tree_node); +extern void splay_tree_remove (splay_tree, splay_tree_key); +extern void splay_tree_foreach (splay_tree, splay_tree_callback, void *); +#else /* splay_tree_c */ +# ifdef splay_tree_prefix +# include "splay-tree.c" +# undef splay_tree_name_1 +# undef splay_tree_name +# undef splay_tree_node_s +# undef splay_tree_s +# undef splay_tree_key_s +# undef splay_tree_node +# undef splay_tree +# undef splay_tree_key +# undef splay_compare +# undef splay_tree_lookup +# undef splay_tree_insert +# undef splay_tree_remove +# undef splay_tree_foreach +# undef splay_tree_callback +# undef splay_tree_c +# endif +#endif /* #ifndef splay_tree_c */ + +#ifdef splay_tree_prefix +# undef splay_tree_prefix +#endif --- libgomp/oacc-plugin.c.jj 2016-07-13 16:57:13.481423196 +0200 +++ libgomp/oacc-plugin.c 2016-07-14 15:40:21.653151873 +0200 @@ -0,0 +1,44 @@ +/* Copyright (C) 2014-2016 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). 
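[Reviewer note] The prefix machinery above is used as a two-pass include, per the comment referencing priority_queue.[hc]. A sketch with a hypothetical "prio" prefix; note splay_tree_prefix is undefined again at the end of the header, hence the second #define:

  /* struct prio_splay_tree_key_s and prio_splay_compare must already
     be defined, as in the client sketch earlier.  */

  /* Pass 1: declarations only; emits prio_splay_tree_s,
     prio_splay_tree_lookup, and so on.  */
  #define splay_tree_prefix prio
  #include "splay-tree.h"

  /* Pass 2: with splay_tree_c defined, the header pulls in
     splay-tree.c and emits the prefixed implementation routines.  */
  #define splay_tree_prefix prio
  #define splay_tree_c
  #include "splay-tree.h"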
+ + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Initialize and register OpenACC dispatch table from libgomp plugin. */ + +#include "libgomp.h" +#include "oacc-plugin.h" +#include "oacc-int.h" + +void +GOMP_PLUGIN_async_unmap_vars (void *ptr, int async) +{ +} + +/* Return the target-specific part of the TLS data for the current thread. */ + +void * +GOMP_PLUGIN_acc_thread (void) +{ + return NULL; +} --- libgomp/oacc-init.c.jj 2016-07-13 16:57:04.423535509 +0200 +++ libgomp/oacc-init.c 2016-07-14 19:06:41.679575688 +0200 @@ -0,0 +1,640 @@ +/* OpenACC Runtime initialization routines + + Copyright (C) 2013-2016 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#include "libgomp.h" +#include "oacc-int.h" +#include "openacc.h" +#include +#include +#include +#include +#include + +/* This lock is used to protect access to cached_base_dev, dispatchers and + the (abstract) initialisation state of attached offloading devices. */ + +static gomp_mutex_t acc_device_lock; + +/* A cached version of the dispatcher for the global "current" accelerator type, + e.g. used as the default when creating new host threads. This is the + device-type equivalent of goacc_device_num (which specifies which device to + use out of potentially several of the same type). If there are several + devices of a given type, this points at the first one. */ + +static struct gomp_device_descr *cached_base_dev = NULL; + +#if defined HAVE_TLS || defined USE_EMUTLS +__thread struct goacc_thread *goacc_tls_data; +#else +pthread_key_t goacc_tls_key; +#endif +static pthread_key_t goacc_cleanup_key; + +static struct goacc_thread *goacc_threads; +static gomp_mutex_t goacc_thread_lock; + +/* An array of dispatchers for device types, indexed by the type. 
This array + only references "base" devices, and other instances of the same type are + found by simply indexing from each such device (which are stored linearly, + grouped by device in target.c:devices). */ +static struct gomp_device_descr *dispatchers[_ACC_device_hwm] = { 0 }; + +attribute_hidden void +goacc_register (struct gomp_device_descr *disp) +{ + /* Only register the 0th device here. */ + if (disp->target_id != 0) + return; + + gomp_mutex_lock (&acc_device_lock); + + assert (acc_device_type (disp->type) != acc_device_none + && acc_device_type (disp->type) != acc_device_default + && acc_device_type (disp->type) != acc_device_not_host); + assert (!dispatchers[disp->type]); + dispatchers[disp->type] = disp; + + gomp_mutex_unlock (&acc_device_lock); +} + +static const char * +name_of_acc_device_t (enum acc_device_t type) +{ + switch (type) + { + case acc_device_none: return "none"; + case acc_device_default: return "default"; + case acc_device_host: return "host"; + case acc_device_not_host: return "not_host"; + case acc_device_nvidia: return "nvidia"; + default: gomp_fatal ("unknown device type %u", (unsigned) type); + } +} + +/* ACC_DEVICE_LOCK must be held before calling this function. If FAIL_IS_ERROR + is true, this function raises an error if there are no devices of type D, + otherwise it returns NULL in that case. */ + +static struct gomp_device_descr * +resolve_device (acc_device_t d, bool fail_is_error) +{ + acc_device_t d_arg = d; + + switch (d) + { + case acc_device_default: + { + if (goacc_device_type) + { + /* Lookup the named device. */ + if (!strcasecmp (goacc_device_type, "host")) + { + d = acc_device_host; + goto found; + } + + if (fail_is_error) + { + gomp_mutex_unlock (&acc_device_lock); + gomp_fatal ("device type %s not supported", goacc_device_type); + } + else + return NULL; + } + + /* No default device specified, so start scanning for any non-host + device that is available. */ + d = acc_device_not_host; + } + /* FALLTHROUGH */ + + case acc_device_not_host: + if (d_arg == acc_device_default) + { + d = acc_device_host; + goto found; + } + if (fail_is_error) + { + gomp_mutex_unlock (&acc_device_lock); + gomp_fatal ("no device found"); + } + else + return NULL; + break; + + case acc_device_host: + break; + + default: + if (d > _ACC_device_hwm) + { + if (fail_is_error) + goto unsupported_device; + else + return NULL; + } + break; + } + found: + + assert (d != acc_device_none + && d != acc_device_default + && d != acc_device_not_host); + + if (dispatchers[d] == NULL && fail_is_error) + { + unsupported_device: + gomp_mutex_unlock (&acc_device_lock); + gomp_fatal ("device type %s not supported", name_of_acc_device_t (d)); + } + + return dispatchers[d]; +} + +/* Emit a suitable error if no device of a particular type is available, or + the given device number is out-of-range. */ +static void +acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs) +{ + if (ndevs == 0) + gomp_fatal ("no devices of type %s available", name_of_acc_device_t (d)); + else + gomp_fatal ("device %u out of range", ord); +} + +/* This is called when plugins have been initialized, and serves to call + (indirectly) the target's device_init hook. Calling multiple times without + an intervening acc_shutdown_1 call is an error. ACC_DEVICE_LOCK must be + held before calling this function. 
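[Reviewer note] A sketch of the lookup the dispatchers comment above implies: only device 0 of each type is registered, and instance ORD of the same type is reached by indexing from it, exactly as acc_init_1 below does with &base_dev[goacc_device_num]. The helper name is hypothetical; bounds checking against get_num_devices_func is the caller's job.

  static struct gomp_device_descr *
  nth_device_of_type (acc_device_t d, int ord)
  {
    struct gomp_device_descr *base = dispatchers[d];
    /* Devices of one type are stored contiguously in target.c.  */
    return base ? &base[ord] : NULL;
  }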
*/ + +static struct gomp_device_descr * +acc_init_1 (acc_device_t d) +{ + struct gomp_device_descr *base_dev, *acc_dev; + int ndevs; + + base_dev = resolve_device (d, true); + + ndevs = base_dev->get_num_devices_func (); + + if (ndevs <= 0 || goacc_device_num >= ndevs) + acc_dev_num_out_of_range (d, goacc_device_num, ndevs); + + acc_dev = &base_dev[goacc_device_num]; + + gomp_mutex_lock (&acc_dev->lock); + if (acc_dev->state == GOMP_DEVICE_INITIALIZED) + { + gomp_mutex_unlock (&acc_dev->lock); + gomp_fatal ("device already active"); + } + + gomp_init_device (acc_dev); + gomp_mutex_unlock (&acc_dev->lock); + + return base_dev; +} + +/* ACC_DEVICE_LOCK must be held before calling this function. */ + +static void +acc_shutdown_1 (acc_device_t d) +{ + struct gomp_device_descr *base_dev; + struct goacc_thread *walk; + int ndevs, i; + bool devices_active = false; + + /* Get the base device for this device type. */ + base_dev = resolve_device (d, true); + + ndevs = base_dev->get_num_devices_func (); + + gomp_mutex_lock (&goacc_thread_lock); + + /* Free target-specific TLS data and close all devices. */ + for (walk = goacc_threads; walk != NULL; walk = walk->next) + { + if (walk->target_tls) + base_dev->openacc.destroy_thread_data_func (walk->target_tls); + + walk->target_tls = NULL; + + /* Similarly, if this happens then user code has done something weird. */ + if (walk->saved_bound_dev) + { + gomp_mutex_unlock (&goacc_thread_lock); + gomp_fatal ("shutdown during host fallback"); + } + + if (walk->dev) + { + gomp_mutex_lock (&walk->dev->lock); + gomp_free_memmap (&walk->dev->mem_map); + gomp_mutex_unlock (&walk->dev->lock); + + walk->dev = NULL; + walk->base_dev = NULL; + } + } + + gomp_mutex_unlock (&goacc_thread_lock); + + /* Close all the devices of this type that have been opened. */ + bool ret = true; + for (i = 0; i < ndevs; i++) + { + struct gomp_device_descr *acc_dev = &base_dev[i]; + gomp_mutex_lock (&acc_dev->lock); + if (acc_dev->state == GOMP_DEVICE_INITIALIZED) + { + devices_active = true; + ret &= acc_dev->fini_device_func (acc_dev->target_id); + acc_dev->state = GOMP_DEVICE_UNINITIALIZED; + } + gomp_mutex_unlock (&acc_dev->lock); + } + + if (!ret) + gomp_fatal ("device finalization failed"); + + if (!devices_active) + gomp_fatal ("no device initialized"); +} + +static struct goacc_thread * +goacc_new_thread (void) +{ + struct goacc_thread *thr = gomp_malloc (sizeof (struct gomp_thread)); + +#if defined HAVE_TLS || defined USE_EMUTLS + goacc_tls_data = thr; +#else + pthread_setspecific (goacc_tls_key, thr); +#endif + + pthread_setspecific (goacc_cleanup_key, thr); + + gomp_mutex_lock (&goacc_thread_lock); + thr->next = goacc_threads; + goacc_threads = thr; + gomp_mutex_unlock (&goacc_thread_lock); + + return thr; +} + +static void +goacc_destroy_thread (void *data) +{ + struct goacc_thread *thr = data, *walk, *prev; + + gomp_mutex_lock (&goacc_thread_lock); + + if (thr) + { + struct gomp_device_descr *acc_dev = thr->dev; + + if (acc_dev && thr->target_tls) + { + acc_dev->openacc.destroy_thread_data_func (thr->target_tls); + thr->target_tls = NULL; + } + + assert (!thr->mapped_data); + + /* Remove from thread list. 
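[Reviewer note] A lifecycle sketch for the init/shutdown pair above: initializing a device type twice without an intervening shutdown hits the "device already active" fatal error, which is why internal callers go through goacc_lazy_initialize further down instead.

  #include "openacc.h"

  void
  explicit_lifecycle (void)
  {
    acc_init (acc_device_default);      /* acc_init_1 under the lock */
    /* ... offloaded work ... */
    acc_shutdown (acc_device_default);  /* acc_shutdown_1 under the lock */
  }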
*/ + for (prev = NULL, walk = goacc_threads; walk; + prev = walk, walk = walk->next) + if (walk == thr) + { + if (prev == NULL) + goacc_threads = walk->next; + else + prev->next = walk->next; + + free (thr); + + break; + } + + assert (walk); + } + + gomp_mutex_unlock (&goacc_thread_lock); +} + +/* Use the ORD'th device instance for the current host thread (or -1 for the + current global default). The device (and the runtime) must be initialised + before calling this function. */ + +void +goacc_attach_host_thread_to_device (int ord) +{ + struct goacc_thread *thr = goacc_thread (); + struct gomp_device_descr *acc_dev = NULL, *base_dev = NULL; + int num_devices; + + if (thr && thr->dev && (thr->dev->target_id == ord || ord < 0)) + return; + + if (ord < 0) + ord = goacc_device_num; + + /* Decide which type of device to use. If the current thread has a device + type already (e.g. set by acc_set_device_type), use that, else use the + global default. */ + if (thr && thr->base_dev) + base_dev = thr->base_dev; + else + { + assert (cached_base_dev); + base_dev = cached_base_dev; + } + + num_devices = base_dev->get_num_devices_func (); + if (num_devices <= 0 || ord >= num_devices) + acc_dev_num_out_of_range (acc_device_type (base_dev->type), ord, + num_devices); + + if (!thr) + thr = goacc_new_thread (); + + thr->base_dev = base_dev; + thr->dev = acc_dev = &base_dev[ord]; + thr->saved_bound_dev = NULL; + + thr->target_tls + = acc_dev->openacc.create_thread_data_func (ord); + + acc_dev->openacc.async_set_async_func (acc_async_sync); +} + +/* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of + init/shutdown is per-process or per-thread. We choose per-process. */ + +void +acc_init (acc_device_t d) +{ + gomp_mutex_lock (&acc_device_lock); + + cached_base_dev = acc_init_1 (d); + + gomp_mutex_unlock (&acc_device_lock); + + goacc_attach_host_thread_to_device (-1); +} + +ialias (acc_init) + +void +acc_shutdown (acc_device_t d) +{ + gomp_mutex_lock (&acc_device_lock); + + acc_shutdown_1 (d); + + gomp_mutex_unlock (&acc_device_lock); +} + +ialias (acc_shutdown) + +int +acc_get_num_devices (acc_device_t d) +{ + int n = 0; + struct gomp_device_descr *acc_dev; + + if (d == acc_device_none) + return 0; + + gomp_mutex_lock (&acc_device_lock); + acc_dev = resolve_device (d, false); + gomp_mutex_unlock (&acc_device_lock); + + if (!acc_dev) + return 0; + + n = acc_dev->get_num_devices_func (); + if (n < 0) + n = 0; + + return n; +} + +ialias (acc_get_num_devices) + +/* Set the device type for the current thread only (using the current global + default device number), initialising that device if necessary. Also set the + default device type for new threads to D. */ + +void +acc_set_device_type (acc_device_t d) +{ + struct gomp_device_descr *base_dev, *acc_dev; + struct goacc_thread *thr = goacc_thread (); + + gomp_mutex_lock (&acc_device_lock); + + cached_base_dev = base_dev = resolve_device (d, true); + acc_dev = &base_dev[goacc_device_num]; + + gomp_mutex_lock (&acc_dev->lock); + if (acc_dev->state == GOMP_DEVICE_UNINITIALIZED) + gomp_init_device (acc_dev); + gomp_mutex_unlock (&acc_dev->lock); + + gomp_mutex_unlock (&acc_device_lock); + + /* We're changing device type: invalidate the current thread's dev and + base_dev pointers. 
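[Reviewer note] A usage sketch tying together the device-selection API in this file: pick the second device of the current type for this thread, if one exists. Numbering is zero-based per device type, and acc_set_device_num initializes the device on first use.

  #include "openacc.h"

  void
  pick_second_device (void)
  {
    acc_device_t t = acc_get_device_type ();
    if (acc_get_num_devices (t) > 1)
      acc_set_device_num (1, t);
  }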
*/ + if (thr && thr->base_dev != base_dev) + { + thr->base_dev = thr->dev = NULL; + } + + goacc_attach_host_thread_to_device (-1); +} + +ialias (acc_set_device_type) + +acc_device_t +acc_get_device_type (void) +{ + acc_device_t res = acc_device_none; + struct gomp_device_descr *dev; + struct goacc_thread *thr = goacc_thread (); + + if (thr && thr->base_dev) + res = acc_device_type (thr->base_dev->type); + else + { + gomp_mutex_lock (&acc_device_lock); + dev = resolve_device (acc_device_default, true); + gomp_mutex_unlock (&acc_device_lock); + res = acc_device_type (dev->type); + } + + assert (res != acc_device_default + && res != acc_device_not_host); + + return res; +} + +ialias (acc_get_device_type) + +int +acc_get_device_num (acc_device_t d) +{ + const struct gomp_device_descr *dev; + struct goacc_thread *thr = goacc_thread (); + + if (d >= _ACC_device_hwm) + gomp_fatal ("unknown device type %u", (unsigned) d); + + gomp_mutex_lock (&acc_device_lock); + dev = resolve_device (d, true); + gomp_mutex_unlock (&acc_device_lock); + + if (thr && thr->base_dev == dev && thr->dev) + return thr->dev->target_id; + + return goacc_device_num; +} + +ialias (acc_get_device_num) + +void +acc_set_device_num (int ord, acc_device_t d) +{ + struct gomp_device_descr *base_dev, *acc_dev; + int num_devices; + + if (ord < 0) + ord = goacc_device_num; + + if ((int) d == 0) + /* Set whatever device is being used by the current host thread to use + device instance ORD. It's unclear if this is supposed to affect other + host threads too (OpenACC 2.0 (3.2.4) acc_set_device_num). */ + goacc_attach_host_thread_to_device (ord); + else + { + gomp_mutex_lock (&acc_device_lock); + + cached_base_dev = base_dev = resolve_device (d, true); + + num_devices = base_dev->get_num_devices_func (); + + if (num_devices <= 0 || ord >= num_devices) + acc_dev_num_out_of_range (d, ord, num_devices); + + acc_dev = &base_dev[ord]; + + gomp_mutex_lock (&acc_dev->lock); + if (acc_dev->state == GOMP_DEVICE_UNINITIALIZED) + gomp_init_device (acc_dev); + gomp_mutex_unlock (&acc_dev->lock); + + gomp_mutex_unlock (&acc_device_lock); + + goacc_attach_host_thread_to_device (ord); + } + + goacc_device_num = ord; +} + +ialias (acc_set_device_num) + +int +acc_on_device (acc_device_t dev) +{ + return dev == acc_device_host || dev == acc_device_none; +} + +ialias (acc_on_device) + +attribute_hidden void +goacc_runtime_initialize (void) +{ + gomp_mutex_init (&acc_device_lock); + +#if !(defined HAVE_TLS || defined USE_EMUTLS) + pthread_key_create (&goacc_tls_key, NULL); +#endif + + pthread_key_create (&goacc_cleanup_key, goacc_destroy_thread); + + cached_base_dev = NULL; + + goacc_threads = NULL; + gomp_mutex_init (&goacc_thread_lock); + + /* Initialize and register the 'host' device type. */ + goacc_host_init (); +} + +/* Compiler helper functions */ + +attribute_hidden void +goacc_save_and_set_bind (acc_device_t d) +{ + struct goacc_thread *thr = goacc_thread (); + + assert (!thr->saved_bound_dev); + + thr->saved_bound_dev = thr->dev; + thr->dev = dispatchers[d]; +} + +attribute_hidden void +goacc_restore_bind (void) +{ + struct goacc_thread *thr = goacc_thread (); + + thr->dev = thr->saved_bound_dev; + thr->saved_bound_dev = NULL; +} + +/* This is called from any OpenACC support function that may need to implicitly + initialize the libgomp runtime, either globally or from a new host thread. + On exit "goacc_thread" will return a valid & populated thread block. 
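[Reviewer note] The bind helpers above come in strict pairs around host-fallback regions; a sketch of the pattern, which is exactly what GOACC_parallel does further down:

  static void
  run_on_host (void (*fn) (void *), void *hostaddrs)
  {
    goacc_save_and_set_bind (acc_device_host);
    fn (hostaddrs);             /* region executes on the host */
    goacc_restore_bind ();      /* reinstate the saved device binding */
  }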
*/ + +attribute_hidden void +goacc_lazy_initialize (void) +{ + struct goacc_thread *thr = goacc_thread (); + + if (thr && thr->dev) + return; + + if (!cached_base_dev) + acc_init (acc_device_default); + else + goacc_attach_host_thread_to_device (-1); +} --- libgomp/oacc-int.h.jj 2016-07-13 16:57:04.400535794 +0200 +++ libgomp/oacc-int.h 2016-07-13 16:57:04.400535794 +0200 @@ -0,0 +1,106 @@ +/* OpenACC Runtime - internal declarations + + Copyright (C) 2013-2016 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* This file contains data types and function declarations that are not + part of the official OpenACC user interface. There are declarations + in here that are part of the GNU OpenACC ABI, in that the compiler is + required to know about them and use them. + + The convention is that the all caps prefix "GOACC" is used group items + that are part of the external ABI, and the lower case prefix "goacc" + is used group items that are completely private to the library. */ + +#ifndef OACC_INT_H +#define OACC_INT_H 1 + +#include "openacc.h" +#include "config.h" +#include +#include +#include + +#ifdef HAVE_ATTRIBUTE_VISIBILITY +# pragma GCC visibility push(hidden) +#endif + +static inline enum acc_device_t +acc_device_type (enum offload_target_type type) +{ + return (enum acc_device_t) type; +} + +struct goacc_thread +{ + /* The base device for the current thread. */ + struct gomp_device_descr *base_dev; + + /* The device for the current thread. */ + struct gomp_device_descr *dev; + + struct gomp_device_descr *saved_bound_dev; + + /* This is a linked list of data mapped by the "acc data" pragma, following + strictly push/pop semantics according to lexical scope. */ + struct target_mem_desc *mapped_data; + + /* These structures form a list: this is the next thread in that list. */ + struct goacc_thread *next; + + /* Target-specific data (used by plugin). 
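[Reviewer note] A sketch of the common prologue the comment above describes: any user-callable entry point may be the first OpenACC call on a thread, so it lazily initializes before touching the thread block. The helper name is hypothetical.

  static struct gomp_device_descr *
  current_acc_device (void)
  {
    goacc_lazy_initialize ();
    /* After lazy initialization, goacc_thread () is valid and
       populated.  */
    return goacc_thread ()->dev;
  }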
*/ + void *target_tls; +}; + +#if defined HAVE_TLS || defined USE_EMUTLS +extern __thread struct goacc_thread *goacc_tls_data; +static inline struct goacc_thread * +goacc_thread (void) +{ + return goacc_tls_data; +} +#else +extern pthread_key_t goacc_tls_key; +static inline struct goacc_thread * +goacc_thread (void) +{ + return pthread_getspecific (goacc_tls_key); +} +#endif + +void goacc_register (struct gomp_device_descr *) __GOACC_NOTHROW; +void goacc_attach_host_thread_to_device (int); +void goacc_runtime_initialize (void); +void goacc_save_and_set_bind (acc_device_t); +void goacc_restore_bind (void); +void goacc_lazy_initialize (void); +void goacc_host_init (void); + +#ifdef HAVE_ATTRIBUTE_VISIBILITY +# pragma GCC visibility pop +#endif + +#endif --- libgomp/oacc-host.c.jj 2016-07-13 16:57:13.489423096 +0200 +++ libgomp/oacc-host.c 2016-07-13 16:57:13.489423096 +0200 @@ -0,0 +1,266 @@ +/* OpenACC Runtime Library: acc_device_host. + + Copyright (C) 2013-2016 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
*/ + +#include "libgomp.h" +#include "oacc-int.h" +#include "gomp-constants.h" + +#include +#include +#include + +static struct gomp_device_descr host_dispatch; + +static const char * +host_get_name (void) +{ + return host_dispatch.name; +} + +static unsigned int +host_get_caps (void) +{ + return host_dispatch.capabilities; +} + +static int +host_get_type (void) +{ + return host_dispatch.type; +} + +static int +host_get_num_devices (void) +{ + return 1; +} + +static bool +host_init_device (int n __attribute__ ((unused))) +{ + return true; +} + +static bool +host_fini_device (int n __attribute__ ((unused))) +{ + return true; +} + +static unsigned +host_version (void) +{ + return GOMP_VERSION; +} + +static int +host_load_image (int n __attribute__ ((unused)), + unsigned v __attribute__ ((unused)), + const void *t __attribute__ ((unused)), + struct addr_pair **r __attribute__ ((unused))) +{ + return 0; +} + +static bool +host_unload_image (int n __attribute__ ((unused)), + unsigned v __attribute__ ((unused)), + const void *t __attribute__ ((unused))) +{ + return true; +} + +static void * +host_alloc (int n __attribute__ ((unused)), size_t s) +{ + return gomp_malloc (s); +} + +static bool +host_free (int n __attribute__ ((unused)), void *p) +{ + free (p); + return true; +} + +static bool +host_dev2host (int n __attribute__ ((unused)), + void *h __attribute__ ((unused)), + const void *d __attribute__ ((unused)), + size_t s __attribute__ ((unused))) +{ + return true; +} + +static bool +host_host2dev (int n __attribute__ ((unused)), + void *d __attribute__ ((unused)), + const void *h __attribute__ ((unused)), + size_t s __attribute__ ((unused))) +{ + return true; +} + +static void +host_run (int n __attribute__ ((unused)), void *fn_ptr, void *vars, + void **args __attribute__((unused))) +{ + void (*fn)(void *) = (void (*)(void *)) fn_ptr; + + fn (vars); +} + +static void +host_openacc_exec (void (*fn) (void *), + size_t mapnum __attribute__ ((unused)), + void **hostaddrs, + void **devaddrs __attribute__ ((unused)), + int async __attribute__ ((unused)), + unsigned *dims __attribute ((unused)), + void *targ_mem_desc __attribute__ ((unused))) +{ + fn (hostaddrs); +} + +static void +host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)), + int async __attribute__ ((unused))) +{ +} + +static int +host_openacc_async_test (int async __attribute__ ((unused))) +{ + return 1; +} + +static int +host_openacc_async_test_all (void) +{ + return 1; +} + +static void +host_openacc_async_wait (int async __attribute__ ((unused))) +{ +} + +static void +host_openacc_async_wait_async (int async1 __attribute__ ((unused)), + int async2 __attribute__ ((unused))) +{ +} + +static void +host_openacc_async_wait_all (void) +{ +} + +static void +host_openacc_async_wait_all_async (int async __attribute__ ((unused))) +{ +} + +static void +host_openacc_async_set_async (int async __attribute__ ((unused))) +{ +} + +static void * +host_openacc_create_thread_data (int ord __attribute__ ((unused))) +{ + return NULL; +} + +static void +host_openacc_destroy_thread_data (void *tls_data __attribute__ ((unused))) +{ +} + +static struct gomp_device_descr host_dispatch = + { + .name = "host", + .capabilities = (GOMP_OFFLOAD_CAP_SHARED_MEM + | GOMP_OFFLOAD_CAP_NATIVE_EXEC + | GOMP_OFFLOAD_CAP_OPENACC_200), + .target_id = 0, + .type = OFFLOAD_TARGET_TYPE_HOST, + + .get_name_func = host_get_name, + .get_caps_func = host_get_caps, + .get_type_func = host_get_type, + .get_num_devices_func = host_get_num_devices, + 
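+    /* Note: the dev2host/host2dev hooks above can be no-ops because
+       this "device" shares the host address space, which is also why
+       GOMP_OFFLOAD_CAP_SHARED_MEM is set in .capabilities.  */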
.init_device_func = host_init_device, + .fini_device_func = host_fini_device, + .version_func = host_version, + .load_image_func = host_load_image, + .unload_image_func = host_unload_image, + .alloc_func = host_alloc, + .free_func = host_free, + .dev2host_func = host_dev2host, + .host2dev_func = host_host2dev, + .run_func = host_run, + + .mem_map = { NULL }, + /* .lock initilized in goacc_host_init. */ + .state = GOMP_DEVICE_UNINITIALIZED, + + .openacc = { + .data_environ = NULL, + + .exec_func = host_openacc_exec, + + .register_async_cleanup_func = host_openacc_register_async_cleanup, + + .async_test_func = host_openacc_async_test, + .async_test_all_func = host_openacc_async_test_all, + .async_wait_func = host_openacc_async_wait, + .async_wait_async_func = host_openacc_async_wait_async, + .async_wait_all_func = host_openacc_async_wait_all, + .async_wait_all_async_func = host_openacc_async_wait_all_async, + .async_set_async_func = host_openacc_async_set_async, + + .create_thread_data_func = host_openacc_create_thread_data, + .destroy_thread_data_func = host_openacc_destroy_thread_data, + + .cuda = { + .get_current_device_func = NULL, + .get_current_context_func = NULL, + .get_stream_func = NULL, + .set_stream_func = NULL, + } + } + }; + +/* Initialize and register this device type. */ +void +goacc_host_init (void) +{ + gomp_mutex_init (&host_dispatch.lock); + goacc_register (&host_dispatch); +} --- libgomp/oacc-parallel.c.jj 2016-07-13 16:57:04.399535807 +0200 +++ libgomp/oacc-parallel.c 2016-07-14 18:53:06.694996381 +0200 @@ -0,0 +1,241 @@ +/* Copyright (C) 2013-2016 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* This file handles OpenACC constructs. */ + +#include "openacc.h" +#include "libgomp.h" +#include "libgomp_g.h" +#include "gomp-constants.h" +#include "oacc-int.h" +#ifdef HAVE_INTTYPES_H +# include /* For PRIu64. */ +#endif +#include +#include +#include + +static void goacc_wait (int async, int num_waits, va_list *ap); + + +/* Launch a possibly offloaded function on DEVICE. FN is the host fn + address. MAPNUM, HOSTADDRS, SIZES & KINDS describe the memory + blocks to be copied to/from the device. Varadic arguments are + keyed optional parameters terminated with a zero. */ + +void +GOACC_parallel_keyed (int device, void (*fn) (void *), + size_t mapnum, void **hostaddrs, size_t *sizes, + unsigned short *kinds, ...) 
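+/* The trailing varargs are the keyed, zero-terminated launch
+   parameters mentioned in the comment above; this host-only
+   implementation never reads them.  */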
+{ + bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; + struct goacc_thread *thr; + struct gomp_device_descr *acc_dev; + +#ifdef HAVE_INTTYPES_H + gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p\n", + __FUNCTION__, (uint64_t) mapnum, hostaddrs, sizes, kinds); +#else + gomp_debug (0, "%s: mapnum=%lu, hostaddrs=%p, sizes=%p, kinds=%p\n", + __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds); +#endif + goacc_lazy_initialize (); + + thr = goacc_thread (); + acc_dev = thr->dev; + + /* Host fallback if "if" clause is false or if the current device is set to + the host. */ + if (host_fallback) + { + goacc_save_and_set_bind (acc_device_host); + fn (hostaddrs); + goacc_restore_bind (); + return; + } + else if (acc_device_type (acc_dev->type) == acc_device_host) + { + fn (hostaddrs); + return; + } + + /* acc_device_host is the only supported device type. */ +} + +/* Legacy entry point, only provide host execution. */ + +void +GOACC_parallel (int device, void (*fn) (void *), + size_t mapnum, void **hostaddrs, size_t *sizes, + unsigned short *kinds, + int num_gangs, int num_workers, int vector_length, + int async, int num_waits, ...) +{ + goacc_save_and_set_bind (acc_device_host); + fn (hostaddrs); + goacc_restore_bind (); +} + +void +GOACC_data_start (int device, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds) +{ + goacc_lazy_initialize (); +} + +void +GOACC_data_end (void) +{ + gomp_debug (0, " %s: restore mappings\n", __FUNCTION__); + gomp_debug (0, " %s: mappings restored\n", __FUNCTION__); +} + +void +GOACC_enter_exit_data (int device, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds, + int async, int num_waits, ...) +{ + goacc_lazy_initialize (); +} + +static void +goacc_wait (int async, int num_waits, va_list *ap) +{ + struct goacc_thread *thr = goacc_thread (); + struct gomp_device_descr *acc_dev = thr->dev; + + while (num_waits--) + { + int qid = va_arg (*ap, int); + + if (acc_async_test (qid)) + continue; + + if (async == acc_async_sync) + acc_wait (qid); + else if (qid == async) + ;/* If we're waiting on the same asynchronous queue as we're + launching on, the queue itself will order work as + required, so there's no need to wait explicitly. */ + else + acc_dev->openacc.async_wait_async_func (qid, async); + } +} + +void +GOACC_update (int device, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds, + int async, int num_waits, ...) +{ + goacc_lazy_initialize (); +} + +void +GOACC_wait (int async, int num_waits, ...) 
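+/* Waits on queue ASYNC for the NUM_WAITS queues given as varargs;
+   with no explicit waits it drains everything, either synchronously
+   on the host (acc_async_sync) or asynchronously (acc_async_noval),
+   as the body below distinguishes.  */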
+{ + if (num_waits) + { + va_list ap; + + va_start (ap, num_waits); + goacc_wait (async, num_waits, &ap); + va_end (ap); + } + else if (async == acc_async_sync) + acc_wait_all (); + else if (async == acc_async_noval) + goacc_thread ()->dev->openacc.async_wait_all_async_func (acc_async_noval); +} + +int +GOACC_get_num_threads (void) +{ + return 1; +} + +int +GOACC_get_thread_num (void) +{ + return 0; +} + +void +GOACC_declare (int device, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds) +{ + int i; + + for (i = 0; i < mapnum; i++) + { + unsigned char kind = kinds[i] & 0xff; + + if (kind == GOMP_MAP_POINTER || kind == GOMP_MAP_TO_PSET) + continue; + + switch (kind) + { + case GOMP_MAP_FORCE_ALLOC: + case GOMP_MAP_FORCE_FROM: + case GOMP_MAP_FORCE_TO: + case GOMP_MAP_POINTER: + case GOMP_MAP_DELETE: + GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], + &kinds[i], 0, 0); + break; + + case GOMP_MAP_FORCE_DEVICEPTR: + break; + + case GOMP_MAP_ALLOC: + if (!acc_is_present (hostaddrs[i], sizes[i])) + GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], + &kinds[i], 0, 0); + break; + + case GOMP_MAP_TO: + GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], + &kinds[i], 0, 0); + + break; + + case GOMP_MAP_FROM: + kinds[i] = GOMP_MAP_FORCE_FROM; + GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], + &kinds[i], 0, 0); + break; + + case GOMP_MAP_FORCE_PRESENT: + if (!acc_is_present (hostaddrs[i], sizes[i])) + gomp_fatal ("[%p,%ld] is not mapped", hostaddrs[i], + (unsigned long) sizes[i]); + break; + + default: + assert (0); + break; + } + } +} --- libgomp/oacc-cuda.c.jj 2016-07-13 16:57:04.432535397 +0200 +++ libgomp/oacc-cuda.c 2016-07-13 16:57:04.432535397 +0200 @@ -0,0 +1,86 @@ +/* OpenACC Runtime Library: CUDA support glue. + + Copyright (C) 2014-2016 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
*/ + +#include "openacc.h" +#include "config.h" +#include "libgomp.h" +#include "oacc-int.h" + +void * +acc_get_current_cuda_device (void) +{ + struct goacc_thread *thr = goacc_thread (); + + if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func) + return thr->dev->openacc.cuda.get_current_device_func (); + + return NULL; +} + +void * +acc_get_current_cuda_context (void) +{ + struct goacc_thread *thr = goacc_thread (); + + if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func) + return thr->dev->openacc.cuda.get_current_context_func (); + + return NULL; +} + +void * +acc_get_cuda_stream (int async) +{ + struct goacc_thread *thr = goacc_thread (); + + if (async < 0) + return NULL; + + if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func) + return thr->dev->openacc.cuda.get_stream_func (async); + + return NULL; +} + +int +acc_set_cuda_stream (int async, void *stream) +{ + struct goacc_thread *thr; + + if (async < 0 || stream == NULL) + return 0; + + goacc_lazy_initialize (); + + thr = goacc_thread (); + + if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func) + return thr->dev->openacc.cuda.set_stream_func (async, stream); + + return -1; +} --- libgomp/openacc_lib.h.jj 2016-07-13 16:57:13.486423134 +0200 +++ libgomp/openacc_lib.h 2016-07-13 16:57:13.486423134 +0200 @@ -0,0 +1,382 @@ +! OpenACC Runtime Library Definitions. -*- mode: fortran -*- + +! Copyright (C) 2014-2016 Free Software Foundation, Inc. + +! Contributed by Tobias Burnus +! and Mentor Embedded. + +! This file is part of the GNU Offloading and Multi Processing Library +! (libgomp). + +! Libgomp is free software; you can redistribute it and/or modify it +! under the terms of the GNU General Public License as published by +! the Free Software Foundation; either version 3, or (at your option) +! any later version. + +! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY +! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +! FOR A PARTICULAR PURPOSE. See the GNU General Public License for +! more details. + +! Under Section 7 of GPL version 3, you are granted additional +! permissions described in the GCC Runtime Library Exception, version +! 3.1, as published by the Free Software Foundation. + +! You should have received a copy of the GNU General Public License and +! a copy of the GCC Runtime Library Exception along with this program; +! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +! . + +! NOTE: Due to the use of dimension (..), the code only works when compiled +! with -std=f2008ts/gnu/legacy but not with other standard settings. +! Alternatively, the user can use the module version, which permits +! compilation with -std=f95. + + integer, parameter :: acc_device_kind = 4 + +! Keep in sync with include/gomp-constants.h. + integer (acc_device_kind), parameter :: acc_device_none = 0 + integer (acc_device_kind), parameter :: acc_device_default = 1 + integer (acc_device_kind), parameter :: acc_device_host = 2 +! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 +! removed. + integer (acc_device_kind), parameter :: acc_device_not_host = 4 + integer (acc_device_kind), parameter :: acc_device_nvidia = 5 + + integer, parameter :: acc_handle_kind = 4 + +! Keep in sync with include/gomp-constants.h. 
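[Reviewer note] An interop sketch for the CUDA glue above: fetch the stream backing an async queue, or hand one in. In this host-only configuration every cuda.* hook is NULL, so the getters return NULL and the setter falls through to its own 0/-1 results.

  #include "openacc.h"

  /* NULL when no CUDA-capable plugin is attached (always the case for
     the host device).  */
  void *
  stream_for_queue (int async)
  {
    return acc_get_cuda_stream (async);
  }

  int
  adopt_stream (int async, void *cuda_stream)
  {
    /* acc_set_cuda_stream itself yields 0 on invalid input and -1 when
       the device has no set_stream hook; anything else comes from the
       plugin.  */
    return acc_set_cuda_stream (async, cuda_stream);
  }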
+ integer (acc_handle_kind), parameter :: acc_async_noval = -1 + integer (acc_handle_kind), parameter :: acc_async_sync = -2 + + integer, parameter :: openacc_version = 201306 + + interface acc_get_num_devices + function acc_get_num_devices_h (d) + import acc_device_kind + integer acc_get_num_devices_h + integer (acc_device_kind) d + end function + end interface + + interface acc_set_device_type + subroutine acc_set_device_type_h (d) + import acc_device_kind + integer (acc_device_kind) d + end subroutine + end interface + + interface acc_get_device_type + function acc_get_device_type_h () + import acc_device_kind + integer (acc_device_kind) acc_get_device_type_h + end function + end interface + + interface acc_set_device_num + subroutine acc_set_device_num_h (n, d) + import acc_device_kind + integer n + integer (acc_device_kind) d + end subroutine + end interface + + interface acc_get_device_num + function acc_get_device_num_h (d) + import acc_device_kind + integer acc_get_device_num_h + integer (acc_device_kind) d + end function + end interface + + interface acc_async_test + function acc_async_test_h (a) + logical acc_async_test_h + integer a + end function + end interface + + interface acc_async_test_all + function acc_async_test_all_h () + logical acc_async_test_all_h + end function + end interface + + interface acc_wait + subroutine acc_wait_h (a) + integer a + end subroutine + end interface + + interface acc_wait_async + subroutine acc_wait_async_h (a1, a2) + integer a1, a2 + end subroutine + end interface + + interface acc_wait_all + subroutine acc_wait_all_h () + end subroutine + end interface + + interface acc_wait_all_async + subroutine acc_wait_all_async_h (a) + integer a + end subroutine + end interface + + interface acc_init + subroutine acc_init_h (devicetype) + import acc_device_kind + integer (acc_device_kind) devicetype + end subroutine + end interface + + interface acc_shutdown + subroutine acc_shutdown_h (devicetype) + import acc_device_kind + integer (acc_device_kind) devicetype + end subroutine + end interface + + interface acc_on_device + function acc_on_device_h (devicetype) + import acc_device_kind + logical acc_on_device_h + integer (acc_device_kind) devicetype + end function + end interface + + ! acc_malloc: Only available in C/C++ + ! 
acc_free: Only available in C/C++ + + interface acc_copyin + subroutine acc_copyin_32_h (a, len) + use iso_c_binding, only: c_int32_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_copyin_64_h (a, len) + use iso_c_binding, only: c_int64_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_copyin_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + end interface + + interface acc_present_or_copyin + subroutine acc_present_or_copyin_32_h (a, len) + use iso_c_binding, only: c_int32_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_present_or_copyin_64_h (a, len) + use iso_c_binding, only: c_int64_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_present_or_copyin_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + end interface + + interface acc_pcopyin + subroutine acc_pcopyin_32_h (a, len) + use iso_c_binding, only: c_int32_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_pcopyin_64_h (a, len) + use iso_c_binding, only: c_int64_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_pcopyin_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + end interface + + interface acc_create + subroutine acc_create_32_h (a, len) + use iso_c_binding, only: c_int32_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_create_64_h (a, len) + use iso_c_binding, only: c_int64_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_create_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + end interface + + interface acc_present_or_create + subroutine acc_present_or_create_32_h (a, len) + use iso_c_binding, only: c_int32_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_present_or_create_64_h (a, len) + use iso_c_binding, only: c_int64_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_present_or_create_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + end interface + + interface acc_pcreate + subroutine acc_pcreate_32_h (a, len) + use iso_c_binding, only: c_int32_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_pcreate_64_h (a, len) + use iso_c_binding, only: c_int64_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_pcreate_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + end interface + + interface acc_copyout + subroutine acc_copyout_32_h (a, len) + use iso_c_binding, only: c_int32_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_copyout_64_h (a, len) + use iso_c_binding, only: c_int64_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), 
dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_copyout_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + end interface + + interface acc_delete + subroutine acc_delete_32_h (a, len) + use iso_c_binding, only: c_int32_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_delete_64_h (a, len) + use iso_c_binding, only: c_int64_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_delete_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + end interface + + interface acc_update_device + subroutine acc_update_device_32_h (a, len) + use iso_c_binding, only: c_int32_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_update_device_64_h (a, len) + use iso_c_binding, only: c_int64_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_update_device_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + end interface + + interface acc_update_self + subroutine acc_update_self_32_h (a, len) + use iso_c_binding, only: c_int32_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_update_self_64_h (a, len) + use iso_c_binding, only: c_int64_t + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_update_self_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + end interface + + ! acc_map_data: Only available in C/C++ + ! acc_unmap_data: Only available in C/C++ + ! acc_deviceptr: Only available in C/C++ + ! acc_ostptr: Only available in C/C++ + + interface acc_is_present + function acc_is_present_32_h (a, len) + use iso_c_binding, only: c_int32_t + logical acc_is_present_32_h + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + end function + + function acc_is_present_64_h (a, len) + use iso_c_binding, only: c_int64_t + logical acc_is_present_64_h + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + end function + + function acc_is_present_array_h (a) + logical acc_is_present_array_h + type (*), dimension (..), contiguous :: a + end function + end interface + + ! acc_memcpy_to_device: Only available in C/C++ + ! acc_memcpy_from_device: Only available in C/C++ --- libgomp/gomp-constants.h.jj 2016-07-14 16:02:47.212545826 +0200 +++ libgomp/gomp-constants.h 2016-05-26 21:04:40.000000000 +0200 @@ -0,0 +1,259 @@ +/* Communication between GCC and libgomp. + + Copyright (C) 2014-2015 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. 
--- libgomp/gomp-constants.h.jj	2016-07-14 16:02:47.212545826 +0200
+++ libgomp/gomp-constants.h	2016-05-26 21:04:40.000000000 +0200
@@ -0,0 +1,259 @@
+/* Communication between GCC and libgomp.
+
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GOMP_CONSTANTS_H
+#define GOMP_CONSTANTS_H 1
+
+/* Memory mapping types.  */
+
+/* One byte.  */
+#define GOMP_MAP_LAST			(1 << 8)
+
+#define GOMP_MAP_FLAG_TO		(1 << 0)
+#define GOMP_MAP_FLAG_FROM		(1 << 1)
+/* Special map kinds, enumerated starting here.  */
+#define GOMP_MAP_FLAG_SPECIAL_0		(1 << 2)
+#define GOMP_MAP_FLAG_SPECIAL_1		(1 << 3)
+#define GOMP_MAP_FLAG_SPECIAL_2		(1 << 4)
+#define GOMP_MAP_FLAG_SPECIAL		(GOMP_MAP_FLAG_SPECIAL_1 \
+					 | GOMP_MAP_FLAG_SPECIAL_0)
+/* Flag to force a specific behavior (or else, trigger a run-time error).  */
+#define GOMP_MAP_FLAG_FORCE		(1 << 7)
+
+enum gomp_map_kind
+  {
+    /* If not already present, allocate.  */
+    GOMP_MAP_ALLOC =			0,
+    /* ..., and copy to device.  */
+    GOMP_MAP_TO =			(GOMP_MAP_ALLOC | GOMP_MAP_FLAG_TO),
+    /* ..., and copy from device.  */
+    GOMP_MAP_FROM =			(GOMP_MAP_ALLOC | GOMP_MAP_FLAG_FROM),
+    /* ..., and copy to and from device.  */
+    GOMP_MAP_TOFROM =			(GOMP_MAP_TO | GOMP_MAP_FROM),
+    /* The following kind is an internal only map kind, used for pointer based
+       array sections.  OMP_CLAUSE_SIZE for these is not the pointer size,
+       which is implicitly POINTER_SIZE_UNITS, but the bias.  */
+    GOMP_MAP_POINTER =			(GOMP_MAP_FLAG_SPECIAL_0 | 0),
+    /* Also internal, behaves like GOMP_MAP_TO, but additionally any
+       GOMP_MAP_POINTER records consecutive after it which have addresses
+       falling into that range will not be ignored if GOMP_MAP_TO_PSET wasn't
+       mapped already.  */
+    GOMP_MAP_TO_PSET =			(GOMP_MAP_FLAG_SPECIAL_0 | 1),
+    /* Must already be present.  */
+    GOMP_MAP_FORCE_PRESENT =		(GOMP_MAP_FLAG_SPECIAL_0 | 2),
+    /* Deallocate a mapping, without copying from device.  */
+    GOMP_MAP_DELETE =			(GOMP_MAP_FLAG_SPECIAL_0 | 3),
+    /* Is a device pointer.  OMP_CLAUSE_SIZE for these is unused; is implicitly
+       POINTER_SIZE_UNITS.  */
+    GOMP_MAP_FORCE_DEVICEPTR =		(GOMP_MAP_FLAG_SPECIAL_1 | 0),
+    /* OpenACC device_resident.  */
+    GOMP_MAP_DEVICE_RESIDENT =		(GOMP_MAP_FLAG_SPECIAL_1 | 1),
+    /* OpenACC link.  */
+    GOMP_MAP_LINK =			(GOMP_MAP_FLAG_SPECIAL_1 | 2),
+    /* Do not map, copy bits for firstprivate instead.  */
+    GOMP_MAP_FIRSTPRIVATE =		(GOMP_MAP_FLAG_SPECIAL | 0),
+    /* Similarly, but store the value in the pointer rather than
+       pointed by the pointer.  */
+    GOMP_MAP_FIRSTPRIVATE_INT =		(GOMP_MAP_FLAG_SPECIAL | 1),
+    /* Pointer translate host address into device address and copy that
+       back to host.  */
+    GOMP_MAP_USE_DEVICE_PTR =		(GOMP_MAP_FLAG_SPECIAL | 2),
+    /* Allocate a zero length array section.  Prefer next non-zero length
+       mapping over previous non-zero length mapping over zero length mapping
+       at the address.  If not already mapped, do nothing (and pointer
+       translate to NULL).  */
+    GOMP_MAP_ZERO_LEN_ARRAY_SECTION =	(GOMP_MAP_FLAG_SPECIAL | 3),
+    /* Allocate.  */
+    GOMP_MAP_FORCE_ALLOC =		(GOMP_MAP_FLAG_FORCE | GOMP_MAP_ALLOC),
+    /* ..., and copy to device.  */
+    GOMP_MAP_FORCE_TO =			(GOMP_MAP_FLAG_FORCE | GOMP_MAP_TO),
+    /* ..., and copy from device.  */
+    GOMP_MAP_FORCE_FROM =		(GOMP_MAP_FLAG_FORCE | GOMP_MAP_FROM),
+    /* ..., and copy to and from device.  */
+    GOMP_MAP_FORCE_TOFROM =		(GOMP_MAP_FLAG_FORCE | GOMP_MAP_TOFROM),
+    /* If not already present, allocate.  And unconditionally copy to
+       device.  */
+    GOMP_MAP_ALWAYS_TO =		(GOMP_MAP_FLAG_SPECIAL_2 | GOMP_MAP_TO),
+    /* If not already present, allocate.  And unconditionally copy from
+       device.  */
+    GOMP_MAP_ALWAYS_FROM =		(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_FROM),
+    /* If not already present, allocate.  And unconditionally copy to and from
+       device.  */
+    GOMP_MAP_ALWAYS_TOFROM =		(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_TOFROM),
+    /* Map a sparse struct; the address is the base of the structure,
+       alignment its required alignment, and size is the number of adjacent
+       entries that belong to the struct.  The adjacent entries should be
+       sorted by increasing address, so it is easy to determine lowest needed
+       address (address of the first adjacent entry) and highest needed
+       address (address of the last adjacent entry plus its size).  */
+    GOMP_MAP_STRUCT =			(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_FLAG_SPECIAL | 0),
+    /* On a location of a pointer/reference that is assumed to be already
+       mapped earlier, store the translated address of the preceding mapping.
+       No refcount is bumped by this, and the store is done unconditionally.  */
+    GOMP_MAP_ALWAYS_POINTER =		(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_FLAG_SPECIAL | 1),
+    /* Forced deallocation of zero length array section.  */
+    GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION
+      =					(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_FLAG_SPECIAL | 3),
+    /* Decrement usage count and deallocate if zero.  */
+    GOMP_MAP_RELEASE =			(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_DELETE),
+
+    /* Internal to GCC, not used in libgomp.  */
+    /* Do not map, but pointer assign a pointer instead.  */
+    GOMP_MAP_FIRSTPRIVATE_POINTER =	(GOMP_MAP_LAST | 1),
+    /* Do not map, but pointer assign a reference instead.  */
+    GOMP_MAP_FIRSTPRIVATE_REFERENCE =	(GOMP_MAP_LAST | 2)
+  };
+
+#define GOMP_MAP_COPY_TO_P(X) \
+  (!((X) & GOMP_MAP_FLAG_SPECIAL) \
+   && ((X) & GOMP_MAP_FLAG_TO))
+
+#define GOMP_MAP_COPY_FROM_P(X) \
+  (!((X) & GOMP_MAP_FLAG_SPECIAL) \
+   && ((X) & GOMP_MAP_FLAG_FROM))
+
+#define GOMP_MAP_POINTER_P(X) \
+  ((X) == GOMP_MAP_POINTER)
+
+#define GOMP_MAP_ALWAYS_TO_P(X) \
+  (((X) == GOMP_MAP_ALWAYS_TO) || ((X) == GOMP_MAP_ALWAYS_TOFROM))
+
+#define GOMP_MAP_ALWAYS_FROM_P(X) \
+  (((X) == GOMP_MAP_ALWAYS_FROM) || ((X) == GOMP_MAP_ALWAYS_TOFROM))
+
+#define GOMP_MAP_ALWAYS_P(X) \
+  (GOMP_MAP_ALWAYS_TO_P (X) || ((X) == GOMP_MAP_ALWAYS_FROM))
+
+
+/* Asynchronous behavior.  Keep in sync with
+   libgomp/{openacc.h,openacc.f90,openacc_lib.h}:acc_async_t.  */
+
+#define GOMP_ASYNC_NOVAL		-1
+#define GOMP_ASYNC_SYNC			-2
+
+
+/* Device codes.  Keep in sync with
+   libgomp/{openacc.h,openacc.f90,openacc_lib.h}:acc_device_t as well as
+   libgomp/libgomp-plugin.h.  */
+#define GOMP_DEVICE_NONE		0
+#define GOMP_DEVICE_DEFAULT		1
+#define GOMP_DEVICE_HOST		2
+/* #define GOMP_DEVICE_HOST_NONSHM	3 removed.  */
+#define GOMP_DEVICE_NOT_HOST		4
+#define GOMP_DEVICE_NVIDIA_PTX		5
+#define GOMP_DEVICE_INTEL_MIC		6
+#define GOMP_DEVICE_HSA			7
+
+#define GOMP_DEVICE_ICV			-1
+#define GOMP_DEVICE_HOST_FALLBACK	-2
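Since every map kind is composed from the flag bits defined above, the relationships can be checked mechanically. A small illustrative test program (not part of the patch) compiled against this gomp-constants.h:

```c
#include <assert.h>
#include "gomp-constants.h"

int main (void)
{
  /* TOFROM is literally ALLOC plus both copy-direction flag bits.  */
  assert (GOMP_MAP_TOFROM
	  == (GOMP_MAP_ALLOC | GOMP_MAP_FLAG_TO | GOMP_MAP_FLAG_FROM));

  /* The COPY_*_P predicates reject the special kinds, even though some
     of those happen to have the TO/FROM bits set.  */
  assert (GOMP_MAP_COPY_TO_P (GOMP_MAP_TO));
  assert (GOMP_MAP_COPY_TO_P (GOMP_MAP_FORCE_TO));
  assert (!GOMP_MAP_COPY_TO_P (GOMP_MAP_FIRSTPRIVATE));

  /* The FORCE_* variants only add GOMP_MAP_FLAG_FORCE.  */
  assert (GOMP_MAP_FORCE_TOFROM == (GOMP_MAP_TOFROM | GOMP_MAP_FLAG_FORCE));
  return 0;
}
```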
+/* GOMP_task/GOMP_taskloop* flags argument.  */
+#define GOMP_TASK_FLAG_UNTIED		(1 << 0)
+#define GOMP_TASK_FLAG_FINAL		(1 << 1)
+#define GOMP_TASK_FLAG_MERGEABLE	(1 << 2)
+#define GOMP_TASK_FLAG_DEPEND		(1 << 3)
+#define GOMP_TASK_FLAG_PRIORITY		(1 << 4)
+#define GOMP_TASK_FLAG_UP		(1 << 8)
+#define GOMP_TASK_FLAG_GRAINSIZE	(1 << 9)
+#define GOMP_TASK_FLAG_IF		(1 << 10)
+#define GOMP_TASK_FLAG_NOGROUP		(1 << 11)
+
+/* GOMP_target{_ext,update_ext,enter_exit_data} flags argument.  */
+#define GOMP_TARGET_FLAG_NOWAIT		(1 << 0)
+#define GOMP_TARGET_FLAG_EXIT_DATA	(1 << 1)
+/* Internal to libgomp.  */
+#define GOMP_TARGET_FLAG_UPDATE		(1U << 31)
+
+/* Versions of libgomp and device-specific plugins.  GOMP_VERSION
+   should be incremented whenever an ABI-incompatible change is introduced
+   to the plugin interface defined in libgomp/libgomp.h.  */
+#define GOMP_VERSION	1
+#define GOMP_VERSION_NVIDIA_PTX 1
+#define GOMP_VERSION_INTEL_MIC 0
+#define GOMP_VERSION_HSA 0
+
+#define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV))
+#define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff)
+#define GOMP_VERSION_DEV(PACK) ((PACK) & 0xffff)
+
+#define GOMP_DIM_GANG	0
+#define GOMP_DIM_WORKER	1
+#define GOMP_DIM_VECTOR	2
+#define GOMP_DIM_MAX	3
+#define GOMP_DIM_MASK(X) (1u << (X))
+
+/* Variadic launch arguments.  End of list is marked by a zero.  */
+#define GOMP_LAUNCH_DIM 1  /* Launch dimensions, op = mask */
+#define GOMP_LAUNCH_ASYNC 2  /* Async, op = cst val if not MAX  */
+#define GOMP_LAUNCH_WAIT 3  /* Waits, op = num waits.  */
+#define GOMP_LAUNCH_CODE_SHIFT	28
+#define GOMP_LAUNCH_DEVICE_SHIFT 16
+#define GOMP_LAUNCH_OP_SHIFT 0
+#define GOMP_LAUNCH_PACK(CODE,DEVICE,OP) \
+  (((CODE) << GOMP_LAUNCH_CODE_SHIFT) \
+   | ((DEVICE) << GOMP_LAUNCH_DEVICE_SHIFT) \
+   | ((OP) << GOMP_LAUNCH_OP_SHIFT))
+#define GOMP_LAUNCH_CODE(X) (((X) >> GOMP_LAUNCH_CODE_SHIFT) & 0xf)
+#define GOMP_LAUNCH_DEVICE(X) (((X) >> GOMP_LAUNCH_DEVICE_SHIFT) & 0xfff)
+#define GOMP_LAUNCH_OP(X) (((X) >> GOMP_LAUNCH_OP_SHIFT) & 0xffff)
+#define GOMP_LAUNCH_OP_MAX 0xffff
+
+/* Bitmask to apply in order to find out the intended device of a target
+   argument.  */
+#define GOMP_TARGET_ARG_DEVICE_MASK		((1 << 7) - 1)
+/* The target argument is significant for all devices.  */
+#define GOMP_TARGET_ARG_DEVICE_ALL		0
+
+/* Flag set when the value is stored in the subsequent element of the
+   device-specific argument values array.  */
+#define GOMP_TARGET_ARG_SUBSEQUENT_PARAM	(1 << 7)
+
+/* Bitmask to apply to a target argument to find out the value identifier.  */
+#define GOMP_TARGET_ARG_ID_MASK			(((1 << 8) - 1) << 8)
+/* Target argument index of NUM_TEAMS.  */
+#define GOMP_TARGET_ARG_NUM_TEAMS		(1 << 8)
+/* Target argument index of THREAD_LIMIT.  */
+#define GOMP_TARGET_ARG_THREAD_LIMIT		(2 << 8)
+
+/* If the value is directly embedded in the target argument, it should be
+   a 16-bit value at most and shifted by this many bits.  */
+#define GOMP_TARGET_ARG_VALUE_SHIFT		16
+
+/* HSA specific data structures.  */
+
+/* Identifiers of device-specific target arguments.  */
+#define GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES	(1 << 8)
+
+#endif
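The version and launch words are plain shift-and-mask encodings, so packing and unpacking round-trip exactly. An illustrative check (again a standalone sketch, not part of the patch):

```c
#include <assert.h>
#include "gomp-constants.h"

int main (void)
{
  /* Library/device version pair packed into one word.  */
  int v = GOMP_VERSION_PACK (GOMP_VERSION, GOMP_VERSION_NVIDIA_PTX);
  assert (GOMP_VERSION_LIB (v) == GOMP_VERSION);
  assert (GOMP_VERSION_DEV (v) == GOMP_VERSION_NVIDIA_PTX);

  /* Launch argument: code in the top nibble, device id in the middle
     bits, 16-bit operand at the bottom.  */
  int l = GOMP_LAUNCH_PACK (GOMP_LAUNCH_DIM, 0,
			    GOMP_DIM_MASK (GOMP_DIM_GANG));
  assert (GOMP_LAUNCH_CODE (l) == GOMP_LAUNCH_DIM);
  assert (GOMP_LAUNCH_OP (l) == GOMP_DIM_MASK (GOMP_DIM_GANG));
  return 0;
}
```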
--- libgomp/oacc-mem.c.jj	2016-07-13 16:57:04.433535385 +0200
+++ libgomp/oacc-mem.c	2016-07-14 15:39:44.644631308 +0200
@@ -0,0 +1,204 @@
+/* OpenACC Runtime initialization routines
+
+   Copyright (C) 2013-2016 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp.h"
+#include "gomp-constants.h"
+#include "oacc-int.h"
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* OpenACC is silent on how memory exhaustion is indicated.  We return
+   NULL.  */
+
+void *
+acc_malloc (size_t s)
+{
+  if (!s)
+    return NULL;
+
+  goacc_lazy_initialize ();
+  return malloc (s);
+}
+
+/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
+   the device address is mapped.  We choose to check if it is mapped,
+   and if it is, to unmap it.  */
+void
+acc_free (void *d)
+{
+  free (d);
+}
+
+void
+acc_memcpy_to_device (void *d, void *h, size_t s)
+{
+  memmove (d, h, s);
+}
+
+void
+acc_memcpy_from_device (void *h, void *d, size_t s)
+{
+  memmove (h, d, s);
+}
+
+/* Return the device pointer that corresponds to host data H.  Or NULL
+   if no mapping.  */
+
+void *
+acc_deviceptr (void *h)
+{
+  goacc_lazy_initialize ();
+  return h;
+}
+
+/* Return the host pointer that corresponds to device data D.  Or NULL
+   if no mapping.  */
+
+void *
+acc_hostptr (void *d)
+{
+  goacc_lazy_initialize ();
+  return d;
+}
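Because this variant targets shared memory, the device is the host: the address-translation routines are identity functions and acc_malloc/acc_free devolve to malloc/free. A hedged usage sketch, assuming a build that links against this shared-memory fallback:

```c
#include <openacc.h>
#include <assert.h>

int main (void)
{
  int x = 42;

  /* With host == device, both translations return the pointer
     unchanged (per the acc_deviceptr/acc_hostptr above).  */
  assert (acc_deviceptr (&x) == (void *) &x);
  assert (acc_hostptr (&x) == (void *) &x);

  void *d = acc_malloc (64);		     /* malloc under the hood here */
  acc_memcpy_to_device (d, &x, sizeof (x));  /* memmove on shared memory */
  acc_free (d);
  return 0;
}
```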
+/* Return 1 if host data [H,+S] is present on the device.  */
+
+int
+acc_is_present (void *h, size_t s)
+{
+  if (!s || !h)
+    return 0;
+
+  goacc_lazy_initialize ();
+  return h != NULL;
+}
+
+/* Create a mapping for host [H,+S] -> device [D,+S] */
+
+void
+acc_map_data (void *h, void *d, size_t s)
+{
+  goacc_lazy_initialize ();
+
+  if (d != h)
+    gomp_fatal ("cannot map data on shared-memory system");
+}
+
+void
+acc_unmap_data (void *h)
+{
+}
+
+#define FLAG_PRESENT (1 << 0)
+#define FLAG_CREATE (1 << 1)
+#define FLAG_COPY (1 << 2)
+
+static void *
+present_create_copy (unsigned f, void *h, size_t s)
+{
+  if (!h || !s)
+    gomp_fatal ("[%p,+%d] is a bad range", (void *)h, (int)s);
+
+  goacc_lazy_initialize ();
+  return h;
+}
+
+void *
+acc_create (void *h, size_t s)
+{
+  return present_create_copy (FLAG_CREATE, h, s);
+}
+
+void *
+acc_copyin (void *h, size_t s)
+{
+  return present_create_copy (FLAG_CREATE | FLAG_COPY, h, s);
+}
+
+void *
+acc_present_or_create (void *h, size_t s)
+{
+  return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s);
+}
+
+void *
+acc_present_or_copyin (void *h, size_t s)
+{
+  return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s);
+}
+
+#define FLAG_COPYOUT (1 << 0)
+
+static void
+delete_copyout (unsigned f, void *h, size_t s, const char *libfnname)
+{
+}
+
+void
+acc_delete (void *h, size_t s)
+{
+  delete_copyout (0, h, s, __FUNCTION__);
+}
+
+void
+acc_copyout (void *h, size_t s)
+{
+  delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__);
+}
+
+static void
+update_dev_host (int is_dev, void *h, size_t s)
+{
+  goacc_lazy_initialize ();
+}
+
+void
+acc_update_device (void *h, size_t s)
+{
+  update_dev_host (1, h, s);
+}
+
+void
+acc_update_self (void *h, size_t s)
+{
+  update_dev_host (0, h, s);
+}
+
+void
+gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
+			 void *kinds)
+{
+}
+
+void
+gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
+{
+}
--- libgomp/oacc-plugin.h.jj	2016-07-13 16:57:13.487423121 +0200
+++ libgomp/oacc-plugin.h	2016-07-13 16:57:13.487423121 +0200
@@ -0,0 +1,33 @@
+/* Copyright (C) 2014-2016 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef OACC_PLUGIN_H
+#define OACC_PLUGIN_H 1
+
+extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
+extern void *GOMP_PLUGIN_acc_thread (void);
+
+#endif
--- libgomp/taskloop.c.jj	2016-07-13 16:57:18.935355570 +0200
+++ libgomp/taskloop.c	2016-07-13 16:57:18.935355570 +0200
@@ -0,0 +1,340 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file handles the taskloop construct.  It is included twice, once
+   for the long and once for unsigned long long variant.  */
+
+/* Called when encountering a taskloop construct.  FLAGS carries the
+   GOMP_TASK_FLAG_* bits: if GOMP_TASK_FLAG_IF is clear, execution of the
+   generated tasks must not be delayed; if GOMP_TASK_FLAG_UNTIED is set,
+   a task may be executed by any member of the team.  */
+
+void
+GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
+	       long arg_size, long arg_align, unsigned flags,
+	       unsigned long num_tasks, int priority,
+	       TYPE start, TYPE end, TYPE step)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+
+#ifdef HAVE_BROKEN_POSIX_SEMAPHORES
+  /* If pthread_mutex_* is used for omp_*lock*, then each task must be
+     tied to one thread all the time.  This means UNTIED tasks must be
+     tied and if CPYFN is non-NULL IF(0) must be forced, as CPYFN
+     might be running on different thread than FN.  */
+  if (cpyfn)
+    flags &= ~GOMP_TASK_FLAG_IF;
+  flags &= ~GOMP_TASK_FLAG_UNTIED;
+#endif
+
+  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
+  if (team && gomp_team_barrier_cancelled (&team->barrier))
+    return;
+
+#ifdef TYPE_is_long
+  TYPE s = step;
+  if (step > 0)
+    {
+      if (start >= end)
+	return;
+      s--;
+    }
+  else
+    {
+      if (start <= end)
+	return;
+      s++;
+    }
+  UTYPE n = (end - start + s) / step;
+#else
+  UTYPE n;
+  if (flags & GOMP_TASK_FLAG_UP)
+    {
+      if (start >= end)
+	return;
+      n = (end - start + step - 1) / step;
+    }
+  else
+    {
+      if (start <= end)
+	return;
+      n = (start - end - step - 1) / -step;
+    }
+#endif
+
+  TYPE task_step = step;
+  unsigned long nfirst = n;
+  if (flags & GOMP_TASK_FLAG_GRAINSIZE)
+    {
+      unsigned long grainsize = num_tasks;
+#ifdef TYPE_is_long
+      num_tasks = n / grainsize;
+#else
+      UTYPE ndiv = n / grainsize;
+      num_tasks = ndiv;
+      if (num_tasks != ndiv)
+	num_tasks = ~0UL;
+#endif
+      if (num_tasks <= 1)
+	{
+	  num_tasks = 1;
+	  task_step = end - start;
+	}
+      else if (num_tasks >= grainsize
+#ifndef TYPE_is_long
+	       && num_tasks != ~0UL
+#endif
+	      )
+	{
+	  UTYPE mul = num_tasks * grainsize;
+	  task_step = (TYPE) grainsize * step;
+	  if (mul != n)
+	    {
+	      task_step += step;
+	      nfirst = n - mul - 1;
+	    }
+	}
+      else
+	{
+	  UTYPE div = n / num_tasks;
+	  UTYPE mod = n % num_tasks;
+	  task_step = (TYPE) div * step;
+	  if (mod)
+	    {
+	      task_step += step;
+	      nfirst = mod - 1;
+	    }
+	}
+    }
+  else
+    {
+      if (num_tasks == 0)
+	num_tasks = team ?
team->nthreads : 1; + if (num_tasks >= n) + num_tasks = n; + else + { + UTYPE div = n / num_tasks; + UTYPE mod = n % num_tasks; + task_step = (TYPE) div * step; + if (mod) + { + task_step += step; + nfirst = mod - 1; + } + } + } + + if (flags & GOMP_TASK_FLAG_NOGROUP) + { + if (thr->task && thr->task->taskgroup && thr->task->taskgroup->cancelled) + return; + } + else + ialias_call (GOMP_taskgroup_start) (); + + if (priority > gomp_max_task_priority_var) + priority = gomp_max_task_priority_var; + + if ((flags & GOMP_TASK_FLAG_IF) == 0 || team == NULL + || (thr->task && thr->task->final_task) + || team->task_count + num_tasks > 64 * team->nthreads) + { + unsigned long i; + if (__builtin_expect (cpyfn != NULL, 0)) + { + struct gomp_task task[num_tasks]; + struct gomp_task *parent = thr->task; + arg_size = (arg_size + arg_align - 1) & ~(arg_align - 1); + char buf[num_tasks * arg_size + arg_align - 1]; + char *arg = (char *) (((uintptr_t) buf + arg_align - 1) + & ~(uintptr_t) (arg_align - 1)); + char *orig_arg = arg; + for (i = 0; i < num_tasks; i++) + { + gomp_init_task (&task[i], parent, gomp_icv (false)); + task[i].priority = priority; + task[i].kind = GOMP_TASK_UNDEFERRED; + task[i].final_task = (thr->task && thr->task->final_task) + || (flags & GOMP_TASK_FLAG_FINAL); + if (thr->task) + { + task[i].in_tied_task = thr->task->in_tied_task; + task[i].taskgroup = thr->task->taskgroup; + } + thr->task = &task[i]; + cpyfn (arg, data); + arg += arg_size; + } + arg = orig_arg; + for (i = 0; i < num_tasks; i++) + { + thr->task = &task[i]; + ((TYPE *)arg)[0] = start; + start += task_step; + ((TYPE *)arg)[1] = start; + if (i == nfirst) + task_step -= step; + fn (arg); + arg += arg_size; + if (!priority_queue_empty_p (&task[i].children_queue, + MEMMODEL_RELAXED)) + { + gomp_mutex_lock (&team->task_lock); + gomp_clear_parent (&task[i].children_queue); + gomp_mutex_unlock (&team->task_lock); + } + gomp_end_task (); + } + } + else + for (i = 0; i < num_tasks; i++) + { + struct gomp_task task; + + gomp_init_task (&task, thr->task, gomp_icv (false)); + task.priority = priority; + task.kind = GOMP_TASK_UNDEFERRED; + task.final_task = (thr->task && thr->task->final_task) + || (flags & GOMP_TASK_FLAG_FINAL); + if (thr->task) + { + task.in_tied_task = thr->task->in_tied_task; + task.taskgroup = thr->task->taskgroup; + } + thr->task = &task; + ((TYPE *)data)[0] = start; + start += task_step; + ((TYPE *)data)[1] = start; + if (i == nfirst) + task_step -= step; + fn (data); + if (!priority_queue_empty_p (&task.children_queue, + MEMMODEL_RELAXED)) + { + gomp_mutex_lock (&team->task_lock); + gomp_clear_parent (&task.children_queue); + gomp_mutex_unlock (&team->task_lock); + } + gomp_end_task (); + } + } + else + { + struct gomp_task *tasks[num_tasks]; + struct gomp_task *parent = thr->task; + struct gomp_taskgroup *taskgroup = parent->taskgroup; + char *arg; + int do_wake; + unsigned long i; + + for (i = 0; i < num_tasks; i++) + { + struct gomp_task *task + = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1); + tasks[i] = task; + arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1) + & ~(uintptr_t) (arg_align - 1)); + gomp_init_task (task, parent, gomp_icv (false)); + task->priority = priority; + task->kind = GOMP_TASK_UNDEFERRED; + task->in_tied_task = parent->in_tied_task; + task->taskgroup = taskgroup; + thr->task = task; + if (cpyfn) + { + cpyfn (arg, data); + task->copy_ctors_done = true; + } + else + memcpy (arg, data, arg_size); + ((TYPE *)arg)[0] = start; + start += task_step; + ((TYPE *)arg)[1] = 
start; + if (i == nfirst) + task_step -= step; + thr->task = parent; + task->kind = GOMP_TASK_WAITING; + task->fn = fn; + task->fn_data = arg; + task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1; + } + gomp_mutex_lock (&team->task_lock); + /* If parallel or taskgroup has been cancelled, don't start new + tasks. */ + if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier) + || (taskgroup && taskgroup->cancelled)) + && cpyfn == NULL, 0)) + { + gomp_mutex_unlock (&team->task_lock); + for (i = 0; i < num_tasks; i++) + { + gomp_finish_task (tasks[i]); + free (tasks[i]); + } + if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) + ialias_call (GOMP_taskgroup_end) (); + return; + } + if (taskgroup) + taskgroup->num_children += num_tasks; + for (i = 0; i < num_tasks; i++) + { + struct gomp_task *task = tasks[i]; + priority_queue_insert (PQ_CHILDREN, &parent->children_queue, + task, priority, + PRIORITY_INSERT_BEGIN, + /*last_parent_depends_on=*/false, + task->parent_depends_on); + if (taskgroup) + priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + task, priority, PRIORITY_INSERT_BEGIN, + /*last_parent_depends_on=*/false, + task->parent_depends_on); + priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority, + PRIORITY_INSERT_END, + /*last_parent_depends_on=*/false, + task->parent_depends_on); + ++team->task_count; + ++team->task_queued_count; + } + gomp_team_barrier_set_task_pending (&team->barrier); + if (team->task_running_count + !parent->in_tied_task + < team->nthreads) + { + do_wake = team->nthreads - team->task_running_count + - !parent->in_tied_task; + if ((unsigned long) do_wake > num_tasks) + do_wake = num_tasks; + } + else + do_wake = 0; + gomp_mutex_unlock (&team->task_lock); + if (do_wake) + gomp_team_barrier_wake (&team->barrier, do_wake); + } + if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) + ialias_call (GOMP_taskgroup_end) (); +} --- libgomp/priority_queue.h.jj 2016-07-13 16:57:04.438535323 +0200 +++ libgomp/priority_queue.h 2016-07-13 16:57:04.438535323 +0200 @@ -0,0 +1,485 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Aldy Hernandez . + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Header file for a priority queue of GOMP tasks. */ + +/* ?? Perhaps all the priority_tree_* functions are complex and rare + enough to go out-of-line and be moved to priority_queue.c. ?? */ + +#ifndef _PRIORITY_QUEUE_H_ +#define _PRIORITY_QUEUE_H_ + +/* One task. */ + +struct priority_node +{ + /* Next and previous chains in a circular doubly linked list for + tasks within this task's priority. 
*/
+  struct priority_node *next, *prev;
+};
+
+/* All tasks within the same priority.  */
+
+struct priority_list
+{
+  /* Priority of the tasks in this set.  */
+  int priority;
+
+  /* Tasks.  */
+  struct priority_node *tasks;
+
+  /* This points to the last of the higher priority WAITING tasks.
+     Remember that for the children queue, we have:
+
+	parent_depends_on WAITING tasks.
+	!parent_depends_on WAITING tasks.
+	TIED tasks.
+
+     This is a pointer to the last of the parent_depends_on WAITING
+     tasks which are essentially, higher priority items within their
+     priority.  */
+  struct priority_node *last_parent_depends_on;
+};
+
+/* Another splay tree instantiation, for priority_list's.  */
+typedef struct prio_splay_tree_node_s *prio_splay_tree_node;
+typedef struct prio_splay_tree_s *prio_splay_tree;
+typedef struct prio_splay_tree_key_s *prio_splay_tree_key;
+struct prio_splay_tree_key_s {
+  /* This structure must only contain a priority_list, as we cast
+     prio_splay_tree_key to priority_list throughout.  */
+  struct priority_list l;
+};
+#define splay_tree_prefix prio
+#include "splay-tree.h"
+
+/* The entry point into a priority queue of tasks.
+
+   There are two alternate implementations with which to store tasks:
+   as a balanced tree of sorts, or as a simple list of tasks.  If
+   there are only priority-0 items (ROOT is NULL), we use the simple
+   list, otherwise (ROOT is non-NULL) we use the tree.  */
+
+struct priority_queue
+{
+  /* If t.root != NULL, this is a splay tree of priority_lists to hold
+     all tasks.  This is only used if multiple priorities are in play,
+     otherwise we use the priority_list `l' below to hold all
+     (priority-0) tasks.  */
+  struct prio_splay_tree_s t;
+
+  /* If T above is NULL, only priority-0 items exist, so keep them
+     in a simple list.  */
+  struct priority_list l;
+};
+
+enum priority_insert_type {
+  /* Insert at the beginning of a priority list.  */
+  PRIORITY_INSERT_BEGIN,
+  /* Insert at the end of a priority list.  */
+  PRIORITY_INSERT_END
+};
+
+/* Used to determine in which queue a given priority node belongs in.
+   See pnode field of gomp_task.  */
+
+enum priority_queue_type
+{
+  PQ_TEAM,	    /* Node belongs in gomp_team's task_queue.  */
+  PQ_CHILDREN,	    /* Node belongs in parent's children_queue.  */
+  PQ_TASKGROUP,	    /* Node belongs in taskgroup->taskgroup_queue.  */
+  PQ_IGNORED = 999
+};
+
+/* Priority queue implementation prototypes.  */
+
+extern bool priority_queue_task_in_queue_p (enum priority_queue_type,
+					    struct priority_queue *,
+					    struct gomp_task *);
+extern void priority_queue_dump (enum priority_queue_type,
+				 struct priority_queue *);
+extern void priority_queue_verify (enum priority_queue_type,
+				   struct priority_queue *, bool);
+extern void priority_tree_remove (enum priority_queue_type,
+				  struct priority_queue *,
+				  struct priority_node *);
+extern struct gomp_task *priority_tree_next_task (enum priority_queue_type,
+						  struct priority_queue *,
+						  enum priority_queue_type,
+						  struct priority_queue *,
+						  bool *);
+
+/* Return TRUE if there is more than one priority in HEAD.  This is
+   used throughout to choose between the fast path (priority 0 only
+   items) and a world with multiple priorities.  */
+
+static inline bool
+priority_queue_multi_p (struct priority_queue *head)
+{
+  return __builtin_expect (head->t.root != NULL, 0);
+}
+
+/* Initialize a priority queue.  */
+
+static inline void
+priority_queue_init (struct priority_queue *head)
+{
+  head->t.root = NULL;
+  /* To save a few microseconds, we don't initialize head->l.priority
+     to 0 here.  It is implied that priority will be 0 if head->t.root
+     == NULL.
+
+     priority_tree_insert() will fix this when we encounter multiple
+     priorities.  */
+  head->l.tasks = NULL;
+  head->l.last_parent_depends_on = NULL;
+}
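The list insertion helper that follows (priority_list_insert) splices nodes into the circular doubly-linked `tasks` list, so both PRIORITY_INSERT_BEGIN and PRIORITY_INSERT_END are constant-time pointer updates ("end" is simply `tasks->prev`). A stripped-down standalone model of that splice (the names here are mine, not libgomp's):

```c
#include <assert.h>
#include <stddef.h>

struct node { struct node *next, *prev; };

/* Splice N in front of the list head *TASKS.  Because the list is
   circular, inserting "at the end" is the same splice without
   redirecting the head; AT_BEGIN mirrors PRIORITY_INSERT_BEGIN.  */
static void
list_insert (struct node **tasks, struct node *n, int at_begin)
{
  if (*tasks)
    {
      n->next = *tasks;
      n->prev = (*tasks)->prev;
      n->next->prev = n;
      n->prev->next = n;
      if (at_begin)
	*tasks = n;
    }
  else
    n->next = n->prev = *tasks = n;	/* singleton circular list */
}

int main (void)
{
  struct node a, b, c, *head = NULL;
  list_insert (&head, &a, 1);
  list_insert (&head, &b, 0);	/* ends up last, i.e. head->prev */
  list_insert (&head, &c, 1);	/* new first */
  assert (head == &c && head->next == &a && head->prev == &b);
  return 0;
}
```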
+
+static inline void
+priority_queue_free (struct priority_queue *head)
+{
+  /* There's nothing to do, as tasks were freed as they were removed
+     in priority_queue_remove.  */
+}
+
+/* Forward declarations.  */
+static inline size_t priority_queue_offset (enum priority_queue_type);
+static inline struct gomp_task *priority_node_to_task
+				(enum priority_queue_type,
+				 struct priority_node *);
+static inline struct priority_node *task_to_priority_node
+				(enum priority_queue_type,
+				 struct gomp_task *);
+
+/* Return TRUE if priority queue HEAD is empty.
+
+   MODEL IS MEMMODEL_ACQUIRE if we should use an acquire atomic to
+   read from the root of the queue, otherwise MEMMODEL_RELAXED if we
+   should use a plain load.  */
+
+static inline _Bool
+priority_queue_empty_p (struct priority_queue *head, enum memmodel model)
+{
+  /* Note: The acquire barriers on the loads here synchronize with
+     the write of a NULL in gomp_task_run_post_remove_parent.  It is
+     not necessary that we synchronize with other non-NULL writes at
+     this point, but we must ensure that all writes to memory by a
+     child thread task work function are seen before we exit from
+     GOMP_taskwait.  */
+  if (priority_queue_multi_p (head))
+    {
+      if (model == MEMMODEL_ACQUIRE)
+	return __atomic_load_n (&head->t.root, MEMMODEL_ACQUIRE) == NULL;
+      return head->t.root == NULL;
+    }
+  if (model == MEMMODEL_ACQUIRE)
+    return __atomic_load_n (&head->l.tasks, MEMMODEL_ACQUIRE) == NULL;
+  return head->l.tasks == NULL;
+}
+
+/* Look for a given PRIORITY in HEAD.  Return it if found, otherwise
+   return NULL.  This only applies to the tree variant in HEAD.  There
+   is no point in searching for priorities in HEAD->L.  */
+
+static inline struct priority_list *
+priority_queue_lookup_priority (struct priority_queue *head, int priority)
+{
+  if (head->t.root == NULL)
+    return NULL;
+  struct prio_splay_tree_key_s k;
+  k.l.priority = priority;
+  return (struct priority_list *)
+    prio_splay_tree_lookup (&head->t, &k);
+}
+
+/* Insert task in DATA, with PRIORITY, in the priority list in LIST.
+   LIST contains items of type TYPE.
+
+   If POS is PRIORITY_INSERT_BEGIN, the new task is inserted at the
+   top of its respective priority.  If POS is PRIORITY_INSERT_END, the
+   task is inserted at the end of its priority.
+
+   If ADJUST_PARENT_DEPENDS_ON is TRUE, LIST is a children queue, and
+   we must keep track of higher and lower priority WAITING tasks by
+   keeping the queue's last_parent_depends_on field accurate.  This
+   only applies to the children queue, and the caller must ensure LIST
+   is a children queue in this case.
+
+   If ADJUST_PARENT_DEPENDS_ON is TRUE, TASK_IS_PARENT_DEPENDS_ON is
+   set to the task's parent_depends_on field.  If
+   ADJUST_PARENT_DEPENDS_ON is FALSE, this field is irrelevant.
+
+   Return the new priority_node.  */
+
+static inline void
+priority_list_insert (enum priority_queue_type type,
+		      struct priority_list *list,
+		      struct gomp_task *task,
+		      int priority,
+		      enum priority_insert_type pos,
+		      bool adjust_parent_depends_on,
+		      bool task_is_parent_depends_on)
+{
+  struct priority_node *node = task_to_priority_node (type, task);
+  if (list->tasks)
+    {
+      /* If we are keeping track of higher/lower priority items,
+	 but this is a lower priority WAITING task
+	 (parent_depends_on != NULL), put it after all ready to
+	 run tasks.
See the comment in + priority_queue_upgrade_task for a visual on how tasks + should be organized. */ + if (adjust_parent_depends_on + && pos == PRIORITY_INSERT_BEGIN + && list->last_parent_depends_on + && !task_is_parent_depends_on) + { + struct priority_node *last_parent_depends_on + = list->last_parent_depends_on; + node->next = last_parent_depends_on->next; + node->prev = last_parent_depends_on; + } + /* Otherwise, put it at the top/bottom of the queue. */ + else + { + node->next = list->tasks; + node->prev = list->tasks->prev; + if (pos == PRIORITY_INSERT_BEGIN) + list->tasks = node; + } + node->next->prev = node; + node->prev->next = node; + } + else + { + node->next = node; + node->prev = node; + list->tasks = node; + } + if (adjust_parent_depends_on + && list->last_parent_depends_on == NULL + && task_is_parent_depends_on) + list->last_parent_depends_on = node; +} + +/* Tree version of priority_list_insert. */ + +static inline void +priority_tree_insert (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task, + int priority, + enum priority_insert_type pos, + bool adjust_parent_depends_on, + bool task_is_parent_depends_on) +{ + if (__builtin_expect (head->t.root == NULL, 0)) + { + /* The first time around, transfer any priority 0 items to the + tree. */ + if (head->l.tasks != NULL) + { + prio_splay_tree_node k = gomp_malloc (sizeof (*k)); + k->left = NULL; + k->right = NULL; + k->key.l.priority = 0; + k->key.l.tasks = head->l.tasks; + k->key.l.last_parent_depends_on = head->l.last_parent_depends_on; + prio_splay_tree_insert (&head->t, k); + head->l.tasks = NULL; + } + } + struct priority_list *list + = priority_queue_lookup_priority (head, priority); + if (!list) + { + prio_splay_tree_node k = gomp_malloc (sizeof (*k)); + k->left = NULL; + k->right = NULL; + k->key.l.priority = priority; + k->key.l.tasks = NULL; + k->key.l.last_parent_depends_on = NULL; + prio_splay_tree_insert (&head->t, k); + list = &k->key.l; + } + priority_list_insert (type, list, task, priority, pos, + adjust_parent_depends_on, + task_is_parent_depends_on); +} + +/* Generic version of priority_*_insert. */ + +static inline void +priority_queue_insert (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task, + int priority, + enum priority_insert_type pos, + bool adjust_parent_depends_on, + bool task_is_parent_depends_on) +{ +#if _LIBGOMP_CHECKING_ + if (priority_queue_task_in_queue_p (type, head, task)) + gomp_fatal ("Attempt to insert existing task %p", task); +#endif + if (priority_queue_multi_p (head) || __builtin_expect (priority > 0, 0)) + priority_tree_insert (type, head, task, priority, pos, + adjust_parent_depends_on, + task_is_parent_depends_on); + else + priority_list_insert (type, &head->l, task, priority, pos, + adjust_parent_depends_on, + task_is_parent_depends_on); +} + +/* If multiple priorities are in play, return the highest priority + task from within Q1 and Q2, while giving preference to tasks from + Q1. If the returned task is chosen from Q1, *Q1_CHOSEN_P is set to + TRUE, otherwise it is set to FALSE. + + If multiple priorities are not in play (only 0 priorities are + available), the next task is chosen exclusively from Q1. + + As a special case, Q2 can be NULL, in which case, we just choose + the highest priority WAITING task in Q1. This is an optimization + to speed up looking through only one queue. + + We assume Q1 has at least one item. 
*/ + +static inline struct gomp_task * +priority_queue_next_task (enum priority_queue_type t1, + struct priority_queue *q1, + enum priority_queue_type t2, + struct priority_queue *q2, + bool *q1_chosen_p) +{ +#if _LIBGOMP_CHECKING_ + if (priority_queue_empty_p (q1, MEMMODEL_RELAXED)) + gomp_fatal ("priority_queue_next_task: Q1 is empty"); +#endif + if (priority_queue_multi_p (q1)) + { + struct gomp_task *t + = priority_tree_next_task (t1, q1, t2, q2, q1_chosen_p); + /* If T is NULL, there are no WAITING tasks in Q1. In which + case, return any old (non-waiting) task which will cause the + caller to do the right thing when checking T->KIND == + GOMP_TASK_WAITING. */ + if (!t) + { +#if _LIBGOMP_CHECKING_ + if (*q1_chosen_p == false) + gomp_fatal ("priority_queue_next_task inconsistency"); +#endif + return priority_node_to_task (t1, q1->t.root->key.l.tasks); + } + return t; + } + else + { + *q1_chosen_p = true; + return priority_node_to_task (t1, q1->l.tasks); + } +} + +/* Remove NODE from LIST. + + If we are removing the one and only item in the list, and MODEL is + MEMMODEL_RELEASE, use an atomic release to clear the list. + + If the list becomes empty after the remove, return TRUE. */ + +static inline bool +priority_list_remove (struct priority_list *list, + struct priority_node *node, + enum memmodel model) +{ + bool empty = false; + node->prev->next = node->next; + node->next->prev = node->prev; + if (list->tasks == node) + { + if (node->next != node) + list->tasks = node->next; + else + { + /* We access task->children in GOMP_taskwait outside of + the task lock mutex region, so need a release barrier + here to ensure memory written by child_task->fn above + is flushed before the NULL is written. */ + if (model == MEMMODEL_RELEASE) + __atomic_store_n (&list->tasks, NULL, MEMMODEL_RELEASE); + else + list->tasks = NULL; + empty = true; + goto remove_out; + } + } +remove_out: +#if _LIBGOMP_CHECKING_ + memset (node, 0xaf, sizeof (*node)); +#endif + return empty; +} + +/* This is the generic version of priority_list_remove. + + Remove NODE from priority queue HEAD. HEAD contains tasks of type TYPE. + + If we are removing the one and only item in the priority queue and + MODEL is MEMMODEL_RELEASE, use an atomic release to clear the queue. + + If the queue becomes empty after the remove, return TRUE. */ + +static inline bool +priority_queue_remove (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task, + enum memmodel model) +{ +#if _LIBGOMP_CHECKING_ + if (!priority_queue_task_in_queue_p (type, head, task)) + gomp_fatal ("Attempt to remove missing task %p", task); +#endif + if (priority_queue_multi_p (head)) + { + priority_tree_remove (type, head, task_to_priority_node (type, task)); + if (head->t.root == NULL) + { + if (model == MEMMODEL_RELEASE) + /* Errr, we store NULL twice, the alternative would be to + use an atomic release directly in the splay tree + routines. Worth it? */ + __atomic_store_n (&head->t.root, NULL, MEMMODEL_RELEASE); + return true; + } + return false; + } + else + return priority_list_remove (&head->l, + task_to_priority_node (type, task), model); +} + +#endif /* _PRIORITY_QUEUE_H_ */ --- libgomp/priority_queue.c.jj 2016-07-13 16:57:04.435535360 +0200 +++ libgomp/priority_queue.c 2016-07-13 16:57:04.435535360 +0200 @@ -0,0 +1,300 @@ +/* Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by Aldy Hernandez . + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). 
+ + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Priority queue implementation of GOMP tasks. */ + +#include "libgomp.h" + +#if _LIBGOMP_CHECKING_ +#include + +/* Sanity check to verify whether a TASK is in LIST. Return TRUE if + found, FALSE otherwise. + + TYPE is the type of priority queue this task resides in. */ + +static inline bool +priority_queue_task_in_list_p (enum priority_queue_type type, + struct priority_list *list, + struct gomp_task *task) +{ + struct priority_node *p = list->tasks; + do + { + if (priority_node_to_task (type, p) == task) + return true; + p = p->next; + } + while (p != list->tasks); + return false; +} + +/* Tree version of priority_queue_task_in_list_p. */ + +static inline bool +priority_queue_task_in_tree_p (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task) +{ + struct priority_list *list + = priority_queue_lookup_priority (head, task->priority); + if (!list) + return false; + return priority_queue_task_in_list_p (type, list, task); +} + +/* Generic version of priority_queue_task_in_list_p that works for + trees or lists. */ + +bool +priority_queue_task_in_queue_p (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task) +{ + if (priority_queue_empty_p (head, MEMMODEL_RELAXED)) + return false; + if (priority_queue_multi_p (head)) + return priority_queue_task_in_tree_p (type, head, task); + else + return priority_queue_task_in_list_p (type, &head->l, task); +} + +/* Sanity check LIST to make sure the tasks therein are in the right + order. LIST is a priority list of type TYPE. + + The expected order is that GOMP_TASK_WAITING tasks come before + GOMP_TASK_TIED/GOMP_TASK_ASYNC_RUNNING ones. + + If CHECK_DEPS is TRUE, we also check that parent_depends_on WAITING + tasks come before !parent_depends_on WAITING tasks. This is only + applicable to the children queue, and the caller is expected to + ensure that we are verifying the children queue. 
*/ + +static void +priority_list_verify (enum priority_queue_type type, + struct priority_list *list, bool check_deps) +{ + bool seen_tied = false; + bool seen_plain_waiting = false; + struct priority_node *p = list->tasks; + while (1) + { + struct gomp_task *t = priority_node_to_task (type, p); + if (seen_tied && t->kind == GOMP_TASK_WAITING) + gomp_fatal ("priority_queue_verify: WAITING task after TIED"); + if (t->kind >= GOMP_TASK_TIED) + seen_tied = true; + else if (check_deps && t->kind == GOMP_TASK_WAITING) + { + if (t->parent_depends_on) + { + if (seen_plain_waiting) + gomp_fatal ("priority_queue_verify: " + "parent_depends_on after !parent_depends_on"); + } + else + seen_plain_waiting = true; + } + p = p->next; + if (p == list->tasks) + break; + } +} + +/* Callback type for priority_tree_verify_callback. */ +struct cbtype +{ + enum priority_queue_type type; + bool check_deps; +}; + +/* Verify every task in NODE. + + Callback for splay_tree_foreach. */ + +static void +priority_tree_verify_callback (prio_splay_tree_key key, void *data) +{ + struct cbtype *cb = (struct cbtype *) data; + priority_list_verify (cb->type, &key->l, cb->check_deps); +} + +/* Generic version of priority_list_verify. + + Sanity check HEAD to make sure the tasks therein are in the right + order. The priority_queue holds tasks of type TYPE. + + If CHECK_DEPS is TRUE, we also check that parent_depends_on WAITING + tasks come before !parent_depends_on WAITING tasks. This is only + applicable to the children queue, and the caller is expected to + ensure that we are verifying the children queue. */ + +void +priority_queue_verify (enum priority_queue_type type, + struct priority_queue *head, bool check_deps) +{ + if (priority_queue_empty_p (head, MEMMODEL_RELAXED)) + return; + if (priority_queue_multi_p (head)) + { + struct cbtype cb = { type, check_deps }; + prio_splay_tree_foreach (&head->t, + priority_tree_verify_callback, &cb); + } + else + priority_list_verify (type, &head->l, check_deps); +} +#endif /* _LIBGOMP_CHECKING_ */ + +/* Remove NODE from priority queue HEAD, wherever it may be inside the + tree. HEAD contains tasks of type TYPE. */ + +void +priority_tree_remove (enum priority_queue_type type, + struct priority_queue *head, + struct priority_node *node) +{ + /* ?? The only reason this function is not inlined is because we + need to find the priority within gomp_task (which has not been + completely defined in the header file). If the lack of inlining + is a concern, we could pass the priority number as a + parameter, or we could move this to libgomp.h. */ + int priority = priority_node_to_task (type, node)->priority; + + /* ?? We could avoid this lookup by keeping a pointer to the key in + the priority_node. */ + struct priority_list *list + = priority_queue_lookup_priority (head, priority); +#if _LIBGOMP_CHECKING_ + if (!list) + gomp_fatal ("Unable to find priority %d", priority); +#endif + /* If NODE was the last in its priority, clean up the priority. */ + if (priority_list_remove (list, node, MEMMODEL_RELAXED)) + { + prio_splay_tree_remove (&head->t, (prio_splay_tree_key) list); + list->tasks = NULL; +#if _LIBGOMP_CHECKING_ + memset (list, 0xaf, sizeof (*list)); +#endif + free (list); + } +} + +/* Return the highest priority WAITING task in a splay tree NODE. If + there are no WAITING tasks available, return NULL. + + NODE is a priority list containing tasks of type TYPE. + + The right most node in a tree contains the highest priority. + Recurse down to find such a node. 
If the task at that max node is + not WAITING, bubble back up and look at the remaining tasks + in-order. */ + +static struct gomp_task * +priority_tree_next_task_1 (enum priority_queue_type type, + prio_splay_tree_node node) +{ + again: + if (!node) + return NULL; + struct gomp_task *ret = priority_tree_next_task_1 (type, node->right); + if (ret) + return ret; + ret = priority_node_to_task (type, node->key.l.tasks); + if (ret->kind == GOMP_TASK_WAITING) + return ret; + node = node->left; + goto again; +} + +/* Return the highest priority WAITING task from within Q1 and Q2, + while giving preference to tasks from Q1. Q1 is a queue containing + items of type TYPE1. Q2 is a queue containing items of type TYPE2. + + Since we are mostly interested in Q1, if there are no WAITING tasks + in Q1, we don't bother checking Q2, and just return NULL. + + As a special case, Q2 can be NULL, in which case, we just choose + the highest priority WAITING task in Q1. This is an optimization + to speed up looking through only one queue. + + If the returned task is chosen from Q1, *Q1_CHOSEN_P is set to + TRUE, otherwise it is set to FALSE. */ + +struct gomp_task * +priority_tree_next_task (enum priority_queue_type type1, + struct priority_queue *q1, + enum priority_queue_type type2, + struct priority_queue *q2, + bool *q1_chosen_p) +{ + struct gomp_task *t1 = priority_tree_next_task_1 (type1, q1->t.root); + if (!t1 + /* Special optimization when only searching through one queue. */ + || !q2) + { + *q1_chosen_p = true; + return t1; + } + struct gomp_task *t2 = priority_tree_next_task_1 (type2, q2->t.root); + if (!t2 || t1->priority > t2->priority) + { + *q1_chosen_p = true; + return t1; + } + if (t2->priority > t1->priority) + { + *q1_chosen_p = false; + return t2; + } + /* If we get here, the priorities are the same, so we must look at + parent_depends_on to make our decision. */ +#if _LIBGOMP_CHECKING_ + if (t1 != t2) + gomp_fatal ("priority_tree_next_task: t1 != t2"); +#endif + if (t2->parent_depends_on && !t1->parent_depends_on) + { + *q1_chosen_p = false; + return t2; + } + *q1_chosen_p = true; + return t1; +} + +/* Priority splay trees comparison function. */ +static inline int +prio_splay_compare (prio_splay_tree_key x, prio_splay_tree_key y) +{ + if (x->l.priority == y->l.priority) + return 0; + return x->l.priority < y->l.priority ? -1 : 1; +} + +/* Define another splay tree instantiation, for priority_list's. */ +#define splay_tree_prefix prio +#define splay_tree_c +#include "splay-tree.h" --- libgomp/openacc.f90.jj 2016-07-13 16:57:04.434535373 +0200 +++ libgomp/openacc.f90 2016-07-14 19:01:54.901230875 +0200 @@ -0,0 +1,911 @@ +! OpenACC Runtime Library Definitions. + +! Copyright (C) 2014-2016 Free Software Foundation, Inc. + +! Contributed by Tobias Burnus +! and Mentor Embedded. + +! This file is part of the GNU Offloading and Multi Processing Library +! (libgomp). + +! Libgomp is free software; you can redistribute it and/or modify it +! under the terms of the GNU General Public License as published by +! the Free Software Foundation; either version 3, or (at your option) +! any later version. + +! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY +! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +! FOR A PARTICULAR PURPOSE. See the GNU General Public License for +! more details. + +! Under Section 7 of GPL version 3, you are granted additional +! permissions described in the GCC Runtime Library Exception, version +! 
3.1, as published by the Free Software Foundation. + +! You should have received a copy of the GNU General Public License and +! a copy of the GCC Runtime Library Exception along with this program; +! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +! . + +module openacc_kinds + use iso_fortran_env, only: int32 + implicit none + + private :: int32 + public :: acc_device_kind + + integer, parameter :: acc_device_kind = int32 + + public :: acc_device_none, acc_device_default, acc_device_host + public :: acc_device_not_host, acc_device_nvidia + + ! Keep in sync with include/gomp-constants.h. + integer (acc_device_kind), parameter :: acc_device_none = 0 + integer (acc_device_kind), parameter :: acc_device_default = 1 + integer (acc_device_kind), parameter :: acc_device_host = 2 + ! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed. + integer (acc_device_kind), parameter :: acc_device_not_host = 4 + integer (acc_device_kind), parameter :: acc_device_nvidia = 5 + + public :: acc_handle_kind + + integer, parameter :: acc_handle_kind = int32 + + public :: acc_async_noval, acc_async_sync + + ! Keep in sync with include/gomp-constants.h. + integer (acc_handle_kind), parameter :: acc_async_noval = -1 + integer (acc_handle_kind), parameter :: acc_async_sync = -2 + +end module + +module openacc_internal + use openacc_kinds + implicit none + + interface + function acc_get_num_devices_h (d) + import + integer acc_get_num_devices_h + integer (acc_device_kind) d + end function + + subroutine acc_set_device_type_h (d) + import + integer (acc_device_kind) d + end subroutine + + function acc_get_device_type_h () + import + integer (acc_device_kind) acc_get_device_type_h + end function + + subroutine acc_set_device_num_h (n, d) + import + integer n + integer (acc_device_kind) d + end subroutine + + function acc_get_device_num_h (d) + import + integer acc_get_device_num_h + integer (acc_device_kind) d + end function + + function acc_async_test_h (a) + logical acc_async_test_h + integer a + end function + + function acc_async_test_all_h () + logical acc_async_test_all_h + end function + + subroutine acc_wait_h (a) + integer a + end subroutine + + subroutine acc_wait_async_h (a1, a2) + integer a1, a2 + end subroutine + + subroutine acc_wait_all_h () + end subroutine + + subroutine acc_wait_all_async_h (a) + integer a + end subroutine + + subroutine acc_init_h (d) + import + integer (acc_device_kind) d + end subroutine + + subroutine acc_shutdown_h (d) + import + integer (acc_device_kind) d + end subroutine + + function acc_on_device_h (d) + import + integer (acc_device_kind) d + logical acc_on_device_h + end function + + subroutine acc_copyin_32_h (a, len) + use iso_c_binding, only: c_int32_t + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_copyin_64_h (a, len) + use iso_c_binding, only: c_int64_t + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_copyin_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + + subroutine acc_present_or_copyin_32_h (a, len) + use iso_c_binding, only: c_int32_t + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_present_or_copyin_64_h (a, len) + use iso_c_binding, only: c_int64_t + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_present_or_copyin_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + + subroutine acc_create_32_h 
(a, len) + use iso_c_binding, only: c_int32_t + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_create_64_h (a, len) + use iso_c_binding, only: c_int64_t + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_create_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + + subroutine acc_present_or_create_32_h (a, len) + use iso_c_binding, only: c_int32_t + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_present_or_create_64_h (a, len) + use iso_c_binding, only: c_int64_t + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_present_or_create_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + + subroutine acc_copyout_32_h (a, len) + use iso_c_binding, only: c_int32_t + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_copyout_64_h (a, len) + use iso_c_binding, only: c_int64_t + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_copyout_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + + subroutine acc_delete_32_h (a, len) + use iso_c_binding, only: c_int32_t + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_delete_64_h (a, len) + use iso_c_binding, only: c_int64_t + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_delete_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + + subroutine acc_update_device_32_h (a, len) + use iso_c_binding, only: c_int32_t + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_update_device_64_h (a, len) + use iso_c_binding, only: c_int64_t + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_update_device_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + + subroutine acc_update_self_32_h (a, len) + use iso_c_binding, only: c_int32_t + type (*), dimension (*) :: a + integer (c_int32_t) len + end subroutine + + subroutine acc_update_self_64_h (a, len) + use iso_c_binding, only: c_int64_t + type (*), dimension (*) :: a + integer (c_int64_t) len + end subroutine + + subroutine acc_update_self_array_h (a) + type (*), dimension (..), contiguous :: a + end subroutine + + function acc_is_present_32_h (a, len) + use iso_c_binding, only: c_int32_t + logical acc_is_present_32_h + type (*), dimension (*) :: a + integer (c_int32_t) len + end function + + function acc_is_present_64_h (a, len) + use iso_c_binding, only: c_int64_t + logical acc_is_present_64_h + type (*), dimension (*) :: a + integer (c_int64_t) len + end function + + function acc_is_present_array_h (a) + logical acc_is_present_array_h + type (*), dimension (..), contiguous :: a + end function + end interface + + interface + function acc_get_num_devices_l (d) & + bind (C, name = "acc_get_num_devices") + use iso_c_binding, only: c_int + integer (c_int) :: acc_get_num_devices_l + integer (c_int), value :: d + end function + + subroutine acc_set_device_type_l (d) & + bind (C, name = "acc_set_device_type") + use iso_c_binding, only: c_int + integer (c_int), value :: d + end subroutine + + function acc_get_device_type_l () & + bind (C, name = "acc_get_device_type") + use iso_c_binding, only: c_int + integer (c_int) :: acc_get_device_type_l + end function + + subroutine acc_set_device_num_l (n, d) & + 
+        bind (C, name = "acc_set_device_num")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: n, d
+    end subroutine
+
+    function acc_get_device_num_l (d) &
+        bind (C, name = "acc_get_device_num")
+      use iso_c_binding, only: c_int
+      integer (c_int) :: acc_get_device_num_l
+      integer (c_int), value :: d
+    end function
+
+    function acc_async_test_l (a) &
+        bind (C, name = "acc_async_test")
+      use iso_c_binding, only: c_int
+      integer (c_int) :: acc_async_test_l
+      integer (c_int), value :: a
+    end function
+
+    function acc_async_test_all_l () &
+        bind (C, name = "acc_async_test_all")
+      use iso_c_binding, only: c_int
+      integer (c_int) :: acc_async_test_all_l
+    end function
+
+    subroutine acc_wait_l (a) &
+        bind (C, name = "acc_wait")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: a
+    end subroutine
+
+    subroutine acc_wait_async_l (a1, a2) &
+        bind (C, name = "acc_wait_async")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: a1, a2
+    end subroutine
+
+    subroutine acc_wait_all_l () &
+        bind (C, name = "acc_wait_all")
+      use iso_c_binding, only: c_int
+    end subroutine
+
+    subroutine acc_wait_all_async_l (a) &
+        bind (C, name = "acc_wait_all_async")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: a
+    end subroutine
+
+    subroutine acc_init_l (d) &
+        bind (C, name = "acc_init")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: d
+    end subroutine
+
+    subroutine acc_shutdown_l (d) &
+        bind (C, name = "acc_shutdown")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: d
+    end subroutine
+
+    function acc_on_device_l (d) &
+        bind (C, name = "acc_on_device")
+      use iso_c_binding, only: c_int
+      integer (c_int) :: acc_on_device_l
+      integer (c_int), value :: d
+    end function
+
+    subroutine acc_copyin_l (a, len) &
+        bind (C, name = "acc_copyin")
+      use iso_c_binding, only: c_size_t
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_present_or_copyin_l (a, len) &
+        bind (C, name = "acc_present_or_copyin")
+      use iso_c_binding, only: c_size_t
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_create_l (a, len) &
+        bind (C, name = "acc_create")
+      use iso_c_binding, only: c_size_t
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_present_or_create_l (a, len) &
+        bind (C, name = "acc_present_or_create")
+      use iso_c_binding, only: c_size_t
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_copyout_l (a, len) &
+        bind (C, name = "acc_copyout")
+      use iso_c_binding, only: c_size_t
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_delete_l (a, len) &
+        bind (C, name = "acc_delete")
+      use iso_c_binding, only: c_size_t
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_update_device_l (a, len) &
+        bind (C, name = "acc_update_device")
+      use iso_c_binding, only: c_size_t
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_update_self_l (a, len) &
+        bind (C, name = "acc_update_self")
+      use iso_c_binding, only: c_size_t
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    function acc_is_present_l (a, len) &
+        bind (C, name = "acc_is_present")
+      use iso_c_binding, only: c_int32_t, c_size_t
+      integer (c_int32_t) :: acc_is_present_l
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end function
+  end interface
+end module
+
+module openacc
+  use openacc_kinds
+  use openacc_internal
+  implicit none
+
+  public :: openacc_version
+
+  public :: acc_get_num_devices, acc_set_device_type, acc_get_device_type
+  public :: acc_set_device_num, acc_get_device_num, acc_async_test
+  public :: acc_async_test_all, acc_wait, acc_wait_async, acc_wait_all
+  public :: acc_wait_all_async, acc_init, acc_shutdown, acc_on_device
+  public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create
+  public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete
+  public :: acc_update_device, acc_update_self, acc_is_present
+
+  integer, parameter :: openacc_version = 201306
+
+  interface acc_get_num_devices
+    procedure :: acc_get_num_devices_h
+  end interface
+
+  interface acc_set_device_type
+    procedure :: acc_set_device_type_h
+  end interface
+
+  interface acc_get_device_type
+    procedure :: acc_get_device_type_h
+  end interface
+
+  interface acc_set_device_num
+    procedure :: acc_set_device_num_h
+  end interface
+
+  interface acc_get_device_num
+    procedure :: acc_get_device_num_h
+  end interface
+
+  interface acc_async_test
+    procedure :: acc_async_test_h
+  end interface
+
+  interface acc_async_test_all
+    procedure :: acc_async_test_all_h
+  end interface
+
+  interface acc_wait
+    procedure :: acc_wait_h
+  end interface
+
+  interface acc_wait_async
+    procedure :: acc_wait_async_h
+  end interface
+
+  interface acc_wait_all
+    procedure :: acc_wait_all_h
+  end interface
+
+  interface acc_wait_all_async
+    procedure :: acc_wait_all_async_h
+  end interface
+
+  interface acc_init
+    procedure :: acc_init_h
+  end interface
+
+  interface acc_shutdown
+    procedure :: acc_shutdown_h
+  end interface
+
+  interface acc_on_device
+    procedure :: acc_on_device_h
+  end interface
+
+  ! acc_malloc: Only available in C/C++
+  ! acc_free: Only available in C/C++
+
+  ! As a vendor extension, the following code supports both 32-bit and
+  ! 64-bit arguments for "size"; the OpenACC standard only permits
+  ! default-kind integers, which are of kind 4 (i.e. 32 bits).
+  ! Additionally, the two-argument version also takes arrays as argument,
+  ! and the one-argument version also scalars.  Note that the code assumes
+  ! that the arrays are contiguous.
+
+  interface acc_copyin
+    procedure :: acc_copyin_32_h
+    procedure :: acc_copyin_64_h
+    procedure :: acc_copyin_array_h
+  end interface
+
+  interface acc_present_or_copyin
+    procedure :: acc_present_or_copyin_32_h
+    procedure :: acc_present_or_copyin_64_h
+    procedure :: acc_present_or_copyin_array_h
+  end interface
+
+  interface acc_pcopyin
+    procedure :: acc_present_or_copyin_32_h
+    procedure :: acc_present_or_copyin_64_h
+    procedure :: acc_present_or_copyin_array_h
+  end interface
+
+  interface acc_create
+    procedure :: acc_create_32_h
+    procedure :: acc_create_64_h
+    procedure :: acc_create_array_h
+  end interface
+
+  interface acc_present_or_create
+    procedure :: acc_present_or_create_32_h
+    procedure :: acc_present_or_create_64_h
+    procedure :: acc_present_or_create_array_h
+  end interface
+
+  interface acc_pcreate
+    procedure :: acc_present_or_create_32_h
+    procedure :: acc_present_or_create_64_h
+    procedure :: acc_present_or_create_array_h
+  end interface
+
+  interface acc_copyout
+    procedure :: acc_copyout_32_h
+    procedure :: acc_copyout_64_h
+    procedure :: acc_copyout_array_h
+  end interface
+
+  interface acc_delete
+    procedure :: acc_delete_32_h
+    procedure :: acc_delete_64_h
+    procedure :: acc_delete_array_h
+  end interface
+
+  interface acc_update_device
+    procedure :: acc_update_device_32_h
+    procedure :: acc_update_device_64_h
+    procedure :: acc_update_device_array_h
+  end interface
+
+  interface acc_update_self
+    procedure :: acc_update_self_32_h
+    procedure :: acc_update_self_64_h
+    procedure :: acc_update_self_array_h
+  end interface
+
+  ! acc_map_data: Only available in C/C++
+  ! acc_unmap_data: Only available in C/C++
+  ! acc_deviceptr: Only available in C/C++
+  ! acc_hostptr: Only available in C/C++
+
+  interface acc_is_present
+    procedure :: acc_is_present_32_h
+    procedure :: acc_is_present_64_h
+    procedure :: acc_is_present_array_h
+  end interface
+
+  ! acc_memcpy_to_device: Only available in C/C++
+  ! acc_memcpy_from_device: Only available in C/C++
+
+end module
+
+function acc_get_num_devices_h (d)
+  use openacc_internal, only: acc_get_num_devices_l
+  use openacc_kinds
+  integer acc_get_num_devices_h
+  integer (acc_device_kind) d
+  acc_get_num_devices_h = acc_get_num_devices_l (d)
+end function
+
+subroutine acc_set_device_type_h (d)
+  use openacc_internal, only: acc_set_device_type_l
+  use openacc_kinds
+  integer (acc_device_kind) d
+  call acc_set_device_type_l (d)
+end subroutine
+
+function acc_get_device_type_h ()
+  use openacc_internal, only: acc_get_device_type_l
+  use openacc_kinds
+  integer (acc_device_kind) acc_get_device_type_h
+  acc_get_device_type_h = acc_get_device_type_l ()
+end function
+
+subroutine acc_set_device_num_h (n, d)
+  use openacc_internal, only: acc_set_device_num_l
+  use openacc_kinds
+  integer n
+  integer (acc_device_kind) d
+  call acc_set_device_num_l (n, d)
+end subroutine
+
+function acc_get_device_num_h (d)
+  use openacc_internal, only: acc_get_device_num_l
+  use openacc_kinds
+  integer acc_get_device_num_h
+  integer (acc_device_kind) d
+  acc_get_device_num_h = acc_get_device_num_l (d)
+end function
+
+function acc_async_test_h (a)
+  use openacc_internal, only: acc_async_test_l
+  logical acc_async_test_h
+  integer a
+  if (acc_async_test_l (a) .eq. 1) then
+    acc_async_test_h = .TRUE.
+  else
+    acc_async_test_h = .FALSE.
+  end if
+end function
+
+function acc_async_test_all_h ()
+  use openacc_internal, only: acc_async_test_all_l
+  logical acc_async_test_all_h
+  if (acc_async_test_all_l () .eq. 1) then
+    acc_async_test_all_h = .TRUE.
+  else
+    acc_async_test_all_h = .FALSE.
+  end if
+end function
+
+subroutine acc_wait_h (a)
+  use openacc_internal, only: acc_wait_l
+  integer a
+  call acc_wait_l (a)
+end subroutine
+
+subroutine acc_wait_async_h (a1, a2)
+  use openacc_internal, only: acc_wait_async_l
+  integer a1, a2
+  call acc_wait_async_l (a1, a2)
+end subroutine
+
+subroutine acc_wait_all_h ()
+  use openacc_internal, only: acc_wait_all_l
+  call acc_wait_all_l ()
+end subroutine
+
+subroutine acc_wait_all_async_h (a)
+  use openacc_internal, only: acc_wait_all_async_l
+  integer a
+  call acc_wait_all_async_l (a)
+end subroutine
+
+subroutine acc_init_h (d)
+  use openacc_internal, only: acc_init_l
+  use openacc_kinds
+  integer (acc_device_kind) d
+  call acc_init_l (d)
+end subroutine
+
+subroutine acc_shutdown_h (d)
+  use openacc_internal, only: acc_shutdown_l
+  use openacc_kinds
+  integer (acc_device_kind) d
+  call acc_shutdown_l (d)
+end subroutine
+
+function acc_on_device_h (d)
+  use openacc_internal, only: acc_on_device_l
+  use openacc_kinds
+  integer (acc_device_kind) d
+  logical acc_on_device_h
+  if (acc_on_device_l (d) .eq. 1) then
+    acc_on_device_h = .TRUE.
+  else
+    acc_on_device_h = .FALSE.
+  end if
+end function
+
+subroutine acc_copyin_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal, only: acc_copyin_l
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyin_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal, only: acc_copyin_l
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyin_array_h (a)
+  use openacc_internal, only: acc_copyin_l
+  type (*), dimension (..), contiguous :: a
+  call acc_copyin_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_present_or_copyin_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal, only: acc_present_or_copyin_l
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_present_or_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_copyin_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal, only: acc_present_or_copyin_l
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_present_or_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_copyin_array_h (a)
+  use openacc_internal, only: acc_present_or_copyin_l
+  type (*), dimension (..), contiguous :: a
+  call acc_present_or_copyin_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_create_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal, only: acc_create_l
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_create_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal, only: acc_create_l
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_create_array_h (a)
+  use openacc_internal, only: acc_create_l
+  type (*), dimension (..), contiguous :: a
+  call acc_create_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_present_or_create_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal, only: acc_present_or_create_l
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_present_or_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_create_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal, only: acc_present_or_create_l
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_present_or_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_create_array_h (a)
+  use openacc_internal, only: acc_present_or_create_l
+  type (*), dimension (..), contiguous :: a
+  call acc_present_or_create_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_copyout_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal, only: acc_copyout_l
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_copyout_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyout_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal, only: acc_copyout_l
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_copyout_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyout_array_h (a)
+  use openacc_internal, only: acc_copyout_l
+  type (*), dimension (..), contiguous :: a
+  call acc_copyout_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_delete_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal, only: acc_delete_l
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_delete_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_delete_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal, only: acc_delete_l
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_delete_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_delete_array_h (a)
+  use openacc_internal, only: acc_delete_l
+  type (*), dimension (..), contiguous :: a
+  call acc_delete_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_update_device_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal, only: acc_update_device_l
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_update_device_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_device_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal, only: acc_update_device_l
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_update_device_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_device_array_h (a)
+  use openacc_internal, only: acc_update_device_l
+  type (*), dimension (..), contiguous :: a
+  call acc_update_device_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_update_self_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal, only: acc_update_self_l
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_update_self_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_self_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal, only: acc_update_self_l
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_update_self_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_self_array_h (a)
+  use openacc_internal, only: acc_update_self_l
+  type (*), dimension (..), contiguous :: a
+  call acc_update_self_l (a, sizeof (a))
+end subroutine
+
+function acc_is_present_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal, only: acc_is_present_l
+  logical acc_is_present_32_h
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
+    acc_is_present_32_h = .TRUE.
+  else
+    acc_is_present_32_h = .FALSE.
+  end if
+end function
+
+function acc_is_present_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal, only: acc_is_present_l
+  logical acc_is_present_64_h
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
+    acc_is_present_64_h = .TRUE.
+  else
+    acc_is_present_64_h = .FALSE.
+  end if
+end function
+
+function acc_is_present_array_h (a)
+  use openacc_internal, only: acc_is_present_l
+  logical acc_is_present_array_h
+  type (*), dimension (..), contiguous :: a
+  acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1
+end function
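
For reference, a minimal caller-side sketch (not part of the patch) of how the
generic interfaces above resolve, illustrating the vendor extension described
in the comment in module openacc: the program name and array are illustrative,
acc_device_default is assumed to come from the openacc_kinds module, and
sizeof is the GNU extension the wrappers themselves rely on.

program openacc_usage_sketch
  use openacc
  implicit none
  real :: a(1024)

  if (acc_get_num_devices (acc_device_default) > 0) then
    call acc_init (acc_device_default)

    ! One-argument array variant (vendor extension): the transfer size
    ! is computed internally via sizeof (a), so no length is passed.
    call acc_copyin (a)
    if (.not. acc_is_present (a)) stop 1

    ! Two-argument variant: sizeof yields an integer(8) here, so this
    ! resolves to acc_copyout_64_h; a default-kind (32-bit) length
    ! would resolve to acc_copyout_32_h instead.
    call acc_copyout (a, sizeof (a))

    call acc_shutdown (acc_device_default)
  end if
end program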