From 3271520bd63a4acc3f4e653fedfca784643279a3 Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Aug 18 2014 06:57:05 +0000 Subject: import gcc-4.8.2-16.2.el7_0 --- diff --git a/SOURCES/gcc48-pr61801.patch b/SOURCES/gcc48-pr61801.patch new file mode 100644 index 0000000..b75696e --- /dev/null +++ b/SOURCES/gcc48-pr61801.patch @@ -0,0 +1,58 @@ +2014-07-17 Richard Biener + + PR rtl-optimization/61801 + * sched-deps.c (sched_analyze_2): For ASM_OPERANDS and + ASM_INPUT don't set reg_pending_barrier if it appears in a + debug-insn. + +2014-08-06 Jakub Jelinek + + PR rtl-optimization/61801 + * gcc.target/i386/pr61801.c: Rewritten. + +2014-07-28 Richard Biener + + PR rtl-optimization/61801 + * gcc.target/i386/pr61801.c: Fix testcase. + +2014-07-28 Richard Biener + + PR rtl-optimization/61801 + * gcc.target/i386/pr61801.c: New testcase. + +--- gcc/sched-deps.c (revision 212737) ++++ gcc/sched-deps.c (revision 212738) +@@ -2750,7 +2750,8 @@ sched_analyze_2 (struct deps_desc *deps, + Consider for instance a volatile asm that changes the fpu rounding + mode. An insn should not be moved across this even if it only uses + pseudo-regs because it might give an incorrectly rounded result. */ +- if (code != ASM_OPERANDS || MEM_VOLATILE_P (x)) ++ if ((code != ASM_OPERANDS || MEM_VOLATILE_P (x)) ++ && !DEBUG_INSN_P (insn)) + reg_pending_barrier = TRUE_BARRIER; + + /* For all ASM_OPERANDS, we must traverse the vector of input operands. +--- gcc/testsuite/gcc.target/i386/pr61801.c (revision 0) ++++ gcc/testsuite/gcc.target/i386/pr61801.c (revision 213654) +@@ -0,0 +1,21 @@ ++/* PR rtl-optimization/61801 */ ++/* { dg-do compile } */ ++/* { dg-options "-Os -fcompare-debug" } */ ++ ++int a, c; ++int bar (void); ++void baz (void); ++ ++void ++foo (void) ++{ ++ int d; ++ if (bar ()) ++ { ++ int e; ++ baz (); ++ asm volatile ("" : "=a" (e) : "0" (a), "i" (0)); ++ d = e; ++ } ++ c = d; ++} diff --git a/SOURCES/gcc48-rh1121077.patch b/SOURCES/gcc48-rh1121077.patch new file mode 100644 index 0000000..a610217 --- /dev/null +++ b/SOURCES/gcc48-rh1121077.patch @@ -0,0 +1,5990 @@ +2014-08-04 Jakub Jelinek + + * task.c (GOMP_taskgroup_end): If taskgroup->num_children + is not zero, but taskgroup->children is NULL and there are + any task->children, schedule those instead of waiting. + +2014-08-01 Jakub Jelinek + + * libgomp.h (struct gomp_task_depend_entry): Add redundant_out field. + (struct gomp_taskwait): New type. + (struct gomp_task): Add taskwait and parent_depends_on, remove + in_taskwait and taskwait_sem fields. + (gomp_finish_task): Don't destroy taskwait_sem. + * task.c (gomp_init_task): Don't init in_taskwait, instead init + taskwait and parent_depends_on. + (GOMP_task): For if (0) tasks with depend clause that depend on + earlier tasks don't defer them, instead call + gomp_task_maybe_wait_for_dependencies to wait for the dependencies. + Initialize redundant_out field, for redundant out entries just + move them at the end of linked list instead of removing them + completely, and set redundant_out flag instead of redundant. + (gomp_task_run_pre): Update last_parent_depends_on if scheduling + that task. + (gomp_task_run_post_handle_dependers): If parent is in + gomp_task_maybe_wait_for_dependencies and newly runnable task + is not parent_depends_on, queue it in parent->children linked + list after all runnable tasks with parent_depends_on set. + Adjust for addition of taskwait indirection. 
+ (gomp_task_run_post_remove_parent): If parent is in + gomp_task_maybe_wait_for_dependencies and task to be removed + is parent_depends_on, decrement n_depend and if needed awake + parent. Adjust for addition of taskwait indirection. + (GOMP_taskwait): Adjust for addition of taskwait indirection. + (gomp_task_maybe_wait_for_dependencies): New function. + +2013-10-14 Jakub Jelinek + + * env.c (parse_bind_var): Initialize value to avoid + (false positive) warning. + +2013-10-12 Jakub Jelinek + + PR libgomp/58691 + * config/linux/proc.c (gomp_cpuset_popcount): Add unused attribute + to check variable. + (gomp_init_num_threads): Move i variable declaration into + #ifdef CPU_ALLOC_SIZE block. + * config/linux/affinity.c (gomp_affinity_init_level): Test + gomp_places_list_len == 0 rather than gomp_places_list == 0 + when checking for topology reading error. + * team.c (gomp_team_start): Don't handle bind == omp_proc_bind_false. + * env.c (parse_affinity): Add ignore argument, if true, don't populate + gomp_places_list, only parse env var and always return false. + (parse_places_var): Likewise. Don't check gomp_global_icv.bind_var. + (initialize_env): Always parse OMP_PLACES and GOMP_CPU_AFFINITY env + vars, default to OMP_PROC_BIND=true if OMP_PROC_BIND wasn't specified + and either of these variables were parsed correctly into a places + list. + +2013-10-11 Thomas Schwinge + + * testsuite/libgomp.c/lib-1.c (main): Add missing error check. + * testsuite/libgomp.fortran/lib1.f90: Likewise. + * testsuite/libgomp.fortran/lib2.f: Likewise. + * testsuite/libgomp.fortran/lib3.f: Likewise. + +2013-10-11 Jakub Jelinek + Tobias Burnus + Richard Henderson + + * target.c: New file. + * Makefile.am (libgomp_la_SOURCES): Add target.c. + * Makefile.in: Regenerated. + * libgomp_g.h (GOMP_task): Add depend argument. + (GOMP_barrier_cancel, GOMP_loop_end_cancel, + GOMP_sections_end_cancel, GOMP_target, GOMP_target_data, + GOMP_target_end_data, GOMP_target_update, GOMP_teams, + GOMP_parallel_loop_static, GOMP_parallel_loop_dynamic, + GOMP_parallel_loop_guided, GOMP_parallel_loop_runtime, + GOMP_parallel, GOMP_cancel, GOMP_cancellation_point, + GOMP_taskgroup_start, GOMP_taskgroup_end, + GOMP_parallel_sections): New prototypes. + * fortran.c (omp_is_initial_device): Add ialias_redirect. + (omp_is_initial_device_): New function. + (ULP, STR1, STR2, ialias_redirect): Removed. + (omp_get_cancellation_, omp_get_proc_bind_, omp_set_default_device_, + omp_set_default_device_8_, omp_get_default_device_, + omp_get_num_devices_, omp_get_num_teams_, omp_get_team_num_): New + functions. + * libgomp.map (GOMP_barrier_cancel, GOMP_loop_end_cancel, + GOMP_sections_end_cancel, GOMP_target, GOMP_target_data, + GOMP_target_end_data, GOMP_target_update, GOMP_teams): Export + @@GOMP_4.0. + (omp_is_initial_device, omp_is_initial_device_, omp_get_cancellation, + omp_get_cancellation_, omp_get_proc_bind, omp_get_proc_bind_, + omp_set_default_device, omp_set_default_device_, + omp_set_default_device_8_, omp_get_default_device, + omp_get_default_device_, omp_get_num_devices, omp_get_num_devices_, + omp_get_num_teams, omp_get_num_teams_, omp_get_team_num, + omp_get_team_num_): Export @@OMP_4.0. + * team.c (struct gomp_thread_start_data): Add place field. + (gomp_thread_start): Clear thr->thread_pool and + thr->task before returning. Use gomp_team_barrier_wait_final + instead of gomp_team_barrier_wait. Initialize thr->place. 
+ (gomp_new_team): Initialize work_shares_to_free, work_share_cancelled, + team_cancelled and task_queued_count fields. + (gomp_free_pool_helper): Clear thr->thread_pool and thr->task + before calling pthread_exit. + (gomp_free_thread): No longer static. Use + gomp_managed_threads_lock instead of gomp_remaining_threads_lock. + (gomp_team_start): Add flags argument. Set + thr->thread_pool->threads_busy to nthreads immediately after creating + new pool. Use gomp_managed_threads_lock instead of + gomp_remaining_threads_lock. Handle OpenMP 4.0 affinity. + (gomp_team_end): Use gomp_managed_threads_lock instead of + gomp_remaining_threads_lock. Use gomp_team_barrier_wait_final instead + of gomp_team_barrier_wait. If team->team_cancelled, call + gomp_fini_worshare on ws chain starting at team->work_shares_to_free + rather than thr->ts.work_share. + (initialize_team): Don't call gomp_sem_init here. + * sections.c (GOMP_parallel_sections_start): Adjust gomp_team_start + caller. + (GOMP_parallel_sections, GOMP_sections_end_cancel): New functions. + * env.c (gomp_global_icv): Add default_device_var, target_data and + bind_var initializers. + (gomp_cpu_affinity, gomp_cpu_affinity_len): Remove. + (gomp_bind_var_list, gomp_bind_var_list_len, gomp_places_list, + gomp_places_list_len): New variables. + (parse_bind_var, parse_one_place, parse_places_var): New functions. + (parse_affinity): Rewritten to construct OMP_PLACES list with unit + sized places. + (gomp_cancel_var): New global variable. + (parse_int): New function. + (handle_omp_display_env): New function. + (initialize_env): Use it. Initialize default_device_var. + Parse OMP_CANCELLATION env var. Use parse_bind_var to parse + OMP_PROC_BIND instead of parse_boolean. Use parse_places_var for + OMP_PLACES parsing. Don't call parse_affinity if OMP_PLACES has + been successfully parsed (and call gomp_init_affinity in that case). + (omp_get_cancellation, omp_get_proc_bind, omp_set_default_device, + omp_get_default_device, omp_get_num_devices, omp_get_num_teams, + omp_get_team_num, omp_is_initial_device): New functions. + * libgomp.h: Include stdlib.h. + (ialias_ulp, ialias_str1, ialias_str2, ialias_redirect, ialias_call): + Define. + (struct target_mem_desc): Forward declare. + (struct gomp_task_icv): Add default_device_var, target_data, bind_var + and thread_limit_var fields. + (gomp_get_num_devices): New prototype. + (gomp_cancel_var): New extern decl. + (struct gomp_team): Add work_shares_to_free, work_share_cancelled, + team_cancelled and task_queued_count fields. Add comments about + task_{,queued_,running_}count. + (gomp_cancel_kind): New enum. + (gomp_work_share_end_cancel): New prototype. + (struct gomp_task): Add next_taskgroup, prev_taskgroup, taskgroup, + copy_ctors_done, dependers, depend_hash, depend_count, num_dependees + and depend fields. + (struct gomp_taskgroup): New type. + (struct gomp_task_depend_entry, + struct gomp_dependers_vec): New types. + (gomp_finish_task): Free depend_hash if non-NULL. + (struct gomp_team_state): Add place_partition_off + and place_partition_len fields. + (gomp_bind_var_list, gomp_bind_var_list_len, gomp_places_list, + gomp_places_list_len): New extern decls. + (struct gomp_thread): Add place field. + (gomp_cpu_affinity, gomp_cpu_affinity_len): Remove. + (gomp_init_thread_affinity): Add place argument. 
+ (gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus, + gomp_affinity_remove_cpu, gomp_affinity_copy_place, + gomp_affinity_same_place, gomp_affinity_finalize_place_list, + gomp_affinity_init_level, gomp_affinity_print_place): New + prototypes. + (gomp_team_start): Add flags argument. + (gomp_thread_limit_var, gomp_remaining_threads_count, + gomp_remaining_threads_lock): Remove. + (gomp_managed_threads_lock): New variable. + (struct gomp_thread_pool): Add threads_busy field. + (gomp_free_thread): New prototype. + * task.c: Include hashtab.h. + (hash_entry_type): New typedef. + (htab_alloc, htab_free, htab_hash, htab_eq): New inlines. + (gomp_init_task): Clear dependers, depend_hash, depend_count, + copy_ctors_done and taskgroup fields. + (GOMP_task): Add depend argument, handle depend clauses. If + gomp_team_barrier_cancelled or if it's taskgroup has been + cancelled, don't queue or start new tasks. Set copy_ctors_done + field if needed. Initialize taskgroup field. If copy_ctors_done + and already cancelled, don't discard the task. If taskgroup is + non-NULL, enqueue the task into taskgroup queue. Increment + num_children field in taskgroup. Increment task_queued_count. + (gomp_task_run_pre, gomp_task_run_post_remove_parent, + gomp_task_run_post_remove_taskgroup): New inline functions. + (gomp_task_run_post_handle_depend_hash, + gomp_task_run_post_handle_dependers, + gomp_task_run_post_handle_depend): New functions. + (GOMP_taskwait): Use them. If more than one new tasks + have been queued, wake other threads if needed. + (gomp_barrier_handle_tasks): Likewise. If + gomp_team_barrier_cancelled, don't start any new tasks, just free + all tasks. + (GOMP_taskgroup_start, GOMP_taskgroup_end): New functions. + * loop.c (gomp_parallel_loop_start): Add flags argument, pass it + through to gomp_team_start. + (GOMP_parallel_loop_static_start, GOMP_parallel_loop_dynamic_start, + GOMP_parallel_loop_guided_start, GOMP_parallel_loop_runtime_start): + Adjust gomp_parallel_loop_start callers. + (GOMP_parallel_loop_static, GOMP_parallel_loop_dynamic, + GOMP_parallel_loop_guided, GOMP_parallel_loop_runtime, + GOMP_loop_end_cancel): New functions. + (GOMP_parallel_end): Add ialias_redirect. + * hashtab.h: New file. + * work.c (gomp_work_share_end, gomp_work_share_end_nowait): Set + team->work_shares_to_free to thr->ts.work_share before calling + free_work_share. + (gomp_work_share_end_cancel): New function. + * config/linux/proc.c: Include errno.h. + (gomp_get_cpuset_size, gomp_cpuset_size, gomp_cpusetp): New variables. + (gomp_cpuset_popcount): Add cpusetsize argument, use it instead of + sizeof (cpu_set_t) to determine number of iterations. Fix up check + extern decl. Use CPU_COUNT_S if available, or CPU_COUNT if + gomp_cpuset_size is sizeof (cpu_set_t). + (gomp_init_num_threads): Initialize gomp_cpuset_size, + gomp_get_cpuset_size and gomp_cpusetp here, use gomp_cpusetp instead + of &cpuset and pass gomp_cpuset_size instead of sizeof (cpu_set_t) + to pthread_getaffinity_np. Free and clear gomp_cpusetp if it didn't + contain any logical CPUs. + (get_num_procs): Don't call pthread_getaffinity_np if gomp_cpusetp + is NULL. Use gomp_cpusetp instead of &cpuset and pass + gomp_get_cpuset_size instead of sizeof (cpu_set_t) to + pthread_getaffinity_np. Check gomp_places_list instead of + gomp_cpu_affinity. Adjust gomp_cpuset_popcount caller. + * config/linux/bar.c (gomp_barrier_wait_end, + gomp_barrier_wait_last): Use BAR_* defines. + (gomp_team_barrier_wait_end): Likewise. 
Clear BAR_CANCELLED + from state where needed. Set work_share_cancelled to 0 on last + thread. + (gomp_team_barrier_wait_final, gomp_team_barrier_wait_cancel_end, + gomp_team_barrier_wait_cancel, gomp_team_barrier_cancel): New + functions. + * config/linux/proc.h (gomp_cpuset_popcount): Add attribute_hidden. + Add cpusetsize argument. + (gomp_cpuset_size, gomp_cpusetp): Declare. + * config/linux/affinity.c: Include errno.h, stdio.h and string.h. + (affinity_counter): Remove. + (CPU_ISSET_S, CPU_ZERO_S, CPU_SET_S, CPU_CLR_S): Define + if CPU_ALLOC_SIZE isn't defined. + (gomp_init_affinity): Rewritten, if gomp_places_list is NULL, try + silently create OMP_PLACES=threads, if it is non-NULL afterwards, + bind current thread to the first place. + (gomp_init_thread_affinity): Rewritten. Add place argument, just + pthread_setaffinity_np to gomp_places_list[place]. + (gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus, + gomp_affinity_remove_cpu, gomp_affinity_copy_place, + gomp_affinity_same_place, gomp_affinity_finalize_place_list, + gomp_affinity_init_level, gomp_affinity_print_place): New functions. + * config/linux/bar.h (BAR_TASK_PENDING, BAR_WAS_LAST, + BAR_WAITING_FOR_TASK, BAR_INCR, BAR_CANCELLED): Define. + (gomp_barrier_t): Add awaited_final field. + (gomp_barrier_init): Initialize awaited_final field. + (gomp_team_barrier_wait_final, gomp_team_barrier_wait_cancel, + gomp_team_barrier_wait_cancel_end, gomp_team_barrier_cancel): New + prototypes. + (gomp_barrier_wait_start): Preserve BAR_CANCELLED bit. Use BAR_* + defines. + (gomp_barrier_wait_cancel_start, gomp_team_barrier_wait_final_start, + gomp_team_barrier_cancelled): New inline functions. + (gomp_barrier_last_thread, + gomp_team_barrier_set_task_pending, + gomp_team_barrier_clear_task_pending, + gomp_team_barrier_set_waiting_for_tasks, + gomp_team_barrier_waiting_for_tasks, + gomp_team_barrier_done): Use BAR_* defines. + * config/posix/bar.c (gomp_barrier_init): Clear cancellable field. + (gomp_barrier_wait_end): Use BAR_* defines. + (gomp_team_barrier_wait_end): Clear BAR_CANCELLED from state. + Set work_share_cancelled to 0 on last thread, use __atomic_load_n. + Use BAR_* defines. + (gomp_team_barrier_wait_cancel_end, gomp_team_barrier_wait_cancel, + gomp_team_barrier_cancel): New functions. + * config/posix/affinity.c (gomp_init_thread_affinity): Add place + argument. + (gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus, + gomp_affinity_remove_cpu, gomp_affinity_copy_place, + gomp_affinity_same_place, gomp_affinity_finalize_place_list, + gomp_affinity_init_level, gomp_affinity_print_place): New stubs. + * config/posix/bar.h (BAR_TASK_PENDING, BAR_WAS_LAST, + BAR_WAITING_FOR_TASK, BAR_INCR, BAR_CANCELLED): Define. + (gomp_barrier_t): Add cancellable field. + (gomp_team_barrier_wait_cancel, gomp_team_barrier_wait_cancel_end, + gomp_team_barrier_cancel): New prototypes. + (gomp_barrier_wait_start): Preserve BAR_CANCELLED bit. + (gomp_barrier_wait_cancel_start, gomp_team_barrier_wait_final, + gomp_team_barrier_cancelled): New inline functions. + (gomp_barrier_wait_start, gomp_barrier_last_thread, + gomp_team_barrier_set_task_pending, + gomp_team_barrier_clear_task_pending, + gomp_team_barrier_set_waiting_for_tasks, + gomp_team_barrier_waiting_for_tasks, + gomp_team_barrier_done): Use BAR_* defines. + * barrier.c (GOMP_barrier_cancel): New function. + * parallel.c (GOMP_parallel, GOMP_cancel, GOMP_cancellation_point): + New functions. 
+ (gomp_resolve_num_threads): Adjust for thread_limit now being in + icv->thread_limit_var. Use UINT_MAX instead of ULONG_MAX as + infinity. If not nested, just return minimum of max_num_threads + and icv->thread_limit_var and if thr->thread_pool, set threads_busy + to the returned value. Otherwise, don't update atomically + gomp_remaining_threads_count, but instead thr->thread_pool->threads_busy. + (GOMP_parallel_end): Adjust for thread_limit now being in + icv->thread_limit_var. Use UINT_MAX instead of ULONG_MAX as + infinity. Adjust threads_busy in the pool rather than + gomp_remaining_threads_count. Remember team->nthreads and call + gomp_team_end before adjusting threads_busy, if not nested + afterwards, just set it to 1 non-atomically. Add ialias. + (GOMP_parallel_start): Adjust gomp_team_start caller. + * testsuite/libgomp.c/atomic-14.c: Add parens to make it valid. + +--- libgomp/Makefile.am (revision 210461) ++++ libgomp/Makefile.am (revision 210462) +@@ -60,7 +60,7 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_L + libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ + iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \ + task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \ +- time.c fortran.c affinity.c ++ time.c fortran.c affinity.c target.c + + nodist_noinst_HEADERS = libgomp_f.h + nodist_libsubinclude_HEADERS = omp.h +--- libgomp/Makefile.in (revision 210461) ++++ libgomp/Makefile.in (revision 210462) +@@ -96,7 +96,7 @@ am_libgomp_la_OBJECTS = alloc.lo barrier + error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \ + parallel.lo sections.lo single.lo task.lo team.lo work.lo \ + lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \ +- fortran.lo affinity.lo ++ fortran.lo affinity.lo target.lo + libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) + DEFAULT_INCLUDES = -I.@am__isrc@ + depcomp = $(SHELL) $(top_srcdir)/../depcomp +@@ -317,7 +317,7 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_L + libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ + iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \ + task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \ +- time.c fortran.c affinity.c ++ time.c fortran.c affinity.c target.c + + nodist_noinst_HEADERS = libgomp_f.h + nodist_libsubinclude_HEADERS = omp.h +@@ -474,6 +474,7 @@ distclean-compile: + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/time.Plo@am__quote@ +--- libgomp/libgomp_g.h (revision 210461) ++++ libgomp/libgomp_g.h (revision 210462) +@@ -33,6 +33,7 @@ + /* barrier.c */ + + extern void GOMP_barrier (void); ++extern bool GOMP_barrier_cancel (void); + + /* critical.c */ + +@@ -76,9 +77,22 @@ extern void GOMP_parallel_loop_guided_st + unsigned, long, long, long, long); + extern void GOMP_parallel_loop_runtime_start (void (*)(void *), void *, + unsigned, long, long, long); ++extern void GOMP_parallel_loop_static (void (*)(void *), void *, ++ unsigned, long, long, long, long, ++ unsigned); ++extern void GOMP_parallel_loop_dynamic (void (*)(void *), void *, ++ unsigned, long, long, long, long, ++ 
unsigned); ++extern void GOMP_parallel_loop_guided (void (*)(void *), void *, ++ unsigned, long, long, long, long, ++ unsigned); ++extern void GOMP_parallel_loop_runtime (void (*)(void *), void *, ++ unsigned, long, long, long, ++ unsigned); + + extern void GOMP_loop_end (void); + extern void GOMP_loop_end_nowait (void); ++extern bool GOMP_loop_end_cancel (void); + + /* loop_ull.c */ + +@@ -157,13 +171,18 @@ extern void GOMP_ordered_end (void); + + extern void GOMP_parallel_start (void (*) (void *), void *, unsigned); + extern void GOMP_parallel_end (void); ++extern void GOMP_parallel (void (*) (void *), void *, unsigned, unsigned); ++extern bool GOMP_cancel (int, bool); ++extern bool GOMP_cancellation_point (int); + + /* task.c */ + + extern void GOMP_task (void (*) (void *), void *, void (*) (void *, void *), +- long, long, bool, unsigned); ++ long, long, bool, unsigned, void **); + extern void GOMP_taskwait (void); + extern void GOMP_taskyield (void); ++extern void GOMP_taskgroup_start (void); ++extern void GOMP_taskgroup_end (void); + + /* sections.c */ + +@@ -171,8 +190,11 @@ extern unsigned GOMP_sections_start (uns + extern unsigned GOMP_sections_next (void); + extern void GOMP_parallel_sections_start (void (*) (void *), void *, + unsigned, unsigned); ++extern void GOMP_parallel_sections (void (*) (void *), void *, ++ unsigned, unsigned, unsigned); + extern void GOMP_sections_end (void); + extern void GOMP_sections_end_nowait (void); ++extern bool GOMP_sections_end_cancel (void); + + /* single.c */ + +@@ -180,4 +202,15 @@ extern bool GOMP_single_start (void); + extern void *GOMP_single_copy_start (void); + extern void GOMP_single_copy_end (void *); + ++/* target.c */ ++ ++extern void GOMP_target (int, void (*) (void *), const void *, ++ size_t, void **, size_t *, unsigned char *); ++extern void GOMP_target_data (int, const void *, ++ size_t, void **, size_t *, unsigned char *); ++extern void GOMP_target_end_data (void); ++extern void GOMP_target_update (int, const void *, ++ size_t, void **, size_t *, unsigned char *); ++extern void GOMP_teams (unsigned int, unsigned int); ++ + #endif /* LIBGOMP_G_H */ +--- libgomp/fortran.c (revision 210461) ++++ libgomp/fortran.c (revision 210462) +@@ -31,11 +31,6 @@ + + #ifdef HAVE_ATTRIBUTE_ALIAS + /* Use internal aliases if possible. 
*/ +-# define ULP STR1(__USER_LABEL_PREFIX__) +-# define STR1(x) STR2(x) +-# define STR2(x) #x +-# define ialias_redirect(fn) \ +- extern __typeof (fn) fn __asm__ (ULP "gomp_ialias_" #fn) attribute_hidden; + # ifndef LIBGOMP_GNU_SYMBOL_VERSIONING + ialias_redirect (omp_init_lock) + ialias_redirect (omp_init_nest_lock) +@@ -70,6 +65,14 @@ ialias_redirect (omp_get_ancestor_thread + ialias_redirect (omp_get_team_size) + ialias_redirect (omp_get_active_level) + ialias_redirect (omp_in_final) ++ialias_redirect (omp_get_cancellation) ++ialias_redirect (omp_get_proc_bind) ++ialias_redirect (omp_set_default_device) ++ialias_redirect (omp_get_default_device) ++ialias_redirect (omp_get_num_devices) ++ialias_redirect (omp_get_num_teams) ++ialias_redirect (omp_get_team_num) ++ialias_redirect (omp_is_initial_device) + #endif + + #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING +@@ -435,3 +438,57 @@ omp_in_final_ (void) + { + return omp_in_final (); + } ++ ++int32_t ++omp_get_cancellation_ (void) ++{ ++ return omp_get_cancellation (); ++} ++ ++int32_t ++omp_get_proc_bind_ (void) ++{ ++ return omp_get_proc_bind (); ++} ++ ++void ++omp_set_default_device_ (const int32_t *device_num) ++{ ++ return omp_set_default_device (*device_num); ++} ++ ++void ++omp_set_default_device_8_ (const int64_t *device_num) ++{ ++ return omp_set_default_device (TO_INT (*device_num)); ++} ++ ++int32_t ++omp_get_default_device_ (void) ++{ ++ return omp_get_default_device (); ++} ++ ++int32_t ++omp_get_num_devices_ (void) ++{ ++ return omp_get_num_devices (); ++} ++ ++int32_t ++omp_get_num_teams_ (void) ++{ ++ return omp_get_num_teams (); ++} ++ ++int32_t ++omp_get_team_num_ (void) ++{ ++ return omp_get_team_num (); ++} ++ ++int32_t ++omp_is_initial_device_ (void) ++{ ++ return omp_is_initial_device (); ++} +--- libgomp/libgomp.map (revision 210461) ++++ libgomp/libgomp.map (revision 210462) +@@ -113,6 +113,27 @@ OMP_3.1 { + omp_in_final_; + } OMP_3.0; + ++OMP_4.0 { ++ global: ++ omp_get_cancellation; ++ omp_get_cancellation_; ++ omp_get_proc_bind; ++ omp_get_proc_bind_; ++ omp_set_default_device; ++ omp_set_default_device_; ++ omp_set_default_device_8_; ++ omp_get_default_device; ++ omp_get_default_device_; ++ omp_get_num_devices; ++ omp_get_num_devices_; ++ omp_get_num_teams; ++ omp_get_num_teams_; ++ omp_get_team_num; ++ omp_get_team_num_; ++ omp_is_initial_device; ++ omp_is_initial_device_; ++} OMP_3.1; ++ + GOMP_1.0 { + global: + GOMP_atomic_end; +@@ -184,3 +205,25 @@ GOMP_3.0 { + global: + GOMP_taskyield; + } GOMP_2.0; ++ ++GOMP_4.0 { ++ global: ++ GOMP_barrier_cancel; ++ GOMP_cancel; ++ GOMP_cancellation_point; ++ GOMP_loop_end_cancel; ++ GOMP_parallel_loop_dynamic; ++ GOMP_parallel_loop_guided; ++ GOMP_parallel_loop_runtime; ++ GOMP_parallel_loop_static; ++ GOMP_parallel_sections; ++ GOMP_parallel; ++ GOMP_sections_end_cancel; ++ GOMP_taskgroup_start; ++ GOMP_taskgroup_end; ++ GOMP_target; ++ GOMP_target_data; ++ GOMP_target_end_data; ++ GOMP_target_update; ++ GOMP_teams; ++} GOMP_3.0; +--- libgomp/team.c (revision 210461) ++++ libgomp/team.c (revision 210462) +@@ -53,6 +53,7 @@ struct gomp_thread_start_data + struct gomp_team_state ts; + struct gomp_task *task; + struct gomp_thread_pool *thread_pool; ++ unsigned int place; + bool nested; + }; + +@@ -84,6 +85,7 @@ gomp_thread_start (void *xdata) + thr->thread_pool = data->thread_pool; + thr->ts = data->ts; + thr->task = data->task; ++ thr->place = data->place; + + thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release; + +@@ -98,7 +100,7 @@ gomp_thread_start (void *xdata) + 
gomp_barrier_wait (&team->barrier); + + local_fn (local_data); +- gomp_team_barrier_wait (&team->barrier); ++ gomp_team_barrier_wait_final (&team->barrier); + gomp_finish_task (task); + gomp_barrier_wait_last (&team->barrier); + } +@@ -113,7 +115,7 @@ gomp_thread_start (void *xdata) + struct gomp_task *task = thr->task; + + local_fn (local_data); +- gomp_team_barrier_wait (&team->barrier); ++ gomp_team_barrier_wait_final (&team->barrier); + gomp_finish_task (task); + + gomp_barrier_wait (&pool->threads_dock); +@@ -126,6 +128,8 @@ gomp_thread_start (void *xdata) + } + + gomp_sem_destroy (&thr->release); ++ thr->thread_pool = NULL; ++ thr->task = NULL; + return NULL; + } + +@@ -149,6 +153,7 @@ gomp_new_team (unsigned nthreads) + #else + gomp_mutex_init (&team->work_share_list_free_lock); + #endif ++ team->work_shares_to_free = &team->work_shares[0]; + gomp_init_work_share (&team->work_shares[0], false, nthreads); + team->work_shares[0].next_alloc = NULL; + team->work_share_list_free = NULL; +@@ -167,7 +172,10 @@ gomp_new_team (unsigned nthreads) + gomp_mutex_init (&team->task_lock); + team->task_queue = NULL; + team->task_count = 0; ++ team->task_queued_count = 0; + team->task_running_count = 0; ++ team->work_share_cancelled = 0; ++ team->team_cancelled = 0; + + return team; + } +@@ -199,16 +207,19 @@ static struct gomp_thread_pool *gomp_new + static void + gomp_free_pool_helper (void *thread_pool) + { ++ struct gomp_thread *thr = gomp_thread (); + struct gomp_thread_pool *pool + = (struct gomp_thread_pool *) thread_pool; + gomp_barrier_wait_last (&pool->threads_dock); +- gomp_sem_destroy (&gomp_thread ()->release); ++ gomp_sem_destroy (&thr->release); ++ thr->thread_pool = NULL; ++ thr->task = NULL; + pthread_exit (NULL); + } + + /* Free a thread pool and release its threads. */ + +-static void ++void + gomp_free_thread (void *arg __attribute__((unused))) + { + struct gomp_thread *thr = gomp_thread (); +@@ -236,9 +247,9 @@ gomp_free_thread (void *arg __attribute_ + __sync_fetch_and_add (&gomp_managed_threads, + 1L - pool->threads_used); + #else +- gomp_mutex_lock (&gomp_remaining_threads_lock); ++ gomp_mutex_lock (&gomp_managed_threads_lock); + gomp_managed_threads -= pool->threads_used - 1L; +- gomp_mutex_unlock (&gomp_remaining_threads_lock); ++ gomp_mutex_unlock (&gomp_managed_threads_lock); + #endif + } + free (pool->threads); +@@ -259,7 +270,7 @@ gomp_free_thread (void *arg __attribute_ + + void + gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, +- struct gomp_team *team) ++ unsigned flags, struct gomp_team *team) + { + struct gomp_thread_start_data *start_data; + struct gomp_thread *thr, *nthr; +@@ -270,17 +281,24 @@ gomp_team_start (void (*fn) (void *), vo + unsigned i, n, old_threads_used = 0; + pthread_attr_t thread_attr, *attr; + unsigned long nthreads_var; ++ char bind, bind_var; ++ unsigned int s = 0, rest = 0, p = 0, k = 0; ++ unsigned int affinity_count = 0; ++ struct gomp_thread **affinity_thr = NULL; + + thr = gomp_thread (); + nested = thr->ts.team != NULL; + if (__builtin_expect (thr->thread_pool == NULL, 0)) + { + thr->thread_pool = gomp_new_thread_pool (); ++ thr->thread_pool->threads_busy = nthreads; + pthread_setspecific (gomp_thread_destructor, thr); + } + pool = thr->thread_pool; + task = thr->task; + icv = task ? &task->icv : &gomp_global_icv; ++ if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0) ++ gomp_init_affinity (); + + /* Always save the previous state, even if this isn't a nested team. 
+ In particular, we should save any work share state from an outer +@@ -303,14 +321,90 @@ gomp_team_start (void (*fn) (void *), vo + if (__builtin_expect (gomp_nthreads_var_list != NULL, 0) + && thr->ts.level < gomp_nthreads_var_list_len) + nthreads_var = gomp_nthreads_var_list[thr->ts.level]; ++ bind_var = icv->bind_var; ++ if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false) ++ bind_var = flags & 7; ++ bind = bind_var; ++ if (__builtin_expect (gomp_bind_var_list != NULL, 0) ++ && thr->ts.level < gomp_bind_var_list_len) ++ bind_var = gomp_bind_var_list[thr->ts.level]; + gomp_init_task (thr->task, task, icv); + team->implicit_task[0].icv.nthreads_var = nthreads_var; ++ team->implicit_task[0].icv.bind_var = bind_var; + + if (nthreads == 1) + return; + + i = 1; + ++ if (__builtin_expect (gomp_places_list != NULL, 0)) ++ { ++ /* Depending on chosen proc_bind model, set subpartition ++ for the master thread and initialize helper variables ++ P and optionally S, K and/or REST used by later place ++ computation for each additional thread. */ ++ p = thr->place - 1; ++ switch (bind) ++ { ++ case omp_proc_bind_true: ++ case omp_proc_bind_close: ++ if (nthreads > thr->ts.place_partition_len) ++ { ++ /* T > P. S threads will be placed in each place, ++ and the final REM threads placed one by one ++ into the already occupied places. */ ++ s = nthreads / thr->ts.place_partition_len; ++ rest = nthreads % thr->ts.place_partition_len; ++ } ++ else ++ s = 1; ++ k = 1; ++ break; ++ case omp_proc_bind_master: ++ /* Each thread will be bound to master's place. */ ++ break; ++ case omp_proc_bind_spread: ++ if (nthreads <= thr->ts.place_partition_len) ++ { ++ /* T <= P. Each subpartition will have in between s ++ and s+1 places (subpartitions starting at or ++ after rest will have s places, earlier s+1 places), ++ each thread will be bound to the first place in ++ its subpartition (except for the master thread ++ that can be bound to another place in its ++ subpartition). */ ++ s = thr->ts.place_partition_len / nthreads; ++ rest = thr->ts.place_partition_len % nthreads; ++ rest = (s + 1) * rest + thr->ts.place_partition_off; ++ if (p < rest) ++ { ++ p -= (p - thr->ts.place_partition_off) % (s + 1); ++ thr->ts.place_partition_len = s + 1; ++ } ++ else ++ { ++ p -= (p - rest) % s; ++ thr->ts.place_partition_len = s; ++ } ++ thr->ts.place_partition_off = p; ++ } ++ else ++ { ++ /* T > P. Each subpartition will have just a single ++ place and we'll place between s and s+1 ++ threads into each subpartition. */ ++ s = nthreads / thr->ts.place_partition_len; ++ rest = nthreads % thr->ts.place_partition_len; ++ thr->ts.place_partition_off = p; ++ thr->ts.place_partition_len = 1; ++ k = 1; ++ } ++ break; ++ } ++ } ++ else ++ bind = omp_proc_bind_false; ++ + /* We only allow the reuse of idle threads for non-nested PARALLEL + regions. This appears to be implied by the semantics of + threadprivate variables, but perhaps that's reading too much into +@@ -341,47 +435,244 @@ gomp_team_start (void (*fn) (void *), vo + team will exit. */ + pool->threads_used = nthreads; + ++ /* If necessary, expand the size of the gomp_threads array. It is ++ expected that changes in the number of threads are rare, thus we ++ make no effort to expand gomp_threads_size geometrically. */ ++ if (nthreads >= pool->threads_size) ++ { ++ pool->threads_size = nthreads + 1; ++ pool->threads ++ = gomp_realloc (pool->threads, ++ pool->threads_size ++ * sizeof (struct gomp_thread_data *)); ++ } ++ + /* Release existing idle threads. 
*/ + for (; i < n; ++i) + { +- nthr = pool->threads[i]; ++ unsigned int place_partition_off = thr->ts.place_partition_off; ++ unsigned int place_partition_len = thr->ts.place_partition_len; ++ unsigned int place = 0; ++ if (__builtin_expect (gomp_places_list != NULL, 0)) ++ { ++ switch (bind) ++ { ++ case omp_proc_bind_true: ++ case omp_proc_bind_close: ++ if (k == s) ++ { ++ ++p; ++ if (p == (team->prev_ts.place_partition_off ++ + team->prev_ts.place_partition_len)) ++ p = team->prev_ts.place_partition_off; ++ k = 1; ++ if (i == nthreads - rest) ++ s = 1; ++ } ++ else ++ ++k; ++ break; ++ case omp_proc_bind_master: ++ break; ++ case omp_proc_bind_spread: ++ if (k == 0) ++ { ++ /* T <= P. */ ++ if (p < rest) ++ p += s + 1; ++ else ++ p += s; ++ if (p == (team->prev_ts.place_partition_off ++ + team->prev_ts.place_partition_len)) ++ p = team->prev_ts.place_partition_off; ++ place_partition_off = p; ++ if (p < rest) ++ place_partition_len = s + 1; ++ else ++ place_partition_len = s; ++ } ++ else ++ { ++ /* T > P. */ ++ if (k == s) ++ { ++ ++p; ++ if (p == (team->prev_ts.place_partition_off ++ + team->prev_ts.place_partition_len)) ++ p = team->prev_ts.place_partition_off; ++ k = 1; ++ if (i == nthreads - rest) ++ s = 1; ++ } ++ else ++ ++k; ++ place_partition_off = p; ++ place_partition_len = 1; ++ } ++ break; ++ } ++ if (affinity_thr != NULL ++ || (bind != omp_proc_bind_true ++ && pool->threads[i]->place != p + 1) ++ || pool->threads[i]->place <= place_partition_off ++ || pool->threads[i]->place > (place_partition_off ++ + place_partition_len)) ++ { ++ unsigned int l; ++ if (affinity_thr == NULL) ++ { ++ unsigned int j; ++ ++ if (team->prev_ts.place_partition_len > 64) ++ affinity_thr ++ = gomp_malloc (team->prev_ts.place_partition_len ++ * sizeof (struct gomp_thread *)); ++ else ++ affinity_thr ++ = gomp_alloca (team->prev_ts.place_partition_len ++ * sizeof (struct gomp_thread *)); ++ memset (affinity_thr, '\0', ++ team->prev_ts.place_partition_len ++ * sizeof (struct gomp_thread *)); ++ for (j = i; j < old_threads_used; j++) ++ { ++ if (pool->threads[j]->place ++ > team->prev_ts.place_partition_off ++ && (pool->threads[j]->place ++ <= (team->prev_ts.place_partition_off ++ + team->prev_ts.place_partition_len))) ++ { ++ l = pool->threads[j]->place - 1 ++ - team->prev_ts.place_partition_off; ++ pool->threads[j]->data = affinity_thr[l]; ++ affinity_thr[l] = pool->threads[j]; ++ } ++ pool->threads[j] = NULL; ++ } ++ if (nthreads > old_threads_used) ++ memset (&pool->threads[old_threads_used], ++ '\0', ((nthreads - old_threads_used) ++ * sizeof (struct gomp_thread *))); ++ n = nthreads; ++ affinity_count = old_threads_used - i; ++ } ++ if (affinity_count == 0) ++ break; ++ l = p; ++ if (affinity_thr[l - team->prev_ts.place_partition_off] ++ == NULL) ++ { ++ if (bind != omp_proc_bind_true) ++ continue; ++ for (l = place_partition_off; ++ l < place_partition_off + place_partition_len; ++ l++) ++ if (affinity_thr[l - team->prev_ts.place_partition_off] ++ != NULL) ++ break; ++ if (l == place_partition_off + place_partition_len) ++ continue; ++ } ++ nthr = affinity_thr[l - team->prev_ts.place_partition_off]; ++ affinity_thr[l - team->prev_ts.place_partition_off] ++ = (struct gomp_thread *) nthr->data; ++ affinity_count--; ++ pool->threads[i] = nthr; ++ } ++ else ++ nthr = pool->threads[i]; ++ place = p + 1; ++ } ++ else ++ nthr = pool->threads[i]; + nthr->ts.team = team; + nthr->ts.work_share = &team->work_shares[0]; + nthr->ts.last_work_share = NULL; + nthr->ts.team_id = i; + nthr->ts.level = 
team->prev_ts.level + 1; + nthr->ts.active_level = thr->ts.active_level; ++ nthr->ts.place_partition_off = place_partition_off; ++ nthr->ts.place_partition_len = place_partition_len; + #ifdef HAVE_SYNC_BUILTINS + nthr->ts.single_count = 0; + #endif + nthr->ts.static_trip = 0; + nthr->task = &team->implicit_task[i]; ++ nthr->place = place; + gomp_init_task (nthr->task, task, icv); + team->implicit_task[i].icv.nthreads_var = nthreads_var; ++ team->implicit_task[i].icv.bind_var = bind_var; + nthr->fn = fn; + nthr->data = data; + team->ordered_release[i] = &nthr->release; + } + ++ if (__builtin_expect (affinity_thr != NULL, 0)) ++ { ++ /* If AFFINITY_THR is non-NULL just because we had to ++ permute some threads in the pool, but we've managed ++ to find exactly as many old threads as we'd find ++ without affinity, we don't need to handle this ++ specially anymore. */ ++ if (nthreads <= old_threads_used ++ ? (affinity_count == old_threads_used - nthreads) ++ : (i == old_threads_used)) ++ { ++ if (team->prev_ts.place_partition_len > 64) ++ free (affinity_thr); ++ affinity_thr = NULL; ++ affinity_count = 0; ++ } ++ else ++ { ++ i = 1; ++ /* We are going to compute the places/subpartitions ++ again from the beginning. So, we need to reinitialize ++ vars modified by the switch (bind) above inside ++ of the loop, to the state they had after the initial ++ switch (bind). */ ++ switch (bind) ++ { ++ case omp_proc_bind_true: ++ case omp_proc_bind_close: ++ if (nthreads > thr->ts.place_partition_len) ++ /* T > P. S has been changed, so needs ++ to be recomputed. */ ++ s = nthreads / thr->ts.place_partition_len; ++ k = 1; ++ p = thr->place - 1; ++ break; ++ case omp_proc_bind_master: ++ /* No vars have been changed. */ ++ break; ++ case omp_proc_bind_spread: ++ p = thr->ts.place_partition_off; ++ if (k != 0) ++ { ++ /* T > P. */ ++ s = nthreads / team->prev_ts.place_partition_len; ++ k = 1; ++ } ++ break; ++ } ++ ++ /* Increase the barrier threshold to make sure all new ++ threads and all the threads we're going to let die ++ arrive before the team is released. */ ++ if (affinity_count) ++ gomp_barrier_reinit (&pool->threads_dock, ++ nthreads + affinity_count); ++ } ++ } ++ + if (i == nthreads) + goto do_release; + +- /* If necessary, expand the size of the gomp_threads array. It is +- expected that changes in the number of threads are rare, thus we +- make no effort to expand gomp_threads_size geometrically. 
*/ +- if (nthreads >= pool->threads_size) +- { +- pool->threads_size = nthreads + 1; +- pool->threads +- = gomp_realloc (pool->threads, +- pool->threads_size +- * sizeof (struct gomp_thread_data *)); +- } + } + +- if (__builtin_expect (nthreads > old_threads_used, 0)) ++ if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0)) + { +- long diff = (long) nthreads - (long) old_threads_used; ++ long diff = (long) (nthreads + affinity_count) - (long) old_threads_used; + + if (old_threads_used == 0) + --diff; +@@ -389,14 +680,14 @@ gomp_team_start (void (*fn) (void *), vo + #ifdef HAVE_SYNC_BUILTINS + __sync_fetch_and_add (&gomp_managed_threads, diff); + #else +- gomp_mutex_lock (&gomp_remaining_threads_lock); ++ gomp_mutex_lock (&gomp_managed_threads_lock); + gomp_managed_threads += diff; +- gomp_mutex_unlock (&gomp_remaining_threads_lock); ++ gomp_mutex_unlock (&gomp_managed_threads_lock); + #endif + } + + attr = &gomp_thread_attr; +- if (__builtin_expect (gomp_cpu_affinity != NULL, 0)) ++ if (__builtin_expect (gomp_places_list != NULL, 0)) + { + size_t stacksize; + pthread_attr_init (&thread_attr); +@@ -410,11 +701,78 @@ gomp_team_start (void (*fn) (void *), vo + * (nthreads-i)); + + /* Launch new threads. */ +- for (; i < nthreads; ++i, ++start_data) ++ for (; i < nthreads; ++i) + { + pthread_t pt; + int err; + ++ start_data->ts.place_partition_off = thr->ts.place_partition_off; ++ start_data->ts.place_partition_len = thr->ts.place_partition_len; ++ start_data->place = 0; ++ if (__builtin_expect (gomp_places_list != NULL, 0)) ++ { ++ switch (bind) ++ { ++ case omp_proc_bind_true: ++ case omp_proc_bind_close: ++ if (k == s) ++ { ++ ++p; ++ if (p == (team->prev_ts.place_partition_off ++ + team->prev_ts.place_partition_len)) ++ p = team->prev_ts.place_partition_off; ++ k = 1; ++ if (i == nthreads - rest) ++ s = 1; ++ } ++ else ++ ++k; ++ break; ++ case omp_proc_bind_master: ++ break; ++ case omp_proc_bind_spread: ++ if (k == 0) ++ { ++ /* T <= P. */ ++ if (p < rest) ++ p += s + 1; ++ else ++ p += s; ++ if (p == (team->prev_ts.place_partition_off ++ + team->prev_ts.place_partition_len)) ++ p = team->prev_ts.place_partition_off; ++ start_data->ts.place_partition_off = p; ++ if (p < rest) ++ start_data->ts.place_partition_len = s + 1; ++ else ++ start_data->ts.place_partition_len = s; ++ } ++ else ++ { ++ /* T > P. 
*/ ++ if (k == s) ++ { ++ ++p; ++ if (p == (team->prev_ts.place_partition_off ++ + team->prev_ts.place_partition_len)) ++ p = team->prev_ts.place_partition_off; ++ k = 1; ++ if (i == nthreads - rest) ++ s = 1; ++ } ++ else ++ ++k; ++ start_data->ts.place_partition_off = p; ++ start_data->ts.place_partition_len = 1; ++ } ++ break; ++ } ++ start_data->place = p + 1; ++ if (affinity_thr != NULL && pool->threads[i] != NULL) ++ continue; ++ gomp_init_thread_affinity (attr, p); ++ } ++ + start_data->fn = fn; + start_data->fn_data = data; + start_data->ts.team = team; +@@ -430,18 +788,16 @@ gomp_team_start (void (*fn) (void *), vo + start_data->task = &team->implicit_task[i]; + gomp_init_task (start_data->task, task, icv); + team->implicit_task[i].icv.nthreads_var = nthreads_var; ++ team->implicit_task[i].icv.bind_var = bind_var; + start_data->thread_pool = pool; + start_data->nested = nested; + +- if (gomp_cpu_affinity != NULL) +- gomp_init_thread_affinity (attr); +- +- err = pthread_create (&pt, attr, gomp_thread_start, start_data); ++ err = pthread_create (&pt, attr, gomp_thread_start, start_data++); + if (err != 0) + gomp_fatal ("Thread creation failed: %s", strerror (err)); + } + +- if (__builtin_expect (gomp_cpu_affinity != NULL, 0)) ++ if (__builtin_expect (gomp_places_list != NULL, 0)) + pthread_attr_destroy (&thread_attr); + + do_release: +@@ -450,21 +806,32 @@ gomp_team_start (void (*fn) (void *), vo + /* Decrease the barrier threshold to match the number of threads + that should arrive back at the end of this team. The extra + threads should be exiting. Note that we arrange for this test +- to never be true for nested teams. */ +- if (__builtin_expect (nthreads < old_threads_used, 0)) ++ to never be true for nested teams. If AFFINITY_COUNT is non-zero, ++ the barrier as well as gomp_managed_threads was temporarily ++ set to NTHREADS + AFFINITY_COUNT. For NTHREADS < OLD_THREADS_COUNT, ++ AFFINITY_COUNT if non-zero will be always at least ++ OLD_THREADS_COUNT - NTHREADS. */ ++ if (__builtin_expect (nthreads < old_threads_used, 0) ++ || __builtin_expect (affinity_count, 0)) + { + long diff = (long) nthreads - (long) old_threads_used; + ++ if (affinity_count) ++ diff = -affinity_count; ++ + gomp_barrier_reinit (&pool->threads_dock, nthreads); + + #ifdef HAVE_SYNC_BUILTINS + __sync_fetch_and_add (&gomp_managed_threads, diff); + #else +- gomp_mutex_lock (&gomp_remaining_threads_lock); ++ gomp_mutex_lock (&gomp_managed_threads_lock); + gomp_managed_threads += diff; +- gomp_mutex_unlock (&gomp_remaining_threads_lock); ++ gomp_mutex_unlock (&gomp_managed_threads_lock); + #endif + } ++ if (__builtin_expect (affinity_thr != NULL, 0) ++ && team->prev_ts.place_partition_len > 64) ++ free (affinity_thr); + } + + +@@ -477,9 +844,26 @@ gomp_team_end (void) + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + +- /* This barrier handles all pending explicit threads. */ +- gomp_team_barrier_wait (&team->barrier); +- gomp_fini_work_share (thr->ts.work_share); ++ /* This barrier handles all pending explicit threads. ++ As #pragma omp cancel parallel might get awaited count in ++ team->barrier in a inconsistent state, we need to use a different ++ counter here. 
*/ ++ gomp_team_barrier_wait_final (&team->barrier); ++ if (__builtin_expect (team->team_cancelled, 0)) ++ { ++ struct gomp_work_share *ws = team->work_shares_to_free; ++ do ++ { ++ struct gomp_work_share *next_ws = gomp_ptrlock_get (&ws->next_ws); ++ if (next_ws == NULL) ++ gomp_ptrlock_set (&ws->next_ws, ws); ++ gomp_fini_work_share (ws); ++ ws = next_ws; ++ } ++ while (ws != NULL); ++ } ++ else ++ gomp_fini_work_share (thr->ts.work_share); + + gomp_end_task (); + thr->ts = team->prev_ts; +@@ -489,9 +873,9 @@ gomp_team_end (void) + #ifdef HAVE_SYNC_BUILTINS + __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads); + #else +- gomp_mutex_lock (&gomp_remaining_threads_lock); ++ gomp_mutex_lock (&gomp_managed_threads_lock); + gomp_managed_threads -= team->nthreads - 1L; +- gomp_mutex_unlock (&gomp_remaining_threads_lock); ++ gomp_mutex_unlock (&gomp_managed_threads_lock); + #endif + /* This barrier has gomp_barrier_wait_last counterparts + and ensures the team can be safely destroyed. */ +@@ -532,8 +916,6 @@ gomp_team_end (void) + static void __attribute__((constructor)) + initialize_team (void) + { +- struct gomp_thread *thr; +- + #ifndef HAVE_TLS + static struct gomp_thread initial_thread_tls_data; + +@@ -543,13 +925,6 @@ initialize_team (void) + + if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0) + gomp_fatal ("could not create thread pool destructor."); +- +-#ifdef HAVE_TLS +- thr = &gomp_tls_data; +-#else +- thr = &initial_thread_tls_data; +-#endif +- gomp_sem_init (&thr->release, 0); + } + + static void __attribute__((destructor)) +--- libgomp/sections.c (revision 210461) ++++ libgomp/sections.c (revision 210462) +@@ -139,11 +139,27 @@ GOMP_parallel_sections_start (void (*fn) + num_threads = gomp_resolve_num_threads (num_threads, count); + team = gomp_new_team (num_threads); + gomp_sections_init (&team->work_shares[0], count); +- gomp_team_start (fn, data, num_threads, team); ++ gomp_team_start (fn, data, num_threads, 0, team); ++} ++ ++ialias_redirect (GOMP_parallel_end) ++ ++void ++GOMP_parallel_sections (void (*fn) (void *), void *data, ++ unsigned num_threads, unsigned count, unsigned flags) ++{ ++ struct gomp_team *team; ++ ++ num_threads = gomp_resolve_num_threads (num_threads, count); ++ team = gomp_new_team (num_threads); ++ gomp_sections_init (&team->work_shares[0], count); ++ gomp_team_start (fn, data, num_threads, flags, team); ++ fn (data); ++ GOMP_parallel_end (); + } + + /* The GOMP_section_end* routines are called after the thread is told +- that all sections are complete. This first version synchronizes ++ that all sections are complete. The first two versions synchronize + all threads; the nowait version does not. */ + + void +@@ -152,6 +168,12 @@ GOMP_sections_end (void) + gomp_work_share_end (); + } + ++bool ++GOMP_sections_end_cancel (void) ++{ ++ return gomp_work_share_end_cancel (); ++} ++ + void + GOMP_sections_end_nowait (void) + { +--- libgomp/env.c (revision 210461) ++++ libgomp/env.c (revision 210462) +@@ -29,6 +29,10 @@ + #include "libgomp_f.h" + #include + #include ++#include ++#ifdef HAVE_INTTYPES_H ++# include /* For PRIu64. 
*/ ++#endif + #ifdef STRING_WITH_STRINGS + # include + # include +@@ -50,23 +54,28 @@ + + struct gomp_task_icv gomp_global_icv = { + .nthreads_var = 1, ++ .thread_limit_var = UINT_MAX, + .run_sched_var = GFS_DYNAMIC, + .run_sched_modifier = 1, ++ .default_device_var = 0, + .dyn_var = false, +- .nest_var = false ++ .nest_var = false, ++ .bind_var = omp_proc_bind_false, ++ .target_data = NULL + }; + +-unsigned short *gomp_cpu_affinity; +-size_t gomp_cpu_affinity_len; + unsigned long gomp_max_active_levels_var = INT_MAX; +-unsigned long gomp_thread_limit_var = ULONG_MAX; +-unsigned long gomp_remaining_threads_count; ++bool gomp_cancel_var = false; + #ifndef HAVE_SYNC_BUILTINS +-gomp_mutex_t gomp_remaining_threads_lock; ++gomp_mutex_t gomp_managed_threads_lock; + #endif + unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1; + unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var; + unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len; ++char *gomp_bind_var_list; ++unsigned long gomp_bind_var_list_len; ++void **gomp_places_list; ++unsigned long gomp_places_list_len; + + /* Parse the OMP_SCHEDULE environment variable. */ + +@@ -184,6 +193,24 @@ parse_unsigned_long (const char *name, u + return false; + } + ++/* Parse a positive int environment variable. Return true if one was ++ present and it was successfully parsed. */ ++ ++static bool ++parse_int (const char *name, int *pvalue, bool allow_zero) ++{ ++ unsigned long value; ++ if (!parse_unsigned_long (name, &value, allow_zero)) ++ return false; ++ if (value > INT_MAX) ++ { ++ gomp_error ("Invalid value for environment variable %s", name); ++ return false; ++ } ++ *pvalue = (int) value; ++ return true; ++} ++ + /* Parse an unsigned long list environment variable. Return true if one was + present and it was successfully parsed. */ + +@@ -273,6 +300,416 @@ parse_unsigned_long_list (const char *na + return false; + } + ++/* Parse environment variable set to a boolean or list of omp_proc_bind_t ++ enum values. Return true if one was present and it was successfully ++ parsed. */ ++ ++static bool ++parse_bind_var (const char *name, char *p1stvalue, ++ char **pvalues, unsigned long *pnvalues) ++{ ++ char *env; ++ char value = omp_proc_bind_false, *values = NULL; ++ int i; ++ static struct proc_bind_kinds ++ { ++ const char name[7]; ++ const char len; ++ omp_proc_bind_t kind; ++ } kinds[] = ++ { ++ { "false", 5, omp_proc_bind_false }, ++ { "true", 4, omp_proc_bind_true }, ++ { "master", 6, omp_proc_bind_master }, ++ { "close", 5, omp_proc_bind_close }, ++ { "spread", 6, omp_proc_bind_spread } ++ }; ++ ++ env = getenv (name); ++ if (env == NULL) ++ return false; ++ ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env == '\0') ++ goto invalid; ++ ++ for (i = 0; i < 5; i++) ++ if (strncasecmp (env, kinds[i].name, kinds[i].len) == 0) ++ { ++ value = kinds[i].kind; ++ env += kinds[i].len; ++ break; ++ } ++ if (i == 5) ++ goto invalid; ++ ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env != '\0') ++ { ++ if (*env == ',') ++ { ++ unsigned long nvalues = 0, nalloced = 0; ++ ++ if (value == omp_proc_bind_false ++ || value == omp_proc_bind_true) ++ goto invalid; ++ ++ do ++ { ++ env++; ++ if (nvalues == nalloced) ++ { ++ char *n; ++ nalloced = nalloced ? 
nalloced * 2 : 16; ++ n = realloc (values, nalloced); ++ if (n == NULL) ++ { ++ free (values); ++ gomp_error ("Out of memory while trying to parse" ++ " environment variable %s", name); ++ return false; ++ } ++ values = n; ++ if (nvalues == 0) ++ values[nvalues++] = value; ++ } ++ ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env == '\0') ++ goto invalid; ++ ++ for (i = 2; i < 5; i++) ++ if (strncasecmp (env, kinds[i].name, kinds[i].len) == 0) ++ { ++ value = kinds[i].kind; ++ env += kinds[i].len; ++ break; ++ } ++ if (i == 5) ++ goto invalid; ++ ++ values[nvalues++] = value; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env == '\0') ++ break; ++ if (*env != ',') ++ goto invalid; ++ } ++ while (1); ++ *p1stvalue = values[0]; ++ *pvalues = values; ++ *pnvalues = nvalues; ++ return true; ++ } ++ goto invalid; ++ } ++ ++ *p1stvalue = value; ++ return true; ++ ++ invalid: ++ free (values); ++ gomp_error ("Invalid value for environment variable %s", name); ++ return false; ++} ++ ++static bool ++parse_one_place (char **envp, bool *negatep, unsigned long *lenp, ++ long *stridep) ++{ ++ char *env = *envp, *start; ++ void *p = gomp_places_list ? gomp_places_list[gomp_places_list_len] : NULL; ++ unsigned long len = 1; ++ long stride = 1; ++ int pass; ++ bool any_negate = false; ++ *negatep = false; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env == '!') ++ { ++ *negatep = true; ++ ++env; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ } ++ if (*env != '{') ++ return false; ++ ++env; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ start = env; ++ for (pass = 0; pass < (any_negate ? 2 : 1); pass++) ++ { ++ env = start; ++ do ++ { ++ unsigned long this_num, this_len = 1; ++ long this_stride = 1; ++ bool this_negate = (*env == '!'); ++ if (this_negate) ++ { ++ if (gomp_places_list) ++ any_negate = true; ++ ++env; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ } ++ ++ errno = 0; ++ this_num = strtoul (env, &env, 10); ++ if (errno) ++ return false; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env == ':') ++ { ++ ++env; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ errno = 0; ++ this_len = strtoul (env, &env, 10); ++ if (errno || this_len == 0) ++ return false; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env == ':') ++ { ++ ++env; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ errno = 0; ++ this_stride = strtol (env, &env, 10); ++ if (errno) ++ return false; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ } ++ } ++ if (this_negate && this_len != 1) ++ return false; ++ if (gomp_places_list && pass == this_negate) ++ { ++ if (this_negate) ++ { ++ if (!gomp_affinity_remove_cpu (p, this_num)) ++ return false; ++ } ++ else if (!gomp_affinity_add_cpus (p, this_num, this_len, ++ this_stride, false)) ++ return false; ++ } ++ if (*env == '}') ++ break; ++ if (*env != ',') ++ return false; ++ ++env; ++ } ++ while (1); ++ } ++ ++ ++env; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env == ':') ++ { ++ ++env; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ errno = 0; ++ len = strtoul (env, &env, 10); ++ if (errno || len == 0 || len >= 65536) ++ return false; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env == ':') ++ { ++ ++env; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ errno = 0; ++ stride = strtol (env, &env, 10); ++ if (errno) ++ return false; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ } ++ } ++ if (*negatep && len != 1) ++ return 
false; ++ *envp = env; ++ *lenp = len; ++ *stridep = stride; ++ return true; ++} ++ ++static bool ++parse_places_var (const char *name, bool ignore) ++{ ++ char *env = getenv (name), *end; ++ bool any_negate = false; ++ int level = 0; ++ unsigned long count = 0; ++ if (env == NULL) ++ return false; ++ ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env == '\0') ++ goto invalid; ++ ++ if (strncasecmp (env, "threads", 7) == 0) ++ { ++ env += 7; ++ level = 1; ++ } ++ else if (strncasecmp (env, "cores", 5) == 0) ++ { ++ env += 5; ++ level = 2; ++ } ++ else if (strncasecmp (env, "sockets", 7) == 0) ++ { ++ env += 7; ++ level = 3; ++ } ++ if (level) ++ { ++ count = ULONG_MAX; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env != '\0') ++ { ++ if (*env++ != '(') ++ goto invalid; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ ++ errno = 0; ++ count = strtoul (env, &end, 10); ++ if (errno) ++ goto invalid; ++ env = end; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env != ')') ++ goto invalid; ++ ++env; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env != '\0') ++ goto invalid; ++ } ++ ++ if (ignore) ++ return false; ++ ++ return gomp_affinity_init_level (level, count, false); ++ } ++ ++ count = 0; ++ end = env; ++ do ++ { ++ bool negate; ++ unsigned long len; ++ long stride; ++ if (!parse_one_place (&end, &negate, &len, &stride)) ++ goto invalid; ++ if (negate) ++ { ++ if (!any_negate) ++ count++; ++ any_negate = true; ++ } ++ else ++ count += len; ++ if (count > 65536) ++ goto invalid; ++ if (*end == '\0') ++ break; ++ if (*end != ',') ++ goto invalid; ++ end++; ++ } ++ while (1); ++ ++ if (ignore) ++ return false; ++ ++ gomp_places_list_len = 0; ++ gomp_places_list = gomp_affinity_alloc (count, false); ++ if (gomp_places_list == NULL) ++ return false; ++ ++ do ++ { ++ bool negate; ++ unsigned long len; ++ long stride; ++ gomp_affinity_init_place (gomp_places_list[gomp_places_list_len]); ++ if (!parse_one_place (&env, &negate, &len, &stride)) ++ goto invalid; ++ if (negate) ++ { ++ void *p; ++ for (count = 0; count < gomp_places_list_len; count++) ++ if (gomp_affinity_same_place ++ (gomp_places_list[count], ++ gomp_places_list[gomp_places_list_len])) ++ break; ++ if (count == gomp_places_list_len) ++ { ++ gomp_error ("Trying to remove a non-existing place from list " ++ "of places"); ++ goto invalid; ++ } ++ p = gomp_places_list[count]; ++ memmove (&gomp_places_list[count], ++ &gomp_places_list[count + 1], ++ (gomp_places_list_len - count - 1) * sizeof (void *)); ++ --gomp_places_list_len; ++ gomp_places_list[gomp_places_list_len] = p; ++ } ++ else if (len == 1) ++ ++gomp_places_list_len; ++ else ++ { ++ for (count = 0; count < len - 1; count++) ++ if (!gomp_affinity_copy_place ++ (gomp_places_list[gomp_places_list_len + count + 1], ++ gomp_places_list[gomp_places_list_len + count], ++ stride)) ++ goto invalid; ++ gomp_places_list_len += len; ++ } ++ if (*env == '\0') ++ break; ++ env++; ++ } ++ while (1); ++ ++ if (gomp_places_list_len == 0) ++ { ++ gomp_error ("All places have been removed"); ++ goto invalid; ++ } ++ if (!gomp_affinity_finalize_place_list (false)) ++ goto invalid; ++ return true; ++ ++ invalid: ++ free (gomp_places_list); ++ gomp_places_list = NULL; ++ gomp_places_list_len = 0; ++ gomp_error ("Invalid value for environment variable %s", name); ++ return false; ++} ++ + /* Parse the OMP_STACKSIZE environment varible. Return true if one was + present and it was successfully parsed. 
*/ + +@@ -478,86 +915,95 @@ parse_wait_policy (void) + present and it was successfully parsed. */ + + static bool +-parse_affinity (void) ++parse_affinity (bool ignore) + { +- char *env, *end; ++ char *env, *end, *start; ++ int pass; + unsigned long cpu_beg, cpu_end, cpu_stride; +- unsigned short *cpus = NULL; +- size_t allocated = 0, used = 0, needed; ++ size_t count = 0, needed; + + env = getenv ("GOMP_CPU_AFFINITY"); + if (env == NULL) + return false; + +- do ++ start = env; ++ for (pass = 0; pass < 2; pass++) + { +- while (*env == ' ' || *env == '\t') +- env++; +- +- cpu_beg = strtoul (env, &end, 0); +- cpu_end = cpu_beg; +- cpu_stride = 1; +- if (env == end || cpu_beg >= 65536) +- goto invalid; ++ env = start; ++ if (pass == 1) ++ { ++ if (ignore) ++ return false; + +- env = end; +- if (*env == '-') ++ gomp_places_list_len = 0; ++ gomp_places_list = gomp_affinity_alloc (count, true); ++ if (gomp_places_list == NULL) ++ return false; ++ } ++ do + { +- cpu_end = strtoul (++env, &end, 0); +- if (env == end || cpu_end >= 65536 || cpu_end < cpu_beg) ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ ++ errno = 0; ++ cpu_beg = strtoul (env, &end, 0); ++ if (errno || cpu_beg >= 65536) + goto invalid; ++ cpu_end = cpu_beg; ++ cpu_stride = 1; + + env = end; +- if (*env == ':') ++ if (*env == '-') + { +- cpu_stride = strtoul (++env, &end, 0); +- if (env == end || cpu_stride == 0 || cpu_stride >= 65536) ++ errno = 0; ++ cpu_end = strtoul (++env, &end, 0); ++ if (errno || cpu_end >= 65536 || cpu_end < cpu_beg) + goto invalid; + + env = end; +- } +- } ++ if (*env == ':') ++ { ++ errno = 0; ++ cpu_stride = strtoul (++env, &end, 0); ++ if (errno || cpu_stride == 0 || cpu_stride >= 65536) ++ goto invalid; + +- needed = (cpu_end - cpu_beg) / cpu_stride + 1; +- if (used + needed >= allocated) +- { +- unsigned short *new_cpus; ++ env = end; ++ } ++ } + +- if (allocated < 64) +- allocated = 64; +- if (allocated > needed) +- allocated <<= 1; ++ needed = (cpu_end - cpu_beg) / cpu_stride + 1; ++ if (pass == 0) ++ count += needed; + else +- allocated += 2 * needed; +- new_cpus = realloc (cpus, allocated * sizeof (unsigned short)); +- if (new_cpus == NULL) + { +- free (cpus); +- gomp_error ("not enough memory to store GOMP_CPU_AFFINITY list"); +- return false; ++ while (needed--) ++ { ++ void *p = gomp_places_list[gomp_places_list_len]; ++ gomp_affinity_init_place (p); ++ if (gomp_affinity_add_cpus (p, cpu_beg, 1, 0, true)) ++ ++gomp_places_list_len; ++ cpu_beg += cpu_stride; ++ } + } + +- cpus = new_cpus; +- } ++ while (isspace ((unsigned char) *env)) ++ ++env; + +- while (needed--) +- { +- cpus[used++] = cpu_beg; +- cpu_beg += cpu_stride; ++ if (*env == ',') ++ env++; ++ else if (*env == '\0') ++ break; + } +- +- while (*env == ' ' || *env == '\t') +- env++; +- +- if (*env == ',') +- env++; +- else if (*env == '\0') +- break; ++ while (1); + } +- while (1); + +- gomp_cpu_affinity = cpus; +- gomp_cpu_affinity_len = used; ++ if (gomp_places_list_len == 0) ++ { ++ free (gomp_places_list); ++ gomp_places_list = NULL; ++ return false; ++ } + return true; + + invalid: +@@ -565,12 +1011,160 @@ parse_affinity (void) + return false; + } + ++ ++static void ++handle_omp_display_env (unsigned long stacksize, int wait_policy) ++{ ++ const char *env; ++ bool display = false; ++ bool verbose = false; ++ int i; ++ ++ env = getenv ("OMP_DISPLAY_ENV"); ++ if (env == NULL) ++ return; ++ ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (strncasecmp (env, "true", 4) == 0) ++ { ++ display = true; ++ env += 4; ++ } 
++ else if (strncasecmp (env, "false", 5) == 0) ++ { ++ display = false; ++ env += 5; ++ } ++ else if (strncasecmp (env, "verbose", 7) == 0) ++ { ++ display = true; ++ verbose = true; ++ env += 7; ++ } ++ else ++ env = "X"; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env != '\0') ++ gomp_error ("Invalid value for environment variable OMP_DISPLAY_ENV"); ++ ++ if (!display) ++ return; ++ ++ fputs ("\nOPENMP DISPLAY ENVIRONMENT BEGIN\n", stderr); ++ ++ fputs (" _OPENMP = '201307'\n", stderr); ++ fprintf (stderr, " OMP_DYNAMIC = '%s'\n", ++ gomp_global_icv.dyn_var ? "TRUE" : "FALSE"); ++ fprintf (stderr, " OMP_NESTED = '%s'\n", ++ gomp_global_icv.nest_var ? "TRUE" : "FALSE"); ++ ++ fprintf (stderr, " OMP_NUM_THREADS = '%lu", gomp_global_icv.nthreads_var); ++ for (i = 1; i < gomp_nthreads_var_list_len; i++) ++ fprintf (stderr, ",%lu", gomp_nthreads_var_list[i]); ++ fputs ("'\n", stderr); ++ ++ fprintf (stderr, " OMP_SCHEDULE = '"); ++ switch (gomp_global_icv.run_sched_var) ++ { ++ case GFS_RUNTIME: ++ fputs ("RUNTIME", stderr); ++ break; ++ case GFS_STATIC: ++ fputs ("STATIC", stderr); ++ break; ++ case GFS_DYNAMIC: ++ fputs ("DYNAMIC", stderr); ++ break; ++ case GFS_GUIDED: ++ fputs ("GUIDED", stderr); ++ break; ++ case GFS_AUTO: ++ fputs ("AUTO", stderr); ++ break; ++ } ++ fputs ("'\n", stderr); ++ ++ fputs (" OMP_PROC_BIND = '", stderr); ++ switch (gomp_global_icv.bind_var) ++ { ++ case omp_proc_bind_false: ++ fputs ("FALSE", stderr); ++ break; ++ case omp_proc_bind_true: ++ fputs ("TRUE", stderr); ++ break; ++ case omp_proc_bind_master: ++ fputs ("MASTER", stderr); ++ break; ++ case omp_proc_bind_close: ++ fputs ("CLOSE", stderr); ++ break; ++ case omp_proc_bind_spread: ++ fputs ("SPREAD", stderr); ++ break; ++ } ++ for (i = 1; i < gomp_bind_var_list_len; i++) ++ switch (gomp_bind_var_list[i]) ++ { ++ case omp_proc_bind_master: ++ fputs (",MASTER", stderr); ++ break; ++ case omp_proc_bind_close: ++ fputs (",CLOSE", stderr); ++ break; ++ case omp_proc_bind_spread: ++ fputs (",SPREAD", stderr); ++ break; ++ } ++ fputs ("'\n", stderr); ++ fputs (" OMP_PLACES = '", stderr); ++ for (i = 0; i < gomp_places_list_len; i++) ++ { ++ fputs ("{", stderr); ++ gomp_affinity_print_place (gomp_places_list[i]); ++ fputs (i + 1 == gomp_places_list_len ? "}" : "},", stderr); ++ } ++ fputs ("'\n", stderr); ++ ++ fprintf (stderr, " OMP_STACKSIZE = '%lu'\n", stacksize); ++ ++ /* GOMP's default value is actually neither active nor passive. */ ++ fprintf (stderr, " OMP_WAIT_POLICY = '%s'\n", ++ wait_policy > 0 ? "ACTIVE" : "PASSIVE"); ++ fprintf (stderr, " OMP_THREAD_LIMIT = '%u'\n", ++ gomp_global_icv.thread_limit_var); ++ fprintf (stderr, " OMP_MAX_ACTIVE_LEVELS = '%lu'\n", ++ gomp_max_active_levels_var); ++ ++ fprintf (stderr, " OMP_CANCELLATION = '%s'\n", ++ gomp_cancel_var ? 
"TRUE" : "FALSE"); ++ fprintf (stderr, " OMP_DEFAULT_DEVICE = '%d'\n", ++ gomp_global_icv.default_device_var); ++ ++ if (verbose) ++ { ++ fputs (" GOMP_CPU_AFFINITY = ''\n", stderr); ++ fprintf (stderr, " GOMP_STACKSIZE = '%lu'\n", stacksize); ++#ifdef HAVE_INTTYPES_H ++ fprintf (stderr, " GOMP_SPINCOUNT = '%"PRIu64"'\n", ++ (uint64_t) gomp_spin_count_var); ++#else ++ fprintf (stderr, " GOMP_SPINCOUNT = '%lu'\n", ++ (unsigned long) gomp_spin_count_var); ++#endif ++ } ++ ++ fputs ("OPENMP DISPLAY ENVIRONMENT END\n", stderr); ++} ++ ++ + static void __attribute__((constructor)) + initialize_env (void) + { +- unsigned long stacksize; ++ unsigned long thread_limit_var, stacksize; + int wait_policy; +- bool bind_var = false; + + /* Do a compile time check that mkomp_h.pl did good job. */ + omp_check_defines (); +@@ -578,14 +1172,17 @@ initialize_env (void) + parse_schedule (); + parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var); + parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var); +- parse_boolean ("OMP_PROC_BIND", &bind_var); ++ parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var); ++ parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true); + parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var, + true); +- parse_unsigned_long ("OMP_THREAD_LIMIT", &gomp_thread_limit_var, false); +- if (gomp_thread_limit_var != ULONG_MAX) +- gomp_remaining_threads_count = gomp_thread_limit_var - 1; ++ if (parse_unsigned_long ("OMP_THREAD_LIMIT", &thread_limit_var, false)) ++ { ++ gomp_global_icv.thread_limit_var ++ = thread_limit_var > INT_MAX ? UINT_MAX : thread_limit_var; ++ } + #ifndef HAVE_SYNC_BUILTINS +- gomp_mutex_init (&gomp_remaining_threads_lock); ++ gomp_mutex_init (&gomp_managed_threads_lock); + #endif + gomp_init_num_threads (); + gomp_available_cpus = gomp_global_icv.nthreads_var; +@@ -594,7 +1191,34 @@ initialize_env (void) + &gomp_nthreads_var_list, + &gomp_nthreads_var_list_len)) + gomp_global_icv.nthreads_var = gomp_available_cpus; +- if (parse_affinity () || bind_var) ++ bool ignore = false; ++ if (parse_bind_var ("OMP_PROC_BIND", ++ &gomp_global_icv.bind_var, ++ &gomp_bind_var_list, ++ &gomp_bind_var_list_len) ++ && gomp_global_icv.bind_var == omp_proc_bind_false) ++ ignore = true; ++ /* Make sure OMP_PLACES and GOMP_CPU_AFFINITY env vars are always ++ parsed if present in the environment. If OMP_PROC_BIND was set ++ explictly to false, don't populate places list though. If places ++ list was successfully set from OMP_PLACES, only parse but don't process ++ GOMP_CPU_AFFINITY. If OMP_PROC_BIND was not set in the environment, ++ default to OMP_PROC_BIND=true if OMP_PLACES or GOMP_CPU_AFFINITY ++ was successfully parsed into a places list, otherwise to ++ OMP_PROC_BIND=false. 
*/ ++ if (parse_places_var ("OMP_PLACES", ignore)) ++ { ++ if (gomp_global_icv.bind_var == omp_proc_bind_false) ++ gomp_global_icv.bind_var = true; ++ ignore = true; ++ } ++ if (parse_affinity (ignore)) ++ { ++ if (gomp_global_icv.bind_var == omp_proc_bind_false) ++ gomp_global_icv.bind_var = true; ++ ignore = true; ++ } ++ if (gomp_global_icv.bind_var != omp_proc_bind_false) + gomp_init_affinity (); + wait_policy = parse_wait_policy (); + if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var)) +@@ -645,6 +1269,8 @@ initialize_env (void) + if (err != 0) + gomp_error ("Stack size change failed: %s", strerror (err)); + } ++ ++ handle_omp_display_env (stacksize, wait_policy); + } + + +@@ -728,7 +1354,8 @@ omp_get_max_threads (void) + int + omp_get_thread_limit (void) + { +- return gomp_thread_limit_var > INT_MAX ? INT_MAX : gomp_thread_limit_var; ++ struct gomp_task_icv *icv = gomp_icv (false); ++ return icv->thread_limit_var > INT_MAX ? INT_MAX : icv->thread_limit_var; + } + + void +@@ -744,6 +1371,60 @@ omp_get_max_active_levels (void) + return gomp_max_active_levels_var; + } + ++int ++omp_get_cancellation (void) ++{ ++ return gomp_cancel_var; ++} ++ ++omp_proc_bind_t ++omp_get_proc_bind (void) ++{ ++ struct gomp_task_icv *icv = gomp_icv (false); ++ return icv->bind_var; ++} ++ ++void ++omp_set_default_device (int device_num) ++{ ++ struct gomp_task_icv *icv = gomp_icv (true); ++ icv->default_device_var = device_num >= 0 ? device_num : 0; ++} ++ ++int ++omp_get_default_device (void) ++{ ++ struct gomp_task_icv *icv = gomp_icv (false); ++ return icv->default_device_var; ++} ++ ++int ++omp_get_num_devices (void) ++{ ++ return gomp_get_num_devices (); ++} ++ ++int ++omp_get_num_teams (void) ++{ ++ /* Hardcoded to 1 on host, MIC, HSAIL? Maybe variable on PTX. */ ++ return 1; ++} ++ ++int ++omp_get_team_num (void) ++{ ++ /* Hardcoded to 0 on host, MIC, HSAIL? Maybe variable on PTX. */ ++ return 0; ++} ++ ++int ++omp_is_initial_device (void) ++{ ++ /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX. */ ++ return 1; ++} ++ + ialias (omp_set_dynamic) + ialias (omp_set_nested) + ialias (omp_set_num_threads) +@@ -755,3 +1436,11 @@ ialias (omp_get_max_threads) + ialias (omp_get_thread_limit) + ialias (omp_set_max_active_levels) + ialias (omp_get_max_active_levels) ++ialias (omp_get_cancellation) ++ialias (omp_get_proc_bind) ++ialias (omp_set_default_device) ++ialias (omp_get_default_device) ++ialias (omp_get_num_devices) ++ialias (omp_get_num_teams) ++ialias (omp_get_team_num) ++ialias (omp_is_initial_device) +--- libgomp/libgomp.h (revision 210461) ++++ libgomp/libgomp.h (revision 213654) +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005-2013 Free Software Foundation, Inc. ++/* Copyright (C) 2005-2014 Free Software Foundation, Inc. + Contributed by Richard Henderson . + + This file is part of the GNU OpenMP Library (libgomp). +@@ -39,6 +39,7 @@ + + #include + #include ++#include + + #ifdef HAVE_ATTRIBUTE_VISIBILITY + # pragma GCC visibility push(hidden) +@@ -201,6 +202,10 @@ struct gomp_team_state + /* Active nesting level. Only active parallel regions are counted. */ + unsigned active_level; + ++ /* Place-partition-var, offset and length into gomp_places_list array. */ ++ unsigned place_partition_off; ++ unsigned place_partition_len; ++ + #ifdef HAVE_SYNC_BUILTINS + /* Number of single stmts encountered. 
*/ + unsigned long single_count; +@@ -214,30 +219,40 @@ struct gomp_team_state + unsigned long static_trip; + }; + +-/* These are the OpenMP 3.0 Internal Control Variables described in ++struct target_mem_desc; ++ ++/* These are the OpenMP 4.0 Internal Control Variables described in + section 2.3.1. Those described as having one copy per task are + stored within the structure; those described as having one copy + for the whole program are (naturally) global variables. */ +- ++ + struct gomp_task_icv + { + unsigned long nthreads_var; + enum gomp_schedule_type run_sched_var; + int run_sched_modifier; ++ int default_device_var; ++ unsigned int thread_limit_var; + bool dyn_var; + bool nest_var; ++ char bind_var; ++ /* Internal ICV. */ ++ struct target_mem_desc *target_data; + }; + + extern struct gomp_task_icv gomp_global_icv; +-extern unsigned long gomp_thread_limit_var; +-extern unsigned long gomp_remaining_threads_count; + #ifndef HAVE_SYNC_BUILTINS +-extern gomp_mutex_t gomp_remaining_threads_lock; ++extern gomp_mutex_t gomp_managed_threads_lock; + #endif + extern unsigned long gomp_max_active_levels_var; ++extern bool gomp_cancel_var; + extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var; + extern unsigned long gomp_available_cpus, gomp_managed_threads; + extern unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len; ++extern char *gomp_bind_var_list; ++extern unsigned long gomp_bind_var_list_len; ++extern void **gomp_places_list; ++extern unsigned long gomp_places_list_len; + + enum gomp_task_kind + { +@@ -247,6 +262,39 @@ enum gomp_task_kind + GOMP_TASK_TIED + }; + ++struct gomp_task; ++struct gomp_taskgroup; ++struct htab; ++ ++struct gomp_task_depend_entry ++{ ++ void *addr; ++ struct gomp_task_depend_entry *next; ++ struct gomp_task_depend_entry *prev; ++ struct gomp_task *task; ++ bool is_in; ++ bool redundant; ++ bool redundant_out; ++}; ++ ++struct gomp_dependers_vec ++{ ++ size_t n_elem; ++ size_t allocated; ++ struct gomp_task *elem[]; ++}; ++ ++/* Used when in GOMP_taskwait or in gomp_task_maybe_wait_for_dependencies. */ ++ ++struct gomp_taskwait ++{ ++ bool in_taskwait; ++ bool in_depend_wait; ++ size_t n_depend; ++ struct gomp_task *last_parent_depends_on; ++ gomp_sem_t taskwait_sem; ++}; ++ + /* This structure describes a "task" to be run by a thread. */ + + struct gomp_task +@@ -257,14 +305,33 @@ struct gomp_task + struct gomp_task *prev_child; + struct gomp_task *next_queue; + struct gomp_task *prev_queue; ++ struct gomp_task *next_taskgroup; ++ struct gomp_task *prev_taskgroup; ++ struct gomp_taskgroup *taskgroup; ++ struct gomp_dependers_vec *dependers; ++ struct htab *depend_hash; ++ struct gomp_taskwait *taskwait; ++ size_t depend_count; ++ size_t num_dependees; + struct gomp_task_icv icv; + void (*fn) (void *); + void *fn_data; + enum gomp_task_kind kind; +- bool in_taskwait; + bool in_tied_task; + bool final_task; +- gomp_sem_t taskwait_sem; ++ bool copy_ctors_done; ++ bool parent_depends_on; ++ struct gomp_task_depend_entry depend[]; ++}; ++ ++struct gomp_taskgroup ++{ ++ struct gomp_taskgroup *prev; ++ struct gomp_task *children; ++ bool in_taskgroup_wait; ++ bool cancelled; ++ gomp_sem_t taskgroup_sem; ++ size_t num_children; + }; + + /* This structure describes a "team" of threads. These are the threads +@@ -293,6 +360,12 @@ struct gomp_team + of the threads in the team. */ + gomp_sem_t **ordered_release; + ++ /* List of work shares on which gomp_fini_work_share hasn't been ++ called yet. 
If the team hasn't been cancelled, this should be ++ equal to each thr->ts.work_share, but otherwise it can be a possibly ++ long list of workshares. */ ++ struct gomp_work_share *work_shares_to_free; ++ + /* List of gomp_work_share structs chained through next_free fields. + This is populated and taken off only by the first thread in the + team encountering a new work sharing construct, in a critical +@@ -324,8 +397,20 @@ struct gomp_team + + gomp_mutex_t task_lock; + struct gomp_task *task_queue; +- int task_count; +- int task_running_count; ++ /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team. */ ++ unsigned int task_count; ++ /* Number of GOMP_TASK_WAITING tasks currently waiting to be scheduled. */ ++ unsigned int task_queued_count; ++ /* Number of GOMP_TASK_{WAITING,TIED} tasks currently running ++ directly in gomp_barrier_handle_tasks; tasks spawned ++ from e.g. GOMP_taskwait or GOMP_taskgroup_end don't count, even when ++ that is called from a task run from gomp_barrier_handle_tasks. ++ task_running_count should be always <= team->nthreads, ++ and if current task isn't in_tied_task, then it will be ++ even < team->nthreads. */ ++ unsigned int task_running_count; ++ int work_share_cancelled; ++ int team_cancelled; + + /* This array contains structures for implicit tasks. */ + struct gomp_task implicit_task[]; +@@ -350,7 +435,11 @@ struct gomp_thread + /* This semaphore is used for ordered loops. */ + gomp_sem_t release; + +- /* user pthread thread pool */ ++ /* Place this thread is bound to plus one, or zero if not bound ++ to any place. */ ++ unsigned int place; ++ ++ /* User pthread thread pool */ + struct gomp_thread_pool *thread_pool; + }; + +@@ -363,11 +452,23 @@ struct gomp_thread_pool + unsigned threads_size; + unsigned threads_used; + struct gomp_team *last_team; ++ /* Number of threads running in this contention group. */ ++ unsigned long threads_busy; + + /* This barrier holds and releases threads waiting in threads. */ + gomp_barrier_t threads_dock; + }; + ++enum gomp_cancel_kind ++{ ++ GOMP_CANCEL_PARALLEL = 1, ++ GOMP_CANCEL_LOOP = 2, ++ GOMP_CANCEL_FOR = GOMP_CANCEL_LOOP, ++ GOMP_CANCEL_DO = GOMP_CANCEL_LOOP, ++ GOMP_CANCEL_SECTIONS = 4, ++ GOMP_CANCEL_TASKGROUP = 8 ++}; ++ + /* ... and here is that TLS data. */ + + #ifdef HAVE_TLS +@@ -402,17 +503,22 @@ static inline struct gomp_task_icv *gomp + /* The attributes to be used during thread creation. */ + extern pthread_attr_t gomp_thread_attr; + +-/* Other variables. */ +- +-extern unsigned short *gomp_cpu_affinity; +-extern size_t gomp_cpu_affinity_len; +- + /* Function prototypes. 
*/ + + /* affinity.c */ + + extern void gomp_init_affinity (void); +-extern void gomp_init_thread_affinity (pthread_attr_t *); ++extern void gomp_init_thread_affinity (pthread_attr_t *, unsigned int); ++extern void **gomp_affinity_alloc (unsigned long, bool); ++extern void gomp_affinity_init_place (void *); ++extern bool gomp_affinity_add_cpus (void *, unsigned long, unsigned long, ++ long, bool); ++extern bool gomp_affinity_remove_cpu (void *, unsigned long); ++extern bool gomp_affinity_copy_place (void *, void *, long); ++extern bool gomp_affinity_same_place (void *, void *); ++extern bool gomp_affinity_finalize_place_list (bool); ++extern bool gomp_affinity_init_level (int, unsigned long, bool); ++extern void gomp_affinity_print_place (void *); + + /* alloc.c */ + +@@ -486,15 +592,21 @@ extern void gomp_barrier_handle_tasks (g + static void inline + gomp_finish_task (struct gomp_task *task) + { +- gomp_sem_destroy (&task->taskwait_sem); ++ if (__builtin_expect (task->depend_hash != NULL, 0)) ++ free (task->depend_hash); + } + + /* team.c */ + + extern struct gomp_team *gomp_new_team (unsigned); + extern void gomp_team_start (void (*) (void *), void *, unsigned, +- struct gomp_team *); ++ unsigned, struct gomp_team *); + extern void gomp_team_end (void); ++extern void gomp_free_thread (void *); ++ ++/* target.c */ ++ ++extern int gomp_get_num_devices (void); + + /* work.c */ + +@@ -502,6 +614,7 @@ extern void gomp_init_work_share (struct + extern void gomp_fini_work_share (struct gomp_work_share *); + extern bool gomp_work_share_start (bool); + extern void gomp_work_share_end (void); ++extern bool gomp_work_share_end_cancel (void); + extern void gomp_work_share_end_nowait (void); + + static inline void +@@ -524,6 +637,26 @@ gomp_work_share_init_done (void) + #define _LIBGOMP_OMP_LOCK_DEFINED 1 + #include "omp.h.in" + ++typedef enum omp_proc_bind_t ++{ ++ omp_proc_bind_false = 0, ++ omp_proc_bind_true = 1, ++ omp_proc_bind_master = 2, ++ omp_proc_bind_close = 3, ++ omp_proc_bind_spread = 4 ++} omp_proc_bind_t; ++ ++extern int omp_get_cancellation (void) __GOMP_NOTHROW; ++extern omp_proc_bind_t omp_get_proc_bind (void) __GOMP_NOTHROW; ++ ++extern void omp_set_default_device (int) __GOMP_NOTHROW; ++extern int omp_get_default_device (void) __GOMP_NOTHROW; ++extern int omp_get_num_devices (void) __GOMP_NOTHROW; ++extern int omp_get_num_teams (void) __GOMP_NOTHROW; ++extern int omp_get_team_num (void) __GOMP_NOTHROW; ++ ++extern int omp_is_initial_device (void) __GOMP_NOTHROW; ++ + #if !defined (HAVE_ATTRIBUTE_VISIBILITY) \ + || !defined (HAVE_ATTRIBUTE_ALIAS) \ + || !defined (HAVE_AS_SYMVER_DIRECTIVE) \ +@@ -580,11 +713,19 @@ extern int gomp_test_nest_lock_25 (omp_n + #endif + + #ifdef HAVE_ATTRIBUTE_ALIAS ++# define ialias_ulp ialias_str1(__USER_LABEL_PREFIX__) ++# define ialias_str1(x) ialias_str2(x) ++# define ialias_str2(x) #x + # define ialias(fn) \ + extern __typeof (fn) gomp_ialias_##fn \ + __attribute__ ((alias (#fn))) attribute_hidden; ++# define ialias_redirect(fn) \ ++ extern __typeof (fn) fn __asm__ (ialias_ulp "gomp_ialias_" #fn) attribute_hidden; ++# define ialias_call(fn) gomp_ialias_ ## fn + #else + # define ialias(fn) ++# define ialias_redirect(fn) ++# define ialias_call(fn) fn + #endif + + #endif /* LIBGOMP_H */ +--- libgomp/task.c (revision 210461) ++++ libgomp/task.c (revision 213654) +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2007-2013 Free Software Foundation, Inc. ++/* Copyright (C) 2007-2014 Free Software Foundation, Inc. + Contributed by Richard Henderson . 
+ + This file is part of the GNU OpenMP Library (libgomp). +@@ -29,6 +29,33 @@ + #include + #include + ++typedef struct gomp_task_depend_entry *hash_entry_type; ++ ++static inline void * ++htab_alloc (size_t size) ++{ ++ return gomp_malloc (size); ++} ++ ++static inline void ++htab_free (void *ptr) ++{ ++ free (ptr); ++} ++ ++#include "hashtab.h" ++ ++static inline hashval_t ++htab_hash (hash_entry_type element) ++{ ++ return hash_pointer (element->addr); ++} ++ ++static inline bool ++htab_eq (hash_entry_type x, hash_entry_type y) ++{ ++ return x->addr == y->addr; ++} + + /* Create a new task data structure. */ + +@@ -39,11 +66,16 @@ gomp_init_task (struct gomp_task *task, + task->parent = parent_task; + task->icv = *prev_icv; + task->kind = GOMP_TASK_IMPLICIT; +- task->in_taskwait = false; ++ task->taskwait = NULL; + task->in_tied_task = false; + task->final_task = false; ++ task->copy_ctors_done = false; ++ task->parent_depends_on = false; + task->children = NULL; +- gomp_sem_init (&task->taskwait_sem, 0); ++ task->taskgroup = NULL; ++ task->dependers = NULL; ++ task->depend_hash = NULL; ++ task->depend_count = 0; + } + + /* Clean up a task, after completing it. */ +@@ -72,13 +104,16 @@ gomp_clear_parent (struct gomp_task *chi + while (task != children); + } + ++static void gomp_task_maybe_wait_for_dependencies (void **depend); ++ + /* Called when encountering an explicit task directive. If IF_CLAUSE is + false, then we must not delay in executing the task. If UNTIED is true, + then the task may be executed by any member of the team. */ + + void + GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), +- long arg_size, long arg_align, bool if_clause, unsigned flags) ++ long arg_size, long arg_align, bool if_clause, unsigned flags, ++ void **depend) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; +@@ -94,17 +129,35 @@ GOMP_task (void (*fn) (void *), void *da + flags &= ~1; + #endif + ++ /* If parallel or taskgroup has been cancelled, don't start new tasks. */ ++ if (team ++ && (gomp_team_barrier_cancelled (&team->barrier) ++ || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) ++ return; ++ + if (!if_clause || team == NULL + || (thr->task && thr->task->final_task) + || team->task_count > 64 * team->nthreads) + { + struct gomp_task task; + ++ /* If there are depend clauses and earlier deferred sibling tasks ++ with depend clauses, check if there isn't a dependency. If there ++ is, we need to wait for them. There is no need to handle ++ depend clauses for non-deferred tasks other than this, because ++ the parent task is suspended until the child task finishes and thus ++ it can't start further child tasks. 
*/ ++ if ((flags & 8) && thr->task && thr->task->depend_hash) ++ gomp_task_maybe_wait_for_dependencies (depend); ++ + gomp_init_task (&task, thr->task, gomp_icv (false)); + task.kind = GOMP_TASK_IFFALSE; + task.final_task = (thr->task && thr->task->final_task) || (flags & 2); + if (thr->task) +- task.in_tied_task = thr->task->in_tied_task; ++ { ++ task.in_tied_task = thr->task->in_tied_task; ++ task.taskgroup = thr->task->taskgroup; ++ } + thr->task = &task; + if (__builtin_expect (cpyfn != NULL, 0)) + { +@@ -137,27 +190,174 @@ GOMP_task (void (*fn) (void *), void *da + { + struct gomp_task *task; + struct gomp_task *parent = thr->task; ++ struct gomp_taskgroup *taskgroup = parent->taskgroup; + char *arg; + bool do_wake; ++ size_t depend_size = 0; + +- task = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1); +- arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1) ++ if (flags & 8) ++ depend_size = ((uintptr_t) depend[0] ++ * sizeof (struct gomp_task_depend_entry)); ++ task = gomp_malloc (sizeof (*task) + depend_size ++ + arg_size + arg_align - 1); ++ arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1) + & ~(uintptr_t) (arg_align - 1)); + gomp_init_task (task, parent, gomp_icv (false)); + task->kind = GOMP_TASK_IFFALSE; + task->in_tied_task = parent->in_tied_task; ++ task->taskgroup = taskgroup; + thr->task = task; + if (cpyfn) +- cpyfn (arg, data); ++ { ++ cpyfn (arg, data); ++ task->copy_ctors_done = true; ++ } + else + memcpy (arg, data, arg_size); + thr->task = parent; + task->kind = GOMP_TASK_WAITING; + task->fn = fn; + task->fn_data = arg; +- task->in_tied_task = true; + task->final_task = (flags & 2) >> 1; + gomp_mutex_lock (&team->task_lock); ++ /* If parallel or taskgroup has been cancelled, don't start new ++ tasks. */ ++ if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier) ++ || (taskgroup && taskgroup->cancelled)) ++ && !task->copy_ctors_done, 0)) ++ { ++ gomp_mutex_unlock (&team->task_lock); ++ gomp_finish_task (task); ++ free (task); ++ return; ++ } ++ if (taskgroup) ++ taskgroup->num_children++; ++ if (depend_size) ++ { ++ size_t ndepend = (uintptr_t) depend[0]; ++ size_t nout = (uintptr_t) depend[1]; ++ size_t i; ++ hash_entry_type ent; ++ ++ task->depend_count = ndepend; ++ task->num_dependees = 0; ++ if (parent->depend_hash == NULL) ++ parent->depend_hash ++ = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12); ++ for (i = 0; i < ndepend; i++) ++ { ++ task->depend[i].addr = depend[2 + i]; ++ task->depend[i].next = NULL; ++ task->depend[i].prev = NULL; ++ task->depend[i].task = task; ++ task->depend[i].is_in = i >= nout; ++ task->depend[i].redundant = false; ++ task->depend[i].redundant_out = false; ++ ++ hash_entry_type *slot ++ = htab_find_slot (&parent->depend_hash, &task->depend[i], ++ INSERT); ++ hash_entry_type out = NULL, last = NULL; ++ if (*slot) ++ { ++ /* If multiple depends on the same task are the ++ same, all but the first one are redundant. ++ As inout/out come first, if any of them is ++ inout/out, it will win, which is the right ++ semantics. */ ++ if ((*slot)->task == task) ++ { ++ task->depend[i].redundant = true; ++ continue; ++ } ++ for (ent = *slot; ent; ent = ent->next) ++ { ++ if (ent->redundant_out) ++ break; ++ ++ last = ent; ++ ++ /* depend(in:...) doesn't depend on earlier ++ depend(in:...). 
*/ ++ if (i >= nout && ent->is_in) ++ continue; ++ ++ if (!ent->is_in) ++ out = ent; ++ ++ struct gomp_task *tsk = ent->task; ++ if (tsk->dependers == NULL) ++ { ++ tsk->dependers ++ = gomp_malloc (sizeof (struct gomp_dependers_vec) ++ + 6 * sizeof (struct gomp_task *)); ++ tsk->dependers->n_elem = 1; ++ tsk->dependers->allocated = 6; ++ tsk->dependers->elem[0] = task; ++ task->num_dependees++; ++ continue; ++ } ++ /* We already have some other dependency on tsk ++ from earlier depend clause. */ ++ else if (tsk->dependers->n_elem ++ && (tsk->dependers->elem[tsk->dependers->n_elem ++ - 1] ++ == task)) ++ continue; ++ else if (tsk->dependers->n_elem ++ == tsk->dependers->allocated) ++ { ++ tsk->dependers->allocated ++ = tsk->dependers->allocated * 2 + 2; ++ tsk->dependers ++ = gomp_realloc (tsk->dependers, ++ sizeof (struct gomp_dependers_vec) ++ + (tsk->dependers->allocated ++ * sizeof (struct gomp_task *))); ++ } ++ tsk->dependers->elem[tsk->dependers->n_elem++] = task; ++ task->num_dependees++; ++ } ++ task->depend[i].next = *slot; ++ (*slot)->prev = &task->depend[i]; ++ } ++ *slot = &task->depend[i]; ++ ++ /* There is no need to store more than one depend({,in}out:) ++ task per address in the hash table chain for the purpose ++ of creation of deferred tasks, because each out ++ depends on all earlier outs, thus it is enough to record ++ just the last depend({,in}out:). For depend(in:), we need ++ to keep all of the previous ones not terminated yet, because ++ a later depend({,in}out:) might need to depend on all of ++ them. So, if the new task's clause is depend({,in}out:), ++ we know there is at most one other depend({,in}out:) clause ++ in the list (out). For non-deferred tasks we want to see ++ all outs, so they are moved to the end of the chain, ++ after first redundant_out entry all following entries ++ should be redundant_out. 
*/ ++ if (!task->depend[i].is_in && out) ++ { ++ if (out != last) ++ { ++ out->next->prev = out->prev; ++ out->prev->next = out->next; ++ out->next = last->next; ++ out->prev = last; ++ last->next = out; ++ if (out->next) ++ out->next->prev = out; ++ } ++ out->redundant_out = true; ++ } ++ } ++ if (task->num_dependees) ++ { ++ gomp_mutex_unlock (&team->task_lock); ++ return; ++ } ++ } + if (parent->children) + { + task->next_child = parent->children; +@@ -171,6 +371,22 @@ GOMP_task (void (*fn) (void *), void *da + task->prev_child = task; + } + parent->children = task; ++ if (taskgroup) ++ { ++ if (taskgroup->children) ++ { ++ task->next_taskgroup = taskgroup->children; ++ task->prev_taskgroup = taskgroup->children->prev_taskgroup; ++ task->next_taskgroup->prev_taskgroup = task; ++ task->prev_taskgroup->next_taskgroup = task; ++ } ++ else ++ { ++ task->next_taskgroup = task; ++ task->prev_taskgroup = task; ++ } ++ taskgroup->children = task; ++ } + if (team->task_queue) + { + task->next_queue = team->task_queue; +@@ -185,6 +401,7 @@ GOMP_task (void (*fn) (void *), void *da + team->task_queue = task; + } + ++team->task_count; ++ ++team->task_queued_count; + gomp_team_barrier_set_task_pending (&team->barrier); + do_wake = team->task_running_count + !parent->in_tied_task + < team->nthreads; +@@ -194,6 +411,265 @@ GOMP_task (void (*fn) (void *), void *da + } + } + ++static inline bool ++gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent, ++ struct gomp_taskgroup *taskgroup, struct gomp_team *team) ++{ ++ if (parent) ++ { ++ if (parent->children == child_task) ++ parent->children = child_task->next_child; ++ if (__builtin_expect (child_task->parent_depends_on, 0) ++ && parent->taskwait->last_parent_depends_on == child_task) ++ { ++ if (child_task->prev_child->kind == GOMP_TASK_WAITING ++ && child_task->prev_child->parent_depends_on) ++ parent->taskwait->last_parent_depends_on = child_task->prev_child; ++ else ++ parent->taskwait->last_parent_depends_on = NULL; ++ } ++ } ++ if (taskgroup && taskgroup->children == child_task) ++ taskgroup->children = child_task->next_taskgroup; ++ child_task->prev_queue->next_queue = child_task->next_queue; ++ child_task->next_queue->prev_queue = child_task->prev_queue; ++ if (team->task_queue == child_task) ++ { ++ if (child_task->next_queue != child_task) ++ team->task_queue = child_task->next_queue; ++ else ++ team->task_queue = NULL; ++ } ++ child_task->kind = GOMP_TASK_TIED; ++ if (--team->task_queued_count == 0) ++ gomp_team_barrier_clear_task_pending (&team->barrier); ++ if ((gomp_team_barrier_cancelled (&team->barrier) ++ || (taskgroup && taskgroup->cancelled)) ++ && !child_task->copy_ctors_done) ++ return true; ++ return false; ++} ++ ++static void ++gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task) ++{ ++ struct gomp_task *parent = child_task->parent; ++ size_t i; ++ ++ for (i = 0; i < child_task->depend_count; i++) ++ if (!child_task->depend[i].redundant) ++ { ++ if (child_task->depend[i].next) ++ child_task->depend[i].next->prev = child_task->depend[i].prev; ++ if (child_task->depend[i].prev) ++ child_task->depend[i].prev->next = child_task->depend[i].next; ++ else ++ { ++ hash_entry_type *slot ++ = htab_find_slot (&parent->depend_hash, &child_task->depend[i], ++ NO_INSERT); ++ if (*slot != &child_task->depend[i]) ++ abort (); ++ if (child_task->depend[i].next) ++ *slot = child_task->depend[i].next; ++ else ++ htab_clear_slot (parent->depend_hash, slot); ++ } ++ } ++} ++ ++static size_t 
++gomp_task_run_post_handle_dependers (struct gomp_task *child_task, ++ struct gomp_team *team) ++{ ++ struct gomp_task *parent = child_task->parent; ++ size_t i, count = child_task->dependers->n_elem, ret = 0; ++ for (i = 0; i < count; i++) ++ { ++ struct gomp_task *task = child_task->dependers->elem[i]; ++ if (--task->num_dependees != 0) ++ continue; ++ ++ struct gomp_taskgroup *taskgroup = task->taskgroup; ++ if (parent) ++ { ++ if (parent->children) ++ { ++ /* If parent is in gomp_task_maybe_wait_for_dependencies ++ and it doesn't need to wait for this task, put it after ++ all ready to run tasks it needs to wait for. */ ++ if (parent->taskwait && parent->taskwait->last_parent_depends_on ++ && !task->parent_depends_on) ++ { ++ struct gomp_task *last_parent_depends_on ++ = parent->taskwait->last_parent_depends_on; ++ task->next_child = last_parent_depends_on->next_child; ++ task->prev_child = last_parent_depends_on; ++ } ++ else ++ { ++ task->next_child = parent->children; ++ task->prev_child = parent->children->prev_child; ++ parent->children = task; ++ } ++ task->next_child->prev_child = task; ++ task->prev_child->next_child = task; ++ } ++ else ++ { ++ task->next_child = task; ++ task->prev_child = task; ++ parent->children = task; ++ } ++ if (parent->taskwait) ++ { ++ if (parent->taskwait->in_taskwait) ++ { ++ parent->taskwait->in_taskwait = false; ++ gomp_sem_post (&parent->taskwait->taskwait_sem); ++ } ++ else if (parent->taskwait->in_depend_wait) ++ { ++ parent->taskwait->in_depend_wait = false; ++ gomp_sem_post (&parent->taskwait->taskwait_sem); ++ } ++ if (parent->taskwait->last_parent_depends_on == NULL ++ && task->parent_depends_on) ++ parent->taskwait->last_parent_depends_on = task; ++ } ++ } ++ if (taskgroup) ++ { ++ if (taskgroup->children) ++ { ++ task->next_taskgroup = taskgroup->children; ++ task->prev_taskgroup = taskgroup->children->prev_taskgroup; ++ task->next_taskgroup->prev_taskgroup = task; ++ task->prev_taskgroup->next_taskgroup = task; ++ } ++ else ++ { ++ task->next_taskgroup = task; ++ task->prev_taskgroup = task; ++ } ++ taskgroup->children = task; ++ if (taskgroup->in_taskgroup_wait) ++ { ++ taskgroup->in_taskgroup_wait = false; ++ gomp_sem_post (&taskgroup->taskgroup_sem); ++ } ++ } ++ if (team->task_queue) ++ { ++ task->next_queue = team->task_queue; ++ task->prev_queue = team->task_queue->prev_queue; ++ task->next_queue->prev_queue = task; ++ task->prev_queue->next_queue = task; ++ } ++ else ++ { ++ task->next_queue = task; ++ task->prev_queue = task; ++ team->task_queue = task; ++ } ++ ++team->task_count; ++ ++team->task_queued_count; ++ ++ret; ++ } ++ free (child_task->dependers); ++ child_task->dependers = NULL; ++ if (ret > 1) ++ gomp_team_barrier_set_task_pending (&team->barrier); ++ return ret; ++} ++ ++static inline size_t ++gomp_task_run_post_handle_depend (struct gomp_task *child_task, ++ struct gomp_team *team) ++{ ++ if (child_task->depend_count == 0) ++ return 0; ++ ++ /* If parent is gone already, the hash table is freed and nothing ++ will use the hash table anymore, no need to remove anything from it. 
*/ ++ if (child_task->parent != NULL) ++ gomp_task_run_post_handle_depend_hash (child_task); ++ ++ if (child_task->dependers == NULL) ++ return 0; ++ ++ return gomp_task_run_post_handle_dependers (child_task, team); ++} ++ ++static inline void ++gomp_task_run_post_remove_parent (struct gomp_task *child_task) ++{ ++ struct gomp_task *parent = child_task->parent; ++ if (parent == NULL) ++ return; ++ if (__builtin_expect (child_task->parent_depends_on, 0) ++ && --parent->taskwait->n_depend == 0 ++ && parent->taskwait->in_depend_wait) ++ { ++ parent->taskwait->in_depend_wait = false; ++ gomp_sem_post (&parent->taskwait->taskwait_sem); ++ } ++ child_task->prev_child->next_child = child_task->next_child; ++ child_task->next_child->prev_child = child_task->prev_child; ++ if (parent->children != child_task) ++ return; ++ if (child_task->next_child != child_task) ++ parent->children = child_task->next_child; ++ else ++ { ++ /* We access task->children in GOMP_taskwait ++ outside of the task lock mutex region, so ++ need a release barrier here to ensure memory ++ written by child_task->fn above is flushed ++ before the NULL is written. */ ++ __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE); ++ if (parent->taskwait && parent->taskwait->in_taskwait) ++ { ++ parent->taskwait->in_taskwait = false; ++ gomp_sem_post (&parent->taskwait->taskwait_sem); ++ } ++ } ++} ++ ++static inline void ++gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task) ++{ ++ struct gomp_taskgroup *taskgroup = child_task->taskgroup; ++ if (taskgroup == NULL) ++ return; ++ child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup; ++ child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup; ++ if (taskgroup->num_children > 1) ++ --taskgroup->num_children; ++ else ++ { ++ /* We access taskgroup->num_children in GOMP_taskgroup_end ++ outside of the task lock mutex region, so ++ need a release barrier here to ensure memory ++ written by child_task->fn above is flushed ++ before the NULL is written. 
*/ ++ __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE); ++ } ++ if (taskgroup->children != child_task) ++ return; ++ if (child_task->next_taskgroup != child_task) ++ taskgroup->children = child_task->next_taskgroup; ++ else ++ { ++ taskgroup->children = NULL; ++ if (taskgroup->in_taskgroup_wait) ++ { ++ taskgroup->in_taskgroup_wait = false; ++ gomp_sem_post (&taskgroup->taskgroup_sem); ++ } ++ } ++} ++ + void + gomp_barrier_handle_tasks (gomp_barrier_state_t state) + { +@@ -202,6 +678,7 @@ gomp_barrier_handle_tasks (gomp_barrier_ + struct gomp_task *task = thr->task; + struct gomp_task *child_task = NULL; + struct gomp_task *to_free = NULL; ++ int do_wake = 0; + + gomp_mutex_lock (&team->task_lock); + if (gomp_barrier_last_thread (state)) +@@ -218,26 +695,31 @@ gomp_barrier_handle_tasks (gomp_barrier_ + + while (1) + { ++ bool cancelled = false; + if (team->task_queue != NULL) + { +- struct gomp_task *parent; +- + child_task = team->task_queue; +- parent = child_task->parent; +- if (parent && parent->children == child_task) +- parent->children = child_task->next_child; +- child_task->prev_queue->next_queue = child_task->next_queue; +- child_task->next_queue->prev_queue = child_task->prev_queue; +- if (child_task->next_queue != child_task) +- team->task_queue = child_task->next_queue; +- else +- team->task_queue = NULL; +- child_task->kind = GOMP_TASK_TIED; ++ cancelled = gomp_task_run_pre (child_task, child_task->parent, ++ child_task->taskgroup, team); ++ if (__builtin_expect (cancelled, 0)) ++ { ++ if (to_free) ++ { ++ gomp_finish_task (to_free); ++ free (to_free); ++ to_free = NULL; ++ } ++ goto finish_cancelled; ++ } + team->task_running_count++; +- if (team->task_count == team->task_running_count) +- gomp_team_barrier_clear_task_pending (&team->barrier); ++ child_task->in_tied_task = true; + } + gomp_mutex_unlock (&team->task_lock); ++ if (do_wake) ++ { ++ gomp_team_barrier_wake (&team->barrier, do_wake); ++ do_wake = 0; ++ } + if (to_free) + { + gomp_finish_task (to_free); +@@ -255,33 +737,22 @@ gomp_barrier_handle_tasks (gomp_barrier_ + gomp_mutex_lock (&team->task_lock); + if (child_task) + { +- struct gomp_task *parent = child_task->parent; +- if (parent) +- { +- child_task->prev_child->next_child = child_task->next_child; +- child_task->next_child->prev_child = child_task->prev_child; +- if (parent->children == child_task) +- { +- if (child_task->next_child != child_task) +- parent->children = child_task->next_child; +- else +- { +- /* We access task->children in GOMP_taskwait +- outside of the task lock mutex region, so +- need a release barrier here to ensure memory +- written by child_task->fn above is flushed +- before the NULL is written. 
*/ +- __atomic_store_n (&parent->children, NULL, +- MEMMODEL_RELEASE); +- if (parent->in_taskwait) +- gomp_sem_post (&parent->taskwait_sem); +- } +- } +- } ++ finish_cancelled:; ++ size_t new_tasks ++ = gomp_task_run_post_handle_depend (child_task, team); ++ gomp_task_run_post_remove_parent (child_task); + gomp_clear_parent (child_task->children); ++ gomp_task_run_post_remove_taskgroup (child_task); + to_free = child_task; + child_task = NULL; +- team->task_running_count--; ++ if (!cancelled) ++ team->task_running_count--; ++ if (new_tasks > 1) ++ { ++ do_wake = team->nthreads - team->task_running_count; ++ if (do_wake > new_tasks) ++ do_wake = new_tasks; ++ } + if (--team->task_count == 0 + && gomp_team_barrier_waiting_for_tasks (&team->barrier)) + { +@@ -304,9 +775,11 @@ GOMP_taskwait (void) + struct gomp_task *task = thr->task; + struct gomp_task *child_task = NULL; + struct gomp_task *to_free = NULL; ++ struct gomp_taskwait taskwait; ++ int do_wake = 0; + + /* The acquire barrier on load of task->children here synchronizes +- with the write of a NULL in gomp_barrier_handle_tasks. It is ++ with the write of a NULL in gomp_task_run_post_remove_parent. It is + not necessary that we synchronize with other non-NULL writes at + this point, but we must ensure that all writes to memory by a + child thread task work function are seen before we exit from +@@ -315,42 +788,60 @@ GOMP_taskwait (void) + || __atomic_load_n (&task->children, MEMMODEL_ACQUIRE) == NULL) + return; + ++ memset (&taskwait, 0, sizeof (taskwait)); + gomp_mutex_lock (&team->task_lock); + while (1) + { ++ bool cancelled = false; + if (task->children == NULL) + { ++ bool destroy_taskwait = task->taskwait != NULL; ++ task->taskwait = NULL; + gomp_mutex_unlock (&team->task_lock); + if (to_free) + { + gomp_finish_task (to_free); + free (to_free); + } ++ if (destroy_taskwait) ++ gomp_sem_destroy (&taskwait.taskwait_sem); + return; + } + if (task->children->kind == GOMP_TASK_WAITING) + { + child_task = task->children; +- task->children = child_task->next_child; +- child_task->prev_queue->next_queue = child_task->next_queue; +- child_task->next_queue->prev_queue = child_task->prev_queue; +- if (team->task_queue == child_task) ++ cancelled ++ = gomp_task_run_pre (child_task, task, child_task->taskgroup, ++ team); ++ if (__builtin_expect (cancelled, 0)) + { +- if (child_task->next_queue != child_task) +- team->task_queue = child_task->next_queue; +- else +- team->task_queue = NULL; ++ if (to_free) ++ { ++ gomp_finish_task (to_free); ++ free (to_free); ++ to_free = NULL; ++ } ++ goto finish_cancelled; + } +- child_task->kind = GOMP_TASK_TIED; +- team->task_running_count++; +- if (team->task_count == team->task_running_count) +- gomp_team_barrier_clear_task_pending (&team->barrier); + } + else +- /* All tasks we are waiting for are already running +- in other threads. Wait for them. */ +- task->in_taskwait = true; ++ { ++ /* All tasks we are waiting for are already running ++ in other threads. Wait for them. 
*/ ++ if (task->taskwait == NULL) ++ { ++ taskwait.in_depend_wait = false; ++ gomp_sem_init (&taskwait.taskwait_sem, 0); ++ task->taskwait = &taskwait; ++ } ++ taskwait.in_taskwait = true; ++ } + gomp_mutex_unlock (&team->task_lock); ++ if (do_wake) ++ { ++ gomp_team_barrier_wake (&team->barrier, do_wake); ++ do_wake = 0; ++ } + if (to_free) + { + gomp_finish_task (to_free); +@@ -364,14 +855,178 @@ GOMP_taskwait (void) + thr->task = task; + } + else ++ gomp_sem_wait (&taskwait.taskwait_sem); ++ gomp_mutex_lock (&team->task_lock); ++ if (child_task) + { +- gomp_sem_wait (&task->taskwait_sem); +- task->in_taskwait = false; ++ finish_cancelled:; ++ size_t new_tasks ++ = gomp_task_run_post_handle_depend (child_task, team); ++ child_task->prev_child->next_child = child_task->next_child; ++ child_task->next_child->prev_child = child_task->prev_child; ++ if (task->children == child_task) ++ { ++ if (child_task->next_child != child_task) ++ task->children = child_task->next_child; ++ else ++ task->children = NULL; ++ } ++ gomp_clear_parent (child_task->children); ++ gomp_task_run_post_remove_taskgroup (child_task); ++ to_free = child_task; ++ child_task = NULL; ++ team->task_count--; ++ if (new_tasks > 1) ++ { ++ do_wake = team->nthreads - team->task_running_count ++ - !task->in_tied_task; ++ if (do_wake > new_tasks) ++ do_wake = new_tasks; ++ } ++ } ++ } ++} ++ ++/* This is like GOMP_taskwait, but we only wait for tasks that the ++ upcoming task depends on. */ ++ ++static void ++gomp_task_maybe_wait_for_dependencies (void **depend) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_task *task = thr->task; ++ struct gomp_team *team = thr->ts.team; ++ struct gomp_task_depend_entry elem, *ent = NULL; ++ struct gomp_taskwait taskwait; ++ struct gomp_task *last_parent_depends_on = NULL; ++ size_t ndepend = (uintptr_t) depend[0]; ++ size_t nout = (uintptr_t) depend[1]; ++ size_t i; ++ size_t num_awaited = 0; ++ struct gomp_task *child_task = NULL; ++ struct gomp_task *to_free = NULL; ++ int do_wake = 0; ++ ++ gomp_mutex_lock (&team->task_lock); ++ for (i = 0; i < ndepend; i++) ++ { ++ elem.addr = depend[i + 2]; ++ ent = htab_find (task->depend_hash, &elem); ++ for (; ent; ent = ent->next) ++ if (i >= nout && ent->is_in) ++ continue; ++ else ++ { ++ struct gomp_task *tsk = ent->task; ++ if (!tsk->parent_depends_on) ++ { ++ tsk->parent_depends_on = true; ++ ++num_awaited; ++ if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING) ++ { ++ /* If a task we need to wait for is not already ++ running and is ready to be scheduled, move it ++ to front, so that we run it as soon as possible. 
*/ ++ if (last_parent_depends_on) ++ { ++ tsk->prev_child->next_child = tsk->next_child; ++ tsk->next_child->prev_child = tsk->prev_child; ++ tsk->prev_child = last_parent_depends_on; ++ tsk->next_child = last_parent_depends_on->next_child; ++ tsk->prev_child->next_child = tsk; ++ tsk->next_child->prev_child = tsk; ++ } ++ else if (tsk != task->children) ++ { ++ tsk->prev_child->next_child = tsk->next_child; ++ tsk->next_child->prev_child = tsk->prev_child; ++ tsk->prev_child = task->children; ++ tsk->next_child = task->children->next_child; ++ task->children = tsk; ++ tsk->prev_child->next_child = tsk; ++ tsk->next_child->prev_child = tsk; ++ } ++ last_parent_depends_on = tsk; ++ } ++ } ++ } ++ } ++ if (num_awaited == 0) ++ { ++ gomp_mutex_unlock (&team->task_lock); ++ return; ++ } ++ ++ memset (&taskwait, 0, sizeof (taskwait)); ++ taskwait.n_depend = num_awaited; ++ taskwait.last_parent_depends_on = last_parent_depends_on; ++ gomp_sem_init (&taskwait.taskwait_sem, 0); ++ task->taskwait = &taskwait; ++ ++ while (1) ++ { ++ bool cancelled = false; ++ if (taskwait.n_depend == 0) ++ { ++ task->taskwait = NULL; ++ gomp_mutex_unlock (&team->task_lock); ++ if (to_free) ++ { ++ gomp_finish_task (to_free); ++ free (to_free); ++ } ++ gomp_sem_destroy (&taskwait.taskwait_sem); + return; + } ++ if (task->children->kind == GOMP_TASK_WAITING) ++ { ++ child_task = task->children; ++ cancelled ++ = gomp_task_run_pre (child_task, task, child_task->taskgroup, ++ team); ++ if (__builtin_expect (cancelled, 0)) ++ { ++ if (to_free) ++ { ++ gomp_finish_task (to_free); ++ free (to_free); ++ to_free = NULL; ++ } ++ goto finish_cancelled; ++ } ++ } ++ else ++ /* All tasks we are waiting for are already running ++ in other threads. Wait for them. */ ++ taskwait.in_depend_wait = true; ++ gomp_mutex_unlock (&team->task_lock); ++ if (do_wake) ++ { ++ gomp_team_barrier_wake (&team->barrier, do_wake); ++ do_wake = 0; ++ } ++ if (to_free) ++ { ++ gomp_finish_task (to_free); ++ free (to_free); ++ to_free = NULL; ++ } ++ if (child_task) ++ { ++ thr->task = child_task; ++ child_task->fn (child_task->fn_data); ++ thr->task = task; ++ } ++ else ++ gomp_sem_wait (&taskwait.taskwait_sem); + gomp_mutex_lock (&team->task_lock); + if (child_task) + { ++ finish_cancelled:; ++ size_t new_tasks ++ = gomp_task_run_post_handle_depend (child_task, team); ++ if (child_task->parent_depends_on) ++ --taskwait.n_depend; + child_task->prev_child->next_child = child_task->next_child; + child_task->next_child->prev_child = child_task->prev_child; + if (task->children == child_task) +@@ -382,10 +1037,17 @@ GOMP_taskwait (void) + task->children = NULL; + } + gomp_clear_parent (child_task->children); ++ gomp_task_run_post_remove_taskgroup (child_task); + to_free = child_task; + child_task = NULL; + team->task_count--; +- team->task_running_count--; ++ if (new_tasks > 1) ++ { ++ do_wake = team->nthreads - team->task_running_count ++ - !task->in_tied_task; ++ if (do_wake > new_tasks) ++ do_wake = new_tasks; ++ } + } + } + } +@@ -398,6 +1060,151 @@ GOMP_taskyield (void) + /* Nothing at the moment. */ + } + ++void ++GOMP_taskgroup_start (void) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ struct gomp_task *task = thr->task; ++ struct gomp_taskgroup *taskgroup; ++ ++ /* If team is NULL, all tasks are executed as ++ GOMP_TASK_IFFALSE tasks and thus all children tasks of ++ taskgroup and their descendant tasks will be finished ++ by the time GOMP_taskgroup_end is called. 
*/ ++ if (team == NULL) ++ return; ++ taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup)); ++ taskgroup->prev = task->taskgroup; ++ taskgroup->children = NULL; ++ taskgroup->in_taskgroup_wait = false; ++ taskgroup->cancelled = false; ++ taskgroup->num_children = 0; ++ gomp_sem_init (&taskgroup->taskgroup_sem, 0); ++ task->taskgroup = taskgroup; ++} ++ ++void ++GOMP_taskgroup_end (void) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ struct gomp_task *task = thr->task; ++ struct gomp_taskgroup *taskgroup; ++ struct gomp_task *child_task = NULL; ++ struct gomp_task *to_free = NULL; ++ int do_wake = 0; ++ ++ if (team == NULL) ++ return; ++ taskgroup = task->taskgroup; ++ ++ /* The acquire barrier on load of taskgroup->num_children here ++ synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup. ++ It is not necessary that we synchronize with other non-0 writes at ++ this point, but we must ensure that all writes to memory by a ++ child thread task work function are seen before we exit from ++ GOMP_taskgroup_end. */ ++ if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0) ++ goto finish; ++ ++ gomp_mutex_lock (&team->task_lock); ++ while (1) ++ { ++ bool cancelled = false; ++ if (taskgroup->children == NULL) ++ { ++ if (taskgroup->num_children) ++ { ++ if (task->children == NULL) ++ goto do_wait; ++ child_task = task->children; ++ } ++ else ++ { ++ gomp_mutex_unlock (&team->task_lock); ++ if (to_free) ++ { ++ gomp_finish_task (to_free); ++ free (to_free); ++ } ++ goto finish; ++ } ++ } ++ else ++ child_task = taskgroup->children; ++ if (child_task->kind == GOMP_TASK_WAITING) ++ { ++ cancelled ++ = gomp_task_run_pre (child_task, child_task->parent, taskgroup, ++ team); ++ if (__builtin_expect (cancelled, 0)) ++ { ++ if (to_free) ++ { ++ gomp_finish_task (to_free); ++ free (to_free); ++ to_free = NULL; ++ } ++ goto finish_cancelled; ++ } ++ } ++ else ++ { ++ child_task = NULL; ++ do_wait: ++ /* All tasks we are waiting for are already running ++ in other threads. Wait for them. */ ++ taskgroup->in_taskgroup_wait = true; ++ } ++ gomp_mutex_unlock (&team->task_lock); ++ if (do_wake) ++ { ++ gomp_team_barrier_wake (&team->barrier, do_wake); ++ do_wake = 0; ++ } ++ if (to_free) ++ { ++ gomp_finish_task (to_free); ++ free (to_free); ++ to_free = NULL; ++ } ++ if (child_task) ++ { ++ thr->task = child_task; ++ child_task->fn (child_task->fn_data); ++ thr->task = task; ++ } ++ else ++ gomp_sem_wait (&taskgroup->taskgroup_sem); ++ gomp_mutex_lock (&team->task_lock); ++ if (child_task) ++ { ++ finish_cancelled:; ++ size_t new_tasks ++ = gomp_task_run_post_handle_depend (child_task, team); ++ gomp_task_run_post_remove_parent (child_task); ++ gomp_clear_parent (child_task->children); ++ gomp_task_run_post_remove_taskgroup (child_task); ++ to_free = child_task; ++ child_task = NULL; ++ team->task_count--; ++ if (new_tasks > 1) ++ { ++ do_wake = team->nthreads - team->task_running_count ++ - !task->in_tied_task; ++ if (do_wake > new_tasks) ++ do_wake = new_tasks; ++ } ++ } ++ } ++ ++ finish: ++ task->taskgroup = taskgroup->prev; ++ gomp_sem_destroy (&taskgroup->taskgroup_sem); ++ free (taskgroup); ++} ++ + int + omp_in_final (void) + { +--- libgomp/testsuite/libgomp.fortran/lib3.f (revision 210461) ++++ libgomp/testsuite/libgomp.fortran/lib3.f (revision 210462) +@@ -66,6 +66,7 @@ C$OMP END PARALLEL + C$OMP PARALLEL REDUCTION (.OR.:L) IF (.TRUE.) + L = .NOT. 
OMP_IN_PARALLEL () + C$OMP END PARALLEL ++ IF (L) CALL ABORT + + E = OMP_GET_WTIME () + IF (D .GT. E) CALL ABORT +--- libgomp/testsuite/libgomp.fortran/lib1.f90 (revision 210461) ++++ libgomp/testsuite/libgomp.fortran/lib1.f90 (revision 210462) +@@ -66,6 +66,7 @@ + !$omp parallel reduction (.or.:l) if (.true.) + l = .not. omp_in_parallel () + !$omp end parallel ++ if (l) call abort + + e = omp_get_wtime () + if (d .gt. e) call abort +--- libgomp/testsuite/libgomp.fortran/lib2.f (revision 210461) ++++ libgomp/testsuite/libgomp.fortran/lib2.f (revision 210462) +@@ -66,6 +66,7 @@ C$OMP END PARALLEL + C$OMP PARALLEL REDUCTION (.OR.:L) IF (.TRUE.) + L = .NOT. OMP_IN_PARALLEL () + C$OMP END PARALLEL ++ IF (L) CALL ABORT + + E = OMP_GET_WTIME () + IF (D .GT. E) CALL ABORT +--- libgomp/testsuite/libgomp.c/atomic-14.c (revision 210461) ++++ libgomp/testsuite/libgomp.c/atomic-14.c (revision 210462) +@@ -16,7 +16,7 @@ main () + #pragma omp atomic update + x = x + 7; + #pragma omp atomic +- x = x + 7 + 6; ++ x = x + (7 + 6); + #pragma omp atomic update + x = x + 2 * 3; + #pragma omp atomic +@@ -65,7 +65,7 @@ main () + if (v != -8) + abort (); + #pragma omp atomic +- x = x * -4 / 2; ++ x = x * (-4 / 2); + #pragma omp atomic read + v = x; + if (v != 16) +--- libgomp/testsuite/libgomp.c/lib-1.c (revision 210461) ++++ libgomp/testsuite/libgomp.c/lib-1.c (revision 210462) +@@ -85,6 +85,8 @@ main (void) + l = ! omp_in_parallel (); + #pragma omp parallel reduction (|:l) if (1) + l = ! omp_in_parallel (); ++ if (l) ++ abort (); + + e = omp_get_wtime (); + if (d > e) +--- libgomp/loop.c (revision 210461) ++++ libgomp/loop.c (revision 210462) +@@ -439,14 +439,14 @@ static void + gomp_parallel_loop_start (void (*fn) (void *), void *data, + unsigned num_threads, long start, long end, + long incr, enum gomp_schedule_type sched, +- long chunk_size) ++ long chunk_size, unsigned int flags) + { + struct gomp_team *team; + + num_threads = gomp_resolve_num_threads (num_threads, 0); + team = gomp_new_team (num_threads); + gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size); +- gomp_team_start (fn, data, num_threads, team); ++ gomp_team_start (fn, data, num_threads, flags, team); + } + + void +@@ -455,7 +455,7 @@ GOMP_parallel_loop_static_start (void (* + long incr, long chunk_size) + { + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, +- GFS_STATIC, chunk_size); ++ GFS_STATIC, chunk_size, 0); + } + + void +@@ -464,7 +464,7 @@ GOMP_parallel_loop_dynamic_start (void ( + long incr, long chunk_size) + { + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, +- GFS_DYNAMIC, chunk_size); ++ GFS_DYNAMIC, chunk_size, 0); + } + + void +@@ -473,7 +473,7 @@ GOMP_parallel_loop_guided_start (void (* + long incr, long chunk_size) + { + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, +- GFS_GUIDED, chunk_size); ++ GFS_GUIDED, chunk_size, 0); + } + + void +@@ -483,11 +483,59 @@ GOMP_parallel_loop_runtime_start (void ( + { + struct gomp_task_icv *icv = gomp_icv (false); + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, +- icv->run_sched_var, icv->run_sched_modifier); ++ icv->run_sched_var, icv->run_sched_modifier, 0); ++} ++ ++ialias_redirect (GOMP_parallel_end) ++ ++void ++GOMP_parallel_loop_static (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, long end, ++ long incr, long chunk_size, unsigned flags) ++{ ++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, ++ GFS_STATIC, chunk_size, flags); ++ fn 
(data); ++ GOMP_parallel_end (); ++} ++ ++void ++GOMP_parallel_loop_dynamic (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, long end, ++ long incr, long chunk_size, unsigned flags) ++{ ++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, ++ GFS_DYNAMIC, chunk_size, flags); ++ fn (data); ++ GOMP_parallel_end (); ++} ++ ++void ++GOMP_parallel_loop_guided (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, long end, ++ long incr, long chunk_size, unsigned flags) ++{ ++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, ++ GFS_GUIDED, chunk_size, flags); ++ fn (data); ++ GOMP_parallel_end (); ++} ++ ++void ++GOMP_parallel_loop_runtime (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, long end, ++ long incr, unsigned flags) ++{ ++ struct gomp_task_icv *icv = gomp_icv (false); ++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, ++ icv->run_sched_var, icv->run_sched_modifier, ++ flags); ++ fn (data); ++ GOMP_parallel_end (); + } + + /* The GOMP_loop_end* routines are called after the thread is told that +- all loop iterations are complete. This first version synchronizes ++ all loop iterations are complete. The first two versions synchronize + all threads; the nowait version does not. */ + + void +@@ -496,6 +544,12 @@ GOMP_loop_end (void) + gomp_work_share_end (); + } + ++bool ++GOMP_loop_end_cancel (void) ++{ ++ return gomp_work_share_end_cancel (); ++} ++ + void + GOMP_loop_end_nowait (void) + { +--- libgomp/hashtab.h (revision 0) ++++ libgomp/hashtab.h (revision 210462) +@@ -0,0 +1,443 @@ ++/* An expandable hash tables datatype. ++ Copyright (C) 1999-2013 ++ Free Software Foundation, Inc. ++ Contributed by Vladimir Makarov . ++ ++This program is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 2 of the License, or ++(at your option) any later version. ++ ++This program is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with this program; if not, write to the Free Software ++Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. */ ++ ++/* The hash table code copied from include/hashtab.[hc] and adjusted, ++ so that the hash table entries are in the flexible array at the end ++ of the control structure, no callbacks are used and the elements in the ++ table are of the hash_entry_type type. ++ Before including this file, define hash_entry_type type and ++ htab_alloc and htab_free functions. After including it, define ++ htab_hash and htab_eq inline functions. */ ++ ++/* This package implements basic hash table functionality. It is possible ++ to search for an entry, create an entry and destroy an entry. ++ ++ Elements in the table are generic pointers. ++ ++ The size of the table is not fixed; if the occupancy of the table ++ grows too high the hash table will be expanded. ++ ++ The abstract data implementation is based on generalized Algorithm D ++ from Knuth's book "The art of computer programming". Hash table is ++ expanded by creation of new hash table and transferring elements from ++ the old table to the new table. */ ++ ++/* The type for a hash code. 
*/ ++typedef unsigned int hashval_t; ++ ++static inline hashval_t htab_hash (hash_entry_type); ++static inline bool htab_eq (hash_entry_type, hash_entry_type); ++ ++/* This macro defines reserved value for empty table entry. */ ++ ++#define HTAB_EMPTY_ENTRY ((hash_entry_type) 0) ++ ++/* This macro defines reserved value for table entry which contained ++ a deleted element. */ ++ ++#define HTAB_DELETED_ENTRY ((hash_entry_type) 1) ++ ++/* Hash tables are of the following type. The structure ++ (implementation) of this type is not needed for using the hash ++ tables. All work with hash table should be executed only through ++ functions mentioned below. The size of this structure is subject to ++ change. */ ++ ++struct htab { ++ /* Current size (in entries) of the hash table. */ ++ size_t size; ++ ++ /* Current number of elements including also deleted elements. */ ++ size_t n_elements; ++ ++ /* Current number of deleted elements in the table. */ ++ size_t n_deleted; ++ ++ /* Current size (in entries) of the hash table, as an index into the ++ table of primes. */ ++ unsigned int size_prime_index; ++ ++ /* Table itself. */ ++ hash_entry_type entries[]; ++}; ++ ++typedef struct htab *htab_t; ++ ++/* An enum saying whether we insert into the hash table or not. */ ++enum insert_option {NO_INSERT, INSERT}; ++ ++/* Table of primes and multiplicative inverses. ++ ++ Note that these are not minimally reduced inverses. Unlike when generating ++ code to divide by a constant, we want to be able to use the same algorithm ++ all the time. All of these inverses (are implied to) have bit 32 set. ++ ++ For the record, the function that computed the table is in ++ libiberty/hashtab.c. */ ++ ++struct prime_ent ++{ ++ hashval_t prime; ++ hashval_t inv; ++ hashval_t inv_m2; /* inverse of prime-2 */ ++ hashval_t shift; ++}; ++ ++static struct prime_ent const prime_tab[] = { ++ { 7, 0x24924925, 0x9999999b, 2 }, ++ { 13, 0x3b13b13c, 0x745d1747, 3 }, ++ { 31, 0x08421085, 0x1a7b9612, 4 }, ++ { 61, 0x0c9714fc, 0x15b1e5f8, 5 }, ++ { 127, 0x02040811, 0x0624dd30, 6 }, ++ { 251, 0x05197f7e, 0x073260a5, 7 }, ++ { 509, 0x01824366, 0x02864fc8, 8 }, ++ { 1021, 0x00c0906d, 0x014191f7, 9 }, ++ { 2039, 0x0121456f, 0x0161e69e, 10 }, ++ { 4093, 0x00300902, 0x00501908, 11 }, ++ { 8191, 0x00080041, 0x00180241, 12 }, ++ { 16381, 0x000c0091, 0x00140191, 13 }, ++ { 32749, 0x002605a5, 0x002a06e6, 14 }, ++ { 65521, 0x000f00e2, 0x00110122, 15 }, ++ { 131071, 0x00008001, 0x00018003, 16 }, ++ { 262139, 0x00014002, 0x0001c004, 17 }, ++ { 524287, 0x00002001, 0x00006001, 18 }, ++ { 1048573, 0x00003001, 0x00005001, 19 }, ++ { 2097143, 0x00004801, 0x00005801, 20 }, ++ { 4194301, 0x00000c01, 0x00001401, 21 }, ++ { 8388593, 0x00001e01, 0x00002201, 22 }, ++ { 16777213, 0x00000301, 0x00000501, 23 }, ++ { 33554393, 0x00001381, 0x00001481, 24 }, ++ { 67108859, 0x00000141, 0x000001c1, 25 }, ++ { 134217689, 0x000004e1, 0x00000521, 26 }, ++ { 268435399, 0x00000391, 0x000003b1, 27 }, ++ { 536870909, 0x00000019, 0x00000029, 28 }, ++ { 1073741789, 0x0000008d, 0x00000095, 29 }, ++ { 2147483647, 0x00000003, 0x00000007, 30 }, ++ /* Avoid "decimal constant so large it is unsigned" for 4294967291. */ ++ { 0xfffffffb, 0x00000006, 0x00000008, 31 } ++}; ++ ++/* The following function returns an index into the above table of the ++ nearest prime number which is greater than N, and near a power of two. 
*/ ++ ++static unsigned int ++higher_prime_index (unsigned long n) ++{ ++ unsigned int low = 0; ++ unsigned int high = sizeof(prime_tab) / sizeof(prime_tab[0]); ++ ++ while (low != high) ++ { ++ unsigned int mid = low + (high - low) / 2; ++ if (n > prime_tab[mid].prime) ++ low = mid + 1; ++ else ++ high = mid; ++ } ++ ++ /* If we've run out of primes, abort. */ ++ if (n > prime_tab[low].prime) ++ abort (); ++ ++ return low; ++} ++ ++/* Return the current size of given hash table. */ ++ ++static inline size_t ++htab_size (htab_t htab) ++{ ++ return htab->size; ++} ++ ++/* Return the current number of elements in given hash table. */ ++ ++static inline size_t ++htab_elements (htab_t htab) ++{ ++ return htab->n_elements - htab->n_deleted; ++} ++ ++/* Return X % Y. */ ++ ++static inline hashval_t ++htab_mod_1 (hashval_t x, hashval_t y, hashval_t inv, int shift) ++{ ++ /* The multiplicative inverses computed above are for 32-bit types, and ++ requires that we be able to compute a highpart multiply. */ ++ if (sizeof (hashval_t) * __CHAR_BIT__ <= 32) ++ { ++ hashval_t t1, t2, t3, t4, q, r; ++ ++ t1 = ((unsigned long long)x * inv) >> 32; ++ t2 = x - t1; ++ t3 = t2 >> 1; ++ t4 = t1 + t3; ++ q = t4 >> shift; ++ r = x - (q * y); ++ ++ return r; ++ } ++ ++ /* Otherwise just use the native division routines. */ ++ return x % y; ++} ++ ++/* Compute the primary hash for HASH given HTAB's current size. */ ++ ++static inline hashval_t ++htab_mod (hashval_t hash, htab_t htab) ++{ ++ const struct prime_ent *p = &prime_tab[htab->size_prime_index]; ++ return htab_mod_1 (hash, p->prime, p->inv, p->shift); ++} ++ ++/* Compute the secondary hash for HASH given HTAB's current size. */ ++ ++static inline hashval_t ++htab_mod_m2 (hashval_t hash, htab_t htab) ++{ ++ const struct prime_ent *p = &prime_tab[htab->size_prime_index]; ++ return 1 + htab_mod_1 (hash, p->prime - 2, p->inv_m2, p->shift); ++} ++ ++/* Create hash table of size SIZE. */ ++ ++static htab_t ++htab_create (size_t size) ++{ ++ htab_t result; ++ unsigned int size_prime_index; ++ ++ size_prime_index = higher_prime_index (size); ++ size = prime_tab[size_prime_index].prime; ++ ++ result = (htab_t) htab_alloc (sizeof (struct htab) ++ + size * sizeof (hash_entry_type)); ++ result->size = size; ++ result->n_elements = 0; ++ result->n_deleted = 0; ++ result->size_prime_index = size_prime_index; ++ memset (result->entries, 0, size * sizeof (hash_entry_type)); ++ return result; ++} ++ ++/* Similar to htab_find_slot, but without several unwanted side effects: ++ - Does not call htab_eq when it finds an existing entry. ++ - Does not change the count of elements in the hash table. ++ This function also assumes there are no deleted entries in the table. ++ HASH is the hash value for the element to be inserted. */ ++ ++static hash_entry_type * ++find_empty_slot_for_expand (htab_t htab, hashval_t hash) ++{ ++ hashval_t index = htab_mod (hash, htab); ++ size_t size = htab_size (htab); ++ hash_entry_type *slot = htab->entries + index; ++ hashval_t hash2; ++ ++ if (*slot == HTAB_EMPTY_ENTRY) ++ return slot; ++ else if (*slot == HTAB_DELETED_ENTRY) ++ abort (); ++ ++ hash2 = htab_mod_m2 (hash, htab); ++ for (;;) ++ { ++ index += hash2; ++ if (index >= size) ++ index -= size; ++ ++ slot = htab->entries + index; ++ if (*slot == HTAB_EMPTY_ENTRY) ++ return slot; ++ else if (*slot == HTAB_DELETED_ENTRY) ++ abort (); ++ } ++} ++ ++/* The following function changes size of memory allocated for the ++ entries and repeatedly inserts the table elements. 
The occupancy ++ of the table after the call will be about 50%. Naturally the hash ++ table must already exist. Remember also that the place of the ++ table entries is changed. */ ++ ++static htab_t ++htab_expand (htab_t htab) ++{ ++ htab_t nhtab; ++ hash_entry_type *olimit; ++ hash_entry_type *p; ++ size_t osize, elts; ++ ++ osize = htab->size; ++ olimit = htab->entries + osize; ++ elts = htab_elements (htab); ++ ++ /* Resize only when table after removal of unused elements is either ++ too full or too empty. */ ++ if (elts * 2 > osize || (elts * 8 < osize && osize > 32)) ++ nhtab = htab_create (elts * 2); ++ else ++ nhtab = htab_create (osize - 1); ++ nhtab->n_elements = htab->n_elements - htab->n_deleted; ++ ++ p = htab->entries; ++ do ++ { ++ hash_entry_type x = *p; ++ ++ if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY) ++ *find_empty_slot_for_expand (nhtab, htab_hash (x)) = x; ++ ++ p++; ++ } ++ while (p < olimit); ++ ++ htab_free (htab); ++ return nhtab; ++} ++ ++/* This function searches for a hash table entry equal to the given ++ element. It cannot be used to insert or delete an element. */ ++ ++static hash_entry_type ++htab_find (htab_t htab, const hash_entry_type element) ++{ ++ hashval_t index, hash2, hash = htab_hash (element); ++ size_t size; ++ hash_entry_type entry; ++ ++ size = htab_size (htab); ++ index = htab_mod (hash, htab); ++ ++ entry = htab->entries[index]; ++ if (entry == HTAB_EMPTY_ENTRY ++ || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element))) ++ return entry; ++ ++ hash2 = htab_mod_m2 (hash, htab); ++ for (;;) ++ { ++ index += hash2; ++ if (index >= size) ++ index -= size; ++ ++ entry = htab->entries[index]; ++ if (entry == HTAB_EMPTY_ENTRY ++ || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element))) ++ return entry; ++ } ++} ++ ++/* This function searches for a hash table slot containing an entry ++ equal to the given element. To delete an entry, call this with ++ insert=NO_INSERT, then call htab_clear_slot on the slot returned ++ (possibly after doing some checks). To insert an entry, call this ++ with insert=INSERT, then write the value you want into the returned ++ slot. 
*/ ++ ++static hash_entry_type * ++htab_find_slot (htab_t *htabp, const hash_entry_type element, ++ enum insert_option insert) ++{ ++ hash_entry_type *first_deleted_slot; ++ hashval_t index, hash2, hash = htab_hash (element); ++ size_t size; ++ hash_entry_type entry; ++ htab_t htab = *htabp; ++ ++ size = htab_size (htab); ++ if (insert == INSERT && size * 3 <= htab->n_elements * 4) ++ { ++ htab = *htabp = htab_expand (htab); ++ size = htab_size (htab); ++ } ++ ++ index = htab_mod (hash, htab); ++ ++ first_deleted_slot = NULL; ++ ++ entry = htab->entries[index]; ++ if (entry == HTAB_EMPTY_ENTRY) ++ goto empty_entry; ++ else if (entry == HTAB_DELETED_ENTRY) ++ first_deleted_slot = &htab->entries[index]; ++ else if (htab_eq (entry, element)) ++ return &htab->entries[index]; ++ ++ hash2 = htab_mod_m2 (hash, htab); ++ for (;;) ++ { ++ index += hash2; ++ if (index >= size) ++ index -= size; ++ ++ entry = htab->entries[index]; ++ if (entry == HTAB_EMPTY_ENTRY) ++ goto empty_entry; ++ else if (entry == HTAB_DELETED_ENTRY) ++ { ++ if (!first_deleted_slot) ++ first_deleted_slot = &htab->entries[index]; ++ } ++ else if (htab_eq (entry, element)) ++ return &htab->entries[index]; ++ } ++ ++ empty_entry: ++ if (insert == NO_INSERT) ++ return NULL; ++ ++ if (first_deleted_slot) ++ { ++ htab->n_deleted--; ++ *first_deleted_slot = HTAB_EMPTY_ENTRY; ++ return first_deleted_slot; ++ } ++ ++ htab->n_elements++; ++ return &htab->entries[index]; ++} ++ ++/* This function clears a specified slot in a hash table. It is ++ useful when you've already done the lookup and don't want to do it ++ again. */ ++ ++static inline void ++htab_clear_slot (htab_t htab, hash_entry_type *slot) ++{ ++ if (slot < htab->entries || slot >= htab->entries + htab_size (htab) ++ || *slot == HTAB_EMPTY_ENTRY || *slot == HTAB_DELETED_ENTRY) ++ abort (); ++ ++ *slot = HTAB_DELETED_ENTRY; ++ htab->n_deleted++; ++} ++ ++/* Returns a hash code for pointer P. Simplified version of evahash */ ++ ++static inline hashval_t ++hash_pointer (const void *p) ++{ ++ uintptr_t v = (uintptr_t) p; ++ if (sizeof (v) > sizeof (hashval_t)) ++ v ^= v >> (sizeof (uintptr_t) / 2 * __CHAR_BIT__); ++ return v; ++} +--- libgomp/work.c (revision 210461) ++++ libgomp/work.c (revision 210462) +@@ -221,7 +221,10 @@ gomp_work_share_end (void) + if (gomp_barrier_last_thread (bstate)) + { + if (__builtin_expect (thr->ts.last_work_share != NULL, 1)) +- free_work_share (team, thr->ts.last_work_share); ++ { ++ team->work_shares_to_free = thr->ts.work_share; ++ free_work_share (team, thr->ts.last_work_share); ++ } + } + + gomp_team_barrier_wait_end (&team->barrier, bstate); +@@ -229,6 +232,32 @@ gomp_work_share_end (void) + } + + /* The current thread is done with its current work sharing construct. ++ This version implies a cancellable barrier at the end of the work-share. */ ++ ++bool ++gomp_work_share_end_cancel (void) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ gomp_barrier_state_t bstate; ++ ++ /* Cancellable work sharing constructs cannot be orphaned. 
*/ ++ bstate = gomp_barrier_wait_cancel_start (&team->barrier); ++ ++ if (gomp_barrier_last_thread (bstate)) ++ { ++ if (__builtin_expect (thr->ts.last_work_share != NULL, 1)) ++ { ++ team->work_shares_to_free = thr->ts.work_share; ++ free_work_share (team, thr->ts.last_work_share); ++ } ++ } ++ thr->ts.last_work_share = NULL; ++ ++ return gomp_team_barrier_wait_cancel_end (&team->barrier, bstate); ++} ++ ++/* The current thread is done with its current work sharing construct. + This version does NOT imply a barrier at the end of the work-share. */ + + void +@@ -259,6 +288,9 @@ gomp_work_share_end_nowait (void) + #endif + + if (completed == team->nthreads) +- free_work_share (team, thr->ts.last_work_share); ++ { ++ team->work_shares_to_free = thr->ts.work_share; ++ free_work_share (team, thr->ts.last_work_share); ++ } + thr->ts.last_work_share = NULL; + } +--- libgomp/config/linux/proc.c (revision 210461) ++++ libgomp/config/linux/proc.c (revision 210462) +@@ -30,6 +30,7 @@ + #endif + #include "libgomp.h" + #include "proc.h" ++#include + #include + #include + #ifdef HAVE_GETLOADAVG +@@ -39,19 +40,28 @@ + #endif + + #ifdef HAVE_PTHREAD_AFFINITY_NP ++unsigned long gomp_cpuset_size; ++static unsigned long gomp_get_cpuset_size; ++cpu_set_t *gomp_cpusetp; ++ + unsigned long +-gomp_cpuset_popcount (cpu_set_t *cpusetp) ++gomp_cpuset_popcount (unsigned long cpusetsize, cpu_set_t *cpusetp) + { +-#ifdef CPU_COUNT +- /* glibc 2.6 and above provide a macro for this. */ +- return CPU_COUNT (cpusetp); ++#ifdef CPU_COUNT_S ++ /* glibc 2.7 and above provide a macro for this. */ ++ return CPU_COUNT_S (cpusetsize, cpusetp); + #else ++#ifdef CPU_COUNT ++ if (cpusetsize == sizeof (cpu_set_t)) ++ /* glibc 2.6 and above provide a macro for this. */ ++ return CPU_COUNT (cpusetp); ++#endif + size_t i; + unsigned long ret = 0; +- extern int check[sizeof (cpusetp->__bits[0]) == sizeof (unsigned long int)]; ++ extern int check[sizeof (cpusetp->__bits[0]) == sizeof (unsigned long int) ++ ? 1 : -1] __attribute__((unused)); + +- (void) check; +- for (i = 0; i < sizeof (*cpusetp) / sizeof (cpusetp->__bits[0]); i++) ++ for (i = 0; i < cpusetsize / sizeof (cpusetp->__bits[0]); i++) + { + unsigned long int mask = cpusetp->__bits[i]; + if (mask == 0) +@@ -70,16 +80,63 @@ void + gomp_init_num_threads (void) + { + #ifdef HAVE_PTHREAD_AFFINITY_NP +- cpu_set_t cpuset; ++#if defined (_SC_NPROCESSORS_CONF) && defined (CPU_ALLOC_SIZE) ++ gomp_cpuset_size = sysconf (_SC_NPROCESSORS_CONF); ++ gomp_cpuset_size = CPU_ALLOC_SIZE (gomp_cpuset_size); ++#else ++ gomp_cpuset_size = sizeof (cpu_set_t); ++#endif + +- if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset), &cpuset) == 0) ++ gomp_cpusetp = (cpu_set_t *) gomp_malloc (gomp_cpuset_size); ++ do + { +- /* Count only the CPUs this process can use. */ +- gomp_global_icv.nthreads_var = gomp_cpuset_popcount (&cpuset); +- if (gomp_global_icv.nthreads_var == 0) +- gomp_global_icv.nthreads_var = 1; +- return; ++ int ret = pthread_getaffinity_np (pthread_self (), gomp_cpuset_size, ++ gomp_cpusetp); ++ if (ret == 0) ++ { ++ /* Count only the CPUs this process can use. 
*/ ++ gomp_global_icv.nthreads_var ++ = gomp_cpuset_popcount (gomp_cpuset_size, gomp_cpusetp); ++ if (gomp_global_icv.nthreads_var == 0) ++ break; ++ gomp_get_cpuset_size = gomp_cpuset_size; ++#ifdef CPU_ALLOC_SIZE ++ unsigned long i; ++ for (i = gomp_cpuset_size * 8; i; i--) ++ if (CPU_ISSET_S (i - 1, gomp_cpuset_size, gomp_cpusetp)) ++ break; ++ gomp_cpuset_size = CPU_ALLOC_SIZE (i); ++#endif ++ return; ++ } ++ if (ret != EINVAL) ++ break; ++#ifdef CPU_ALLOC_SIZE ++ if (gomp_cpuset_size < sizeof (cpu_set_t)) ++ gomp_cpuset_size = sizeof (cpu_set_t); ++ else ++ gomp_cpuset_size = gomp_cpuset_size * 2; ++ if (gomp_cpuset_size < 8 * sizeof (cpu_set_t)) ++ gomp_cpusetp ++ = (cpu_set_t *) gomp_realloc (gomp_cpusetp, gomp_cpuset_size); ++ else ++ { ++ /* Avoid gomp_fatal if too large memory allocation would be ++ requested, e.g. kernel returning EINVAL all the time. */ ++ void *p = realloc (gomp_cpusetp, gomp_cpuset_size); ++ if (p == NULL) ++ break; ++ gomp_cpusetp = (cpu_set_t *) p; ++ } ++#else ++ break; ++#endif + } ++ while (1); ++ gomp_cpuset_size = 0; ++ gomp_global_icv.nthreads_var = 1; ++ free (gomp_cpusetp); ++ gomp_cpusetp = NULL; + #endif + #ifdef _SC_NPROCESSORS_ONLN + gomp_global_icv.nthreads_var = sysconf (_SC_NPROCESSORS_ONLN); +@@ -90,15 +147,14 @@ static int + get_num_procs (void) + { + #ifdef HAVE_PTHREAD_AFFINITY_NP +- cpu_set_t cpuset; +- +- if (gomp_cpu_affinity == NULL) ++ if (gomp_places_list == NULL) + { + /* Count only the CPUs this process can use. */ +- if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset), +- &cpuset) == 0) ++ if (gomp_cpusetp ++ && pthread_getaffinity_np (pthread_self (), gomp_get_cpuset_size, ++ gomp_cpusetp) == 0) + { +- int ret = gomp_cpuset_popcount (&cpuset); ++ int ret = gomp_cpuset_popcount (gomp_get_cpuset_size, gomp_cpusetp); + return ret != 0 ? ret : 1; + } + } +--- libgomp/config/linux/bar.c (revision 210461) ++++ libgomp/config/linux/bar.c (revision 210462) +@@ -33,11 +33,11 @@ + void + gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) + { +- if (__builtin_expect ((state & 1) != 0, 0)) ++ if (__builtin_expect (state & BAR_WAS_LAST, 0)) + { + /* Next time we'll be awaiting TOTAL threads again. */ + bar->awaited = bar->total; +- __atomic_store_n (&bar->generation, bar->generation + 4, ++ __atomic_store_n (&bar->generation, bar->generation + BAR_INCR, + MEMMODEL_RELEASE); + futex_wake ((int *) &bar->generation, INT_MAX); + } +@@ -66,7 +66,7 @@ void + gomp_barrier_wait_last (gomp_barrier_t *bar) + { + gomp_barrier_state_t state = gomp_barrier_wait_start (bar); +- if (state & 1) ++ if (state & BAR_WAS_LAST) + gomp_barrier_wait_end (bar, state); + } + +@@ -81,40 +81,43 @@ gomp_team_barrier_wait_end (gomp_barrier + { + unsigned int generation, gen; + +- if (__builtin_expect ((state & 1) != 0, 0)) ++ if (__builtin_expect (state & BAR_WAS_LAST, 0)) + { + /* Next time we'll be awaiting TOTAL threads again. 
*/ + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + + bar->awaited = bar->total; ++ team->work_share_cancelled = 0; + if (__builtin_expect (team->task_count, 0)) + { + gomp_barrier_handle_tasks (state); +- state &= ~1; ++ state &= ~BAR_WAS_LAST; + } + else + { +- __atomic_store_n (&bar->generation, state + 3, MEMMODEL_RELEASE); ++ state &= ~BAR_CANCELLED; ++ state += BAR_INCR - BAR_WAS_LAST; ++ __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); + futex_wake ((int *) &bar->generation, INT_MAX); + return; + } + } + + generation = state; ++ state &= ~BAR_CANCELLED; + do + { + do_wait ((int *) &bar->generation, generation); + gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); +- if (__builtin_expect (gen & 1, 0)) ++ if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) + { + gomp_barrier_handle_tasks (state); + gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); + } +- if ((gen & 2) != 0) +- generation |= 2; ++ generation |= gen & BAR_WAITING_FOR_TASK; + } +- while (gen != state + 4); ++ while (gen != state + BAR_INCR); + } + + void +@@ -122,3 +125,86 @@ gomp_team_barrier_wait (gomp_barrier_t * + { + gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); + } ++ ++void ++gomp_team_barrier_wait_final (gomp_barrier_t *bar) ++{ ++ gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar); ++ if (__builtin_expect (state & BAR_WAS_LAST, 0)) ++ bar->awaited_final = bar->total; ++ gomp_team_barrier_wait_end (bar, state); ++} ++ ++bool ++gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar, ++ gomp_barrier_state_t state) ++{ ++ unsigned int generation, gen; ++ ++ if (__builtin_expect (state & BAR_WAS_LAST, 0)) ++ { ++ /* Next time we'll be awaiting TOTAL threads again. */ ++ /* BAR_CANCELLED should never be set in state here, because ++ cancellation means that at least one of the threads has been ++ cancelled, thus on a cancellable barrier we should never see ++ all threads to arrive. 
*/ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ ++ bar->awaited = bar->total; ++ team->work_share_cancelled = 0; ++ if (__builtin_expect (team->task_count, 0)) ++ { ++ gomp_barrier_handle_tasks (state); ++ state &= ~BAR_WAS_LAST; ++ } ++ else ++ { ++ state += BAR_INCR - BAR_WAS_LAST; ++ __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); ++ futex_wake ((int *) &bar->generation, INT_MAX); ++ return false; ++ } ++ } ++ ++ if (__builtin_expect (state & BAR_CANCELLED, 0)) ++ return true; ++ ++ generation = state; ++ do ++ { ++ do_wait ((int *) &bar->generation, generation); ++ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); ++ if (__builtin_expect (gen & BAR_CANCELLED, 0)) ++ return true; ++ if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) ++ { ++ gomp_barrier_handle_tasks (state); ++ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); ++ } ++ generation |= gen & BAR_WAITING_FOR_TASK; ++ } ++ while (gen != state + BAR_INCR); ++ ++ return false; ++} ++ ++bool ++gomp_team_barrier_wait_cancel (gomp_barrier_t *bar) ++{ ++ return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar)); ++} ++ ++void ++gomp_team_barrier_cancel (struct gomp_team *team) ++{ ++ gomp_mutex_lock (&team->task_lock); ++ if (team->barrier.generation & BAR_CANCELLED) ++ { ++ gomp_mutex_unlock (&team->task_lock); ++ return; ++ } ++ team->barrier.generation |= BAR_CANCELLED; ++ gomp_mutex_unlock (&team->task_lock); ++ futex_wake ((int *) &team->barrier.generation, INT_MAX); ++} +--- libgomp/config/linux/proc.h (revision 210461) ++++ libgomp/config/linux/proc.h (revision 210462) +@@ -28,7 +28,10 @@ + #include + + #ifdef HAVE_PTHREAD_AFFINITY_NP +-extern unsigned long gomp_cpuset_popcount (cpu_set_t *); ++extern unsigned long gomp_cpuset_size attribute_hidden; ++extern cpu_set_t *gomp_cpusetp attribute_hidden; ++extern unsigned long gomp_cpuset_popcount (unsigned long, cpu_set_t *) ++ attribute_hidden; + #endif + + #endif /* GOMP_PROC_H */ +--- libgomp/config/linux/affinity.c (revision 210461) ++++ libgomp/config/linux/affinity.c (revision 210462) +@@ -29,90 +29,327 @@ + #endif + #include "libgomp.h" + #include "proc.h" ++#include + #include ++#include ++#include + #include + + #ifdef HAVE_PTHREAD_AFFINITY_NP + +-static unsigned int affinity_counter; ++#ifndef CPU_ALLOC_SIZE ++#define CPU_ISSET_S(idx, size, set) CPU_ISSET(idx, set) ++#define CPU_ZERO_S(size, set) CPU_ZERO(set) ++#define CPU_SET_S(idx, size, set) CPU_SET(idx, set) ++#define CPU_CLR_S(idx, size, set) CPU_CLR(idx, set) ++#endif + + void + gomp_init_affinity (void) + { +- cpu_set_t cpuset, cpusetnew; +- size_t idx, widx; +- unsigned long cpus = 0; +- +- if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset), &cpuset)) +- { +- gomp_error ("could not get CPU affinity set"); +- free (gomp_cpu_affinity); +- gomp_cpu_affinity = NULL; +- gomp_cpu_affinity_len = 0; +- return; +- } +- +- CPU_ZERO (&cpusetnew); +- if (gomp_cpu_affinity_len == 0) +- { +- unsigned long count = gomp_cpuset_popcount (&cpuset); +- if (count >= 65536) +- count = 65536; +- gomp_cpu_affinity = malloc (count * sizeof (unsigned short)); +- if (gomp_cpu_affinity == NULL) ++ if (gomp_places_list == NULL) ++ { ++ if (!gomp_affinity_init_level (1, ULONG_MAX, true)) ++ return; ++ } ++ ++ struct gomp_thread *thr = gomp_thread (); ++ pthread_setaffinity_np (pthread_self (), gomp_cpuset_size, ++ (cpu_set_t *) gomp_places_list[0]); ++ thr->place = 1; ++ thr->ts.place_partition_off = 0; ++ 
thr->ts.place_partition_len = gomp_places_list_len; ++} ++ ++void ++gomp_init_thread_affinity (pthread_attr_t *attr, unsigned int place) ++{ ++ pthread_attr_setaffinity_np (attr, gomp_cpuset_size, ++ (cpu_set_t *) gomp_places_list[place]); ++} ++ ++void ** ++gomp_affinity_alloc (unsigned long count, bool quiet) ++{ ++ unsigned long i; ++ void **ret; ++ char *p; ++ ++ if (gomp_cpusetp == NULL) ++ { ++ if (!quiet) ++ gomp_error ("Could not get CPU affinity set"); ++ return NULL; ++ } ++ ++ ret = malloc (count * sizeof (void *) + count * gomp_cpuset_size); ++ if (ret == NULL) ++ { ++ if (!quiet) ++ gomp_error ("Out of memory trying to allocate places list"); ++ return NULL; ++ } ++ ++ p = (char *) (ret + count); ++ for (i = 0; i < count; i++, p += gomp_cpuset_size) ++ ret[i] = p; ++ return ret; ++} ++ ++void ++gomp_affinity_init_place (void *p) ++{ ++ cpu_set_t *cpusetp = (cpu_set_t *) p; ++ CPU_ZERO_S (gomp_cpuset_size, cpusetp); ++} ++ ++bool ++gomp_affinity_add_cpus (void *p, unsigned long num, ++ unsigned long len, long stride, bool quiet) ++{ ++ cpu_set_t *cpusetp = (cpu_set_t *) p; ++ unsigned long max = 8 * gomp_cpuset_size; ++ for (;;) ++ { ++ if (num >= max) + { +- gomp_error ("not enough memory to store CPU affinity list"); +- return; ++ if (!quiet) ++ gomp_error ("Logical CPU number %lu out of range", num); ++ return false; + } +- for (widx = idx = 0; widx < count && idx < 65536; idx++) +- if (CPU_ISSET (idx, &cpuset)) ++ CPU_SET_S (num, gomp_cpuset_size, cpusetp); ++ if (--len == 0) ++ return true; ++ if ((stride < 0 && num + stride > num) ++ || (stride > 0 && num + stride < num)) ++ { ++ if (!quiet) ++ gomp_error ("Logical CPU number %lu+%ld out of range", ++ num, stride); ++ return false; ++ } ++ num += stride; ++ } ++} ++ ++bool ++gomp_affinity_remove_cpu (void *p, unsigned long num) ++{ ++ cpu_set_t *cpusetp = (cpu_set_t *) p; ++ if (num >= 8 * gomp_cpuset_size) ++ { ++ gomp_error ("Logical CPU number %lu out of range", num); ++ return false; ++ } ++ if (!CPU_ISSET_S (num, gomp_cpuset_size, cpusetp)) ++ { ++ gomp_error ("Logical CPU %lu to be removed is not in the set", num); ++ return false; ++ } ++ CPU_CLR_S (num, gomp_cpuset_size, cpusetp); ++ return true; ++} ++ ++bool ++gomp_affinity_copy_place (void *p, void *q, long stride) ++{ ++ unsigned long i, max = 8 * gomp_cpuset_size; ++ cpu_set_t *destp = (cpu_set_t *) p; ++ cpu_set_t *srcp = (cpu_set_t *) q; ++ ++ CPU_ZERO_S (gomp_cpuset_size, destp); ++ for (i = 0; i < max; i++) ++ if (CPU_ISSET_S (i, gomp_cpuset_size, srcp)) ++ { ++ if ((stride < 0 && i + stride > i) ++ || (stride > 0 && (i + stride < i || i + stride >= max))) ++ { ++ gomp_error ("Logical CPU number %lu+%ld out of range", i, stride); ++ return false; ++ } ++ CPU_SET_S (i + stride, gomp_cpuset_size, destp); ++ } ++ return true; ++} ++ ++bool ++gomp_affinity_same_place (void *p, void *q) ++{ ++#ifdef CPU_EQUAL_S ++ return CPU_EQUAL_S (gomp_cpuset_size, (cpu_set_t *) p, (cpu_set_t *) q); ++#else ++ return memcmp (p, q, gomp_cpuset_size) == 0; ++#endif ++} ++ ++bool ++gomp_affinity_finalize_place_list (bool quiet) ++{ ++ unsigned long i, j; ++ ++ for (i = 0, j = 0; i < gomp_places_list_len; i++) ++ { ++ cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[i]; ++ bool nonempty = false; ++#ifdef CPU_AND_S ++ CPU_AND_S (gomp_cpuset_size, cpusetp, cpusetp, gomp_cpusetp); ++ nonempty = gomp_cpuset_popcount (gomp_cpuset_size, cpusetp) != 0; ++#else ++ unsigned long k, max = gomp_cpuset_size / sizeof (cpusetp->__bits[0]); ++ for (k = 0; k < max; k++) ++ if 
((cpusetp->__bits[k] &= gomp_cpusetp->__bits[k]) != 0) ++ nonempty = true; ++#endif ++ if (nonempty) ++ gomp_places_list[j++] = gomp_places_list[i]; ++ } ++ ++ if (j == 0) ++ { ++ if (!quiet) ++ gomp_error ("None of the places contain usable logical CPUs"); ++ return false; ++ } ++ else if (j < gomp_places_list_len) ++ { ++ if (!quiet) ++ gomp_error ("Number of places reduced from %ld to %ld because some " ++ "places didn't contain any usable logical CPUs", ++ gomp_places_list_len, j); ++ gomp_places_list_len = j; ++ } ++ return true; ++} ++ ++bool ++gomp_affinity_init_level (int level, unsigned long count, bool quiet) ++{ ++ unsigned long i, max = 8 * gomp_cpuset_size; ++ ++ if (gomp_cpusetp) ++ { ++ unsigned long maxcount ++ = gomp_cpuset_popcount (gomp_cpuset_size, gomp_cpusetp); ++ if (count > maxcount) ++ count = maxcount; ++ } ++ gomp_places_list = gomp_affinity_alloc (count, quiet); ++ gomp_places_list_len = 0; ++ if (gomp_places_list == NULL) ++ return false; ++ /* SMT (threads). */ ++ if (level == 1) ++ { ++ for (i = 0; i < max && gomp_places_list_len < count; i++) ++ if (CPU_ISSET_S (i, gomp_cpuset_size, gomp_cpusetp)) + { +- cpus++; +- gomp_cpu_affinity[widx++] = idx; ++ gomp_affinity_init_place (gomp_places_list[gomp_places_list_len]); ++ gomp_affinity_add_cpus (gomp_places_list[gomp_places_list_len], ++ i, 1, 0, true); ++ ++gomp_places_list_len; + } ++ return true; + } + else +- for (widx = idx = 0; idx < gomp_cpu_affinity_len; idx++) +- if (gomp_cpu_affinity[idx] < CPU_SETSIZE +- && CPU_ISSET (gomp_cpu_affinity[idx], &cpuset)) ++ { ++ char name[sizeof ("/sys/devices/system/cpu/cpu/topology/" ++ "thread_siblings_list") + 3 * sizeof (unsigned long)]; ++ size_t prefix_len = sizeof ("/sys/devices/system/cpu/cpu") - 1; ++ cpu_set_t *copy = gomp_alloca (gomp_cpuset_size); ++ FILE *f; ++ char *line = NULL; ++ size_t linelen = 0; ++ ++ memcpy (name, "/sys/devices/system/cpu/cpu", prefix_len); ++ memcpy (copy, gomp_cpusetp, gomp_cpuset_size); ++ for (i = 0; i < max && gomp_places_list_len < count; i++) ++ if (CPU_ISSET_S (i, gomp_cpuset_size, copy)) ++ { ++ sprintf (name + prefix_len, "%lu/topology/%s_siblings_list", ++ i, level == 2 ? "thread" : "core"); ++ f = fopen (name, "r"); ++ if (f != NULL) ++ { ++ if (getline (&line, &linelen, f) > 0) ++ { ++ char *p = line; ++ bool seen_i = false; ++ void *pl = gomp_places_list[gomp_places_list_len]; ++ gomp_affinity_init_place (pl); ++ while (*p && *p != '\n') ++ { ++ unsigned long first, last; ++ errno = 0; ++ first = strtoul (p, &p, 10); ++ if (errno) ++ break; ++ last = first; ++ if (*p == '-') ++ { ++ errno = 0; ++ last = strtoul (p + 1, &p, 10); ++ if (errno || last < first) ++ break; ++ } ++ for (; first <= last; first++) ++ if (CPU_ISSET_S (first, gomp_cpuset_size, copy) ++ && gomp_affinity_add_cpus (pl, first, 1, 0, ++ true)) ++ { ++ CPU_CLR_S (first, gomp_cpuset_size, copy); ++ if (first == i) ++ seen_i = true; ++ } ++ if (*p == ',') ++ ++p; ++ } ++ if (seen_i) ++ gomp_places_list_len++; ++ } ++ fclose (f); ++ } ++ } ++ if (gomp_places_list_len == 0) + { +- if (! CPU_ISSET (gomp_cpu_affinity[idx], &cpusetnew)) +- { +- cpus++; +- CPU_SET (gomp_cpu_affinity[idx], &cpusetnew); +- } +- gomp_cpu_affinity[widx++] = gomp_cpu_affinity[idx]; ++ if (!quiet) ++ gomp_error ("Error reading %s topology", ++ level == 2 ? 
"core" : "socket"); ++ free (gomp_places_list); ++ gomp_places_list = NULL; ++ return false; + } +- +- if (widx == 0) +- { +- gomp_error ("no CPUs left for affinity setting"); +- free (gomp_cpu_affinity); +- gomp_cpu_affinity = NULL; +- gomp_cpu_affinity_len = 0; +- return; +- } +- +- gomp_cpu_affinity_len = widx; +- if (cpus < gomp_available_cpus) +- gomp_available_cpus = cpus; +- CPU_ZERO (&cpuset); +- CPU_SET (gomp_cpu_affinity[0], &cpuset); +- pthread_setaffinity_np (pthread_self (), sizeof (cpuset), &cpuset); +- affinity_counter = 1; ++ return true; ++ } ++ return false; + } + + void +-gomp_init_thread_affinity (pthread_attr_t *attr) ++gomp_affinity_print_place (void *p) + { +- unsigned int cpu; +- cpu_set_t cpuset; ++ unsigned long i, max = 8 * gomp_cpuset_size, len; ++ cpu_set_t *cpusetp = (cpu_set_t *) p; ++ bool notfirst = false; + +- cpu = __atomic_fetch_add (&affinity_counter, 1, MEMMODEL_RELAXED); +- cpu %= gomp_cpu_affinity_len; +- CPU_ZERO (&cpuset); +- CPU_SET (gomp_cpu_affinity[cpu], &cpuset); +- pthread_attr_setaffinity_np (attr, sizeof (cpu_set_t), &cpuset); ++ for (i = 0, len = 0; i < max; i++) ++ if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp)) ++ { ++ if (len == 0) ++ { ++ if (notfirst) ++ fputc (',', stderr); ++ notfirst = true; ++ fprintf (stderr, "%lu", i); ++ } ++ ++len; ++ } ++ else ++ { ++ if (len > 1) ++ fprintf (stderr, ":%lu", len); ++ len = 0; ++ } ++ if (len > 1) ++ fprintf (stderr, ":%lu", len); + } + + #else +--- libgomp/config/linux/bar.h (revision 210461) ++++ libgomp/config/linux/bar.h (revision 210462) +@@ -38,13 +38,25 @@ typedef struct + unsigned total __attribute__((aligned (64))); + unsigned generation; + unsigned awaited __attribute__((aligned (64))); ++ unsigned awaited_final; + } gomp_barrier_t; ++ + typedef unsigned int gomp_barrier_state_t; + ++/* The generation field contains a counter in the high bits, with a few ++ low bits dedicated to flags. Note that TASK_PENDING and WAS_LAST can ++ share space because WAS_LAST is never stored back to generation. 
*/ ++#define BAR_TASK_PENDING 1 ++#define BAR_WAS_LAST 1 ++#define BAR_WAITING_FOR_TASK 2 ++#define BAR_CANCELLED 4 ++#define BAR_INCR 8 ++ + static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count) + { + bar->total = count; + bar->awaited = count; ++ bar->awaited_final = count; + bar->generation = 0; + } + +@@ -62,27 +74,55 @@ extern void gomp_barrier_wait (gomp_barr + extern void gomp_barrier_wait_last (gomp_barrier_t *); + extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t); + extern void gomp_team_barrier_wait (gomp_barrier_t *); ++extern void gomp_team_barrier_wait_final (gomp_barrier_t *); + extern void gomp_team_barrier_wait_end (gomp_barrier_t *, + gomp_barrier_state_t); ++extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *); ++extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *, ++ gomp_barrier_state_t); + extern void gomp_team_barrier_wake (gomp_barrier_t *, int); ++struct gomp_team; ++extern void gomp_team_barrier_cancel (struct gomp_team *); + + static inline gomp_barrier_state_t + gomp_barrier_wait_start (gomp_barrier_t *bar) + { +- unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) & ~3; ++ unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); ++ ret &= -BAR_INCR | BAR_CANCELLED; + /* A memory barrier is needed before exiting from the various forms + of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section + 2.8.6 flush Construct, which says there is an implicit flush during + a barrier region. This is a convenient place to add the barrier, + so we use MEMMODEL_ACQ_REL here rather than MEMMODEL_ACQUIRE. */ +- ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0; ++ if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0) ++ ret |= BAR_WAS_LAST; ++ return ret; ++} ++ ++static inline gomp_barrier_state_t ++gomp_barrier_wait_cancel_start (gomp_barrier_t *bar) ++{ ++ return gomp_barrier_wait_start (bar); ++} ++ ++/* This is like gomp_barrier_wait_start, except it decrements ++ bar->awaited_final rather than bar->awaited and should be used ++ for the gomp_team_end barrier only. */ ++static inline gomp_barrier_state_t ++gomp_barrier_wait_final_start (gomp_barrier_t *bar) ++{ ++ unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); ++ ret &= -BAR_INCR | BAR_CANCELLED; ++ /* See above gomp_barrier_wait_start comment. 
*/ ++ if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_ACQ_REL) == 0) ++ ret |= BAR_WAS_LAST; + return ret; + } + + static inline bool + gomp_barrier_last_thread (gomp_barrier_state_t state) + { +- return state & 1; ++ return state & BAR_WAS_LAST; + } + + /* All the inlines below must be called with team->task_lock +@@ -91,31 +131,37 @@ gomp_barrier_last_thread (gomp_barrier_s + static inline void + gomp_team_barrier_set_task_pending (gomp_barrier_t *bar) + { +- bar->generation |= 1; ++ bar->generation |= BAR_TASK_PENDING; + } + + static inline void + gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar) + { +- bar->generation &= ~1; ++ bar->generation &= ~BAR_TASK_PENDING; + } + + static inline void + gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar) + { +- bar->generation |= 2; ++ bar->generation |= BAR_WAITING_FOR_TASK; + } + + static inline bool + gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar) + { +- return (bar->generation & 2) != 0; ++ return (bar->generation & BAR_WAITING_FOR_TASK) != 0; ++} ++ ++static inline bool ++gomp_team_barrier_cancelled (gomp_barrier_t *bar) ++{ ++ return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0); + } + + static inline void + gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state) + { +- bar->generation = (state & ~3) + 4; ++ bar->generation = (state & -BAR_INCR) + BAR_INCR; + } + + #endif /* GOMP_BARRIER_H */ +--- libgomp/config/posix/bar.c (revision 210461) ++++ libgomp/config/posix/bar.c (revision 210462) +@@ -42,6 +42,7 @@ gomp_barrier_init (gomp_barrier_t *bar, + bar->total = count; + bar->arrived = 0; + bar->generation = 0; ++ bar->cancellable = false; + } + + void +@@ -72,7 +73,7 @@ gomp_barrier_wait_end (gomp_barrier_t *b + { + unsigned int n; + +- if (state & 1) ++ if (state & BAR_WAS_LAST) + { + n = --bar->arrived; + if (n > 0) +@@ -113,12 +114,14 @@ gomp_team_barrier_wait_end (gomp_barrier + { + unsigned int n; + +- if (state & 1) ++ state &= ~BAR_CANCELLED; ++ if (state & BAR_WAS_LAST) + { + n = --bar->arrived; + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + ++ team->work_share_cancelled = 0; + if (team->task_count) + { + gomp_barrier_handle_tasks (state); +@@ -128,7 +131,7 @@ gomp_team_barrier_wait_end (gomp_barrier + return; + } + +- bar->generation = state + 3; ++ bar->generation = state + BAR_INCR - BAR_WAS_LAST; + if (n > 0) + { + do +@@ -141,13 +144,18 @@ gomp_team_barrier_wait_end (gomp_barrier + else + { + gomp_mutex_unlock (&bar->mutex1); ++ int gen; + do + { + gomp_sem_wait (&bar->sem1); +- if (bar->generation & 1) +- gomp_barrier_handle_tasks (state); ++ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); ++ if (gen & BAR_TASK_PENDING) ++ { ++ gomp_barrier_handle_tasks (state); ++ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); ++ } + } +- while (bar->generation != state + 4); ++ while (gen != state + BAR_INCR); + + #ifdef HAVE_SYNC_BUILTINS + n = __sync_add_and_fetch (&bar->arrived, -1); +@@ -162,6 +170,81 @@ gomp_team_barrier_wait_end (gomp_barrier + } + } + ++bool ++gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar, ++ gomp_barrier_state_t state) ++{ ++ unsigned int n; ++ ++ if (state & BAR_WAS_LAST) ++ { ++ bar->cancellable = false; ++ n = --bar->arrived; ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ ++ team->work_share_cancelled = 0; ++ if (team->task_count) ++ { ++ gomp_barrier_handle_tasks (state); ++ if (n > 0) ++ gomp_sem_wait (&bar->sem2); ++ 
gomp_mutex_unlock (&bar->mutex1); ++ return false; ++ } ++ ++ bar->generation = state + BAR_INCR - BAR_WAS_LAST; ++ if (n > 0) ++ { ++ do ++ gomp_sem_post (&bar->sem1); ++ while (--n != 0); ++ gomp_sem_wait (&bar->sem2); ++ } ++ gomp_mutex_unlock (&bar->mutex1); ++ } ++ else ++ { ++ if (state & BAR_CANCELLED) ++ { ++ gomp_mutex_unlock (&bar->mutex1); ++ return true; ++ } ++ bar->cancellable = true; ++ gomp_mutex_unlock (&bar->mutex1); ++ int gen; ++ do ++ { ++ gomp_sem_wait (&bar->sem1); ++ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); ++ if (gen & BAR_CANCELLED) ++ break; ++ if (gen & BAR_TASK_PENDING) ++ { ++ gomp_barrier_handle_tasks (state); ++ gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); ++ if (gen & BAR_CANCELLED) ++ break; ++ } ++ } ++ while (gen != state + BAR_INCR); ++ ++#ifdef HAVE_SYNC_BUILTINS ++ n = __sync_add_and_fetch (&bar->arrived, -1); ++#else ++ gomp_mutex_lock (&bar->mutex2); ++ n = --bar->arrived; ++ gomp_mutex_unlock (&bar->mutex2); ++#endif ++ ++ if (n == 0) ++ gomp_sem_post (&bar->sem2); ++ if (gen & BAR_CANCELLED) ++ return true; ++ } ++ return false; ++} ++ + void + gomp_team_barrier_wait (gomp_barrier_t *barrier) + { +@@ -176,3 +259,40 @@ gomp_team_barrier_wake (gomp_barrier_t * + while (count-- > 0) + gomp_sem_post (&bar->sem1); + } ++ ++bool ++gomp_team_barrier_wait_cancel (gomp_barrier_t *bar) ++{ ++ gomp_barrier_state_t state = gomp_barrier_wait_cancel_start (bar); ++ return gomp_team_barrier_wait_cancel_end (bar, state); ++} ++ ++void ++gomp_team_barrier_cancel (struct gomp_team *team) ++{ ++ if (team->barrier.generation & BAR_CANCELLED) ++ return; ++ gomp_mutex_lock (&team->barrier.mutex1); ++ gomp_mutex_lock (&team->task_lock); ++ if (team->barrier.generation & BAR_CANCELLED) ++ { ++ gomp_mutex_unlock (&team->task_lock); ++ gomp_mutex_unlock (&team->barrier.mutex1); ++ return; ++ } ++ team->barrier.generation |= BAR_CANCELLED; ++ gomp_mutex_unlock (&team->task_lock); ++ if (team->barrier.cancellable) ++ { ++ int n = team->barrier.arrived; ++ if (n > 0) ++ { ++ do ++ gomp_sem_post (&team->barrier.sem1); ++ while (--n != 0); ++ gomp_sem_wait (&team->barrier.sem2); ++ } ++ team->barrier.cancellable = false; ++ } ++ gomp_mutex_unlock (&team->barrier.mutex1); ++} +--- libgomp/config/posix/affinity.c (revision 210461) ++++ libgomp/config/posix/affinity.c (revision 210462) +@@ -32,7 +32,84 @@ gomp_init_affinity (void) + } + + void +-gomp_init_thread_affinity (pthread_attr_t *attr) ++gomp_init_thread_affinity (pthread_attr_t *attr, unsigned int place) + { + (void) attr; ++ (void) place; ++} ++ ++void ** ++gomp_affinity_alloc (unsigned long count, bool quiet) ++{ ++ (void) count; ++ if (!quiet) ++ gomp_error ("Affinity not supported on this configuration"); ++ return NULL; ++} ++ ++void ++gomp_affinity_init_place (void *p) ++{ ++ (void) p; ++} ++ ++bool ++gomp_affinity_add_cpus (void *p, unsigned long num, ++ unsigned long len, long stride, bool quiet) ++{ ++ (void) p; ++ (void) num; ++ (void) len; ++ (void) stride; ++ (void) quiet; ++ return false; ++} ++ ++bool ++gomp_affinity_remove_cpu (void *p, unsigned long num) ++{ ++ (void) p; ++ (void) num; ++ return false; ++} ++ ++bool ++gomp_affinity_copy_place (void *p, void *q, long stride) ++{ ++ (void) p; ++ (void) q; ++ (void) stride; ++ return false; ++} ++ ++bool ++gomp_affinity_same_place (void *p, void *q) ++{ ++ (void) p; ++ (void) q; ++ return false; ++} ++ ++bool ++gomp_affinity_finalize_place_list (bool quiet) ++{ ++ (void) quiet; ++ return false; ++} ++ ++bool 
++gomp_affinity_init_level (int level, unsigned long count, bool quiet) ++{ ++ (void) level; ++ (void) count; ++ (void) quiet; ++ if (!quiet) ++ gomp_error ("Affinity not supported on this configuration"); ++ return NULL; ++} ++ ++void ++gomp_affinity_print_place (void *p) ++{ ++ (void) p; + } +--- libgomp/config/posix/bar.h (revision 210461) ++++ libgomp/config/posix/bar.h (revision 210462) +@@ -43,9 +43,20 @@ typedef struct + unsigned total; + unsigned arrived; + unsigned generation; ++ bool cancellable; + } gomp_barrier_t; ++ + typedef unsigned int gomp_barrier_state_t; + ++/* The generation field contains a counter in the high bits, with a few ++ low bits dedicated to flags. Note that TASK_PENDING and WAS_LAST can ++ share space because WAS_LAST is never stored back to generation. */ ++#define BAR_TASK_PENDING 1 ++#define BAR_WAS_LAST 1 ++#define BAR_WAITING_FOR_TASK 2 ++#define BAR_CANCELLED 4 ++#define BAR_INCR 8 ++ + extern void gomp_barrier_init (gomp_barrier_t *, unsigned); + extern void gomp_barrier_reinit (gomp_barrier_t *, unsigned); + extern void gomp_barrier_destroy (gomp_barrier_t *); +@@ -55,22 +66,47 @@ extern void gomp_barrier_wait_end (gomp_ + extern void gomp_team_barrier_wait (gomp_barrier_t *); + extern void gomp_team_barrier_wait_end (gomp_barrier_t *, + gomp_barrier_state_t); ++extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *); ++extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *, ++ gomp_barrier_state_t); + extern void gomp_team_barrier_wake (gomp_barrier_t *, int); ++struct gomp_team; ++extern void gomp_team_barrier_cancel (struct gomp_team *); + + static inline gomp_barrier_state_t + gomp_barrier_wait_start (gomp_barrier_t *bar) + { + unsigned int ret; + gomp_mutex_lock (&bar->mutex1); +- ret = bar->generation & ~3; +- ret += ++bar->arrived == bar->total; ++ ret = bar->generation & (-BAR_INCR | BAR_CANCELLED); ++ if (++bar->arrived == bar->total) ++ ret |= BAR_WAS_LAST; ++ return ret; ++} ++ ++static inline gomp_barrier_state_t ++gomp_barrier_wait_cancel_start (gomp_barrier_t *bar) ++{ ++ unsigned int ret; ++ gomp_mutex_lock (&bar->mutex1); ++ ret = bar->generation & (-BAR_INCR | BAR_CANCELLED); ++ if (ret & BAR_CANCELLED) ++ return ret; ++ if (++bar->arrived == bar->total) ++ ret |= BAR_WAS_LAST; + return ret; + } + ++static inline void ++gomp_team_barrier_wait_final (gomp_barrier_t *bar) ++{ ++ gomp_team_barrier_wait (bar); ++} ++ + static inline bool + gomp_barrier_last_thread (gomp_barrier_state_t state) + { +- return state & 1; ++ return state & BAR_WAS_LAST; + } + + static inline void +@@ -85,31 +121,37 @@ gomp_barrier_wait_last (gomp_barrier_t * + static inline void + gomp_team_barrier_set_task_pending (gomp_barrier_t *bar) + { +- bar->generation |= 1; ++ bar->generation |= BAR_TASK_PENDING; + } + + static inline void + gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar) + { +- bar->generation &= ~1; ++ bar->generation &= ~BAR_TASK_PENDING; + } + + static inline void + gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar) + { +- bar->generation |= 2; ++ bar->generation |= BAR_WAITING_FOR_TASK; + } + + static inline bool + gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar) + { +- return (bar->generation & 2) != 0; ++ return (bar->generation & BAR_WAITING_FOR_TASK) != 0; ++} ++ ++static inline bool ++gomp_team_barrier_cancelled (gomp_barrier_t *bar) ++{ ++ return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0); + } + + static inline void + gomp_team_barrier_done (gomp_barrier_t *bar, 
gomp_barrier_state_t state) + { +- bar->generation = (state & ~3) + 4; ++ bar->generation = (state & -BAR_INCR) + BAR_INCR; + } + + #endif /* GOMP_BARRIER_H */ +--- libgomp/barrier.c (revision 210461) ++++ libgomp/barrier.c (revision 210462) +@@ -39,3 +39,15 @@ GOMP_barrier (void) + + gomp_team_barrier_wait (&team->barrier); + } ++ ++bool ++GOMP_barrier_cancel (void) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ ++ /* The compiler transforms to barrier_cancel when it sees that the ++ barrier is within a construct that can cancel. Thus we should ++ never have an orphaned cancellable barrier. */ ++ return gomp_team_barrier_wait_cancel (&team->barrier); ++} +--- libgomp/target.c (revision 0) ++++ libgomp/target.c (revision 210462) +@@ -0,0 +1,96 @@ ++/* Copyright (C) 2013 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek . ++ ++ This file is part of the GNU OpenMP Library (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . */ ++ ++/* This file handles the maintainence of threads in response to team ++ creation and termination. */ ++ ++#include "libgomp.h" ++#include ++#include ++#include ++#include ++ ++attribute_hidden int ++gomp_get_num_devices (void) ++{ ++ return 0; ++} ++ ++/* Called when encountering a target directive. If DEVICE ++ is -1, it means use device-var ICV. If it is -2 (or any other value ++ larger than last available hw device, use host fallback. ++ FN is address of host code, OPENMP_TARGET contains value of the ++ __OPENMP_TARGET__ symbol in the shared library or binary that invokes ++ GOMP_target. HOSTADDRS, SIZES and KINDS are arrays ++ with MAPNUM entries, with addresses of the host objects, ++ sizes of the host objects (resp. for pointer kind pointer bias ++ and assumed sizeof (void *) size) and kinds. */ ++ ++void ++GOMP_target (int device, void (*fn) (void *), const void *openmp_target, ++ size_t mapnum, void **hostaddrs, size_t *sizes, ++ unsigned char *kinds) ++{ ++ /* Host fallback. 
*/ ++ struct gomp_thread old_thr, *thr = gomp_thread (); ++ old_thr = *thr; ++ memset (thr, '\0', sizeof (*thr)); ++ if (gomp_places_list) ++ { ++ thr->place = old_thr.place; ++ thr->ts.place_partition_len = gomp_places_list_len; ++ } ++ fn (hostaddrs); ++ gomp_free_thread (thr); ++ *thr = old_thr; ++} ++ ++void ++GOMP_target_data (int device, const void *openmp_target, size_t mapnum, ++ void **hostaddrs, size_t *sizes, unsigned char *kinds) ++{ ++} ++ ++void ++GOMP_target_end_data (void) ++{ ++} ++ ++void ++GOMP_target_update (int device, const void *openmp_target, size_t mapnum, ++ void **hostaddrs, size_t *sizes, unsigned char *kinds) ++{ ++} ++ ++void ++GOMP_teams (unsigned int num_teams, unsigned int thread_limit) ++{ ++ if (thread_limit) ++ { ++ struct gomp_task_icv *icv = gomp_icv (true); ++ icv->thread_limit_var ++ = thread_limit > INT_MAX ? UINT_MAX : thread_limit; ++ } ++ (void) num_teams; ++} +--- libgomp/parallel.c (revision 210461) ++++ libgomp/parallel.c (revision 210462) +@@ -37,18 +37,19 @@ + unsigned + gomp_resolve_num_threads (unsigned specified, unsigned count) + { +- struct gomp_thread *thread = gomp_thread(); ++ struct gomp_thread *thr = gomp_thread (); + struct gomp_task_icv *icv; + unsigned threads_requested, max_num_threads, num_threads; +- unsigned long remaining; ++ unsigned long busy; ++ struct gomp_thread_pool *pool; + + icv = gomp_icv (false); + + if (specified == 1) + return 1; +- else if (thread->ts.active_level >= 1 && !icv->nest_var) ++ else if (thr->ts.active_level >= 1 && !icv->nest_var) + return 1; +- else if (thread->ts.active_level >= gomp_max_active_levels_var) ++ else if (thr->ts.active_level >= gomp_max_active_levels_var) + return 1; + + /* If NUM_THREADS not specified, use nthreads_var. */ +@@ -72,30 +73,46 @@ gomp_resolve_num_threads (unsigned speci + max_num_threads = count; + } + +- /* ULONG_MAX stands for infinity. */ +- if (__builtin_expect (gomp_thread_limit_var == ULONG_MAX, 1) ++ /* UINT_MAX stands for infinity. */ ++ if (__builtin_expect (icv->thread_limit_var == UINT_MAX, 1) + || max_num_threads == 1) + return max_num_threads; + ++ /* The threads_busy counter lives in thread_pool, if there ++ isn't a thread_pool yet, there must be just one thread ++ in the contention group. If thr->team is NULL, this isn't ++ nested parallel, so there is just one thread in the ++ contention group as well, no need to handle it atomically. 
*/ ++ pool = thr->thread_pool; ++ if (thr->ts.team == NULL) ++ { ++ num_threads = max_num_threads; ++ if (num_threads > icv->thread_limit_var) ++ num_threads = icv->thread_limit_var; ++ if (pool) ++ pool->threads_busy = num_threads; ++ return num_threads; ++ } ++ + #ifdef HAVE_SYNC_BUILTINS + do + { +- remaining = gomp_remaining_threads_count; ++ busy = pool->threads_busy; + num_threads = max_num_threads; +- if (num_threads > remaining) +- num_threads = remaining + 1; ++ if (icv->thread_limit_var - busy + 1 < num_threads) ++ num_threads = icv->thread_limit_var - busy + 1; + } +- while (__sync_val_compare_and_swap (&gomp_remaining_threads_count, +- remaining, remaining - num_threads + 1) +- != remaining); ++ while (__sync_val_compare_and_swap (&pool->threads_busy, ++ busy, busy + num_threads - 1) ++ != busy); + #else +- gomp_mutex_lock (&gomp_remaining_threads_lock); ++ gomp_mutex_lock (&gomp_managed_threads_lock); + num_threads = max_num_threads; +- remaining = gomp_remaining_threads_count; +- if (num_threads > remaining) +- num_threads = remaining + 1; +- gomp_remaining_threads_count -= num_threads - 1; +- gomp_mutex_unlock (&gomp_remaining_threads_lock); ++ busy = pool->threads_busy; ++ if (icv->thread_limit_var - busy + 1 < num_threads) ++ num_threads = icv->thread_limit_var - busy + 1; ++ pool->threads_busy += num_threads - 1; ++ gomp_mutex_unlock (&gomp_managed_threads_lock); + #endif + + return num_threads; +@@ -105,13 +122,14 @@ void + GOMP_parallel_start (void (*fn) (void *), void *data, unsigned num_threads) + { + num_threads = gomp_resolve_num_threads (num_threads, 0); +- gomp_team_start (fn, data, num_threads, gomp_new_team (num_threads)); ++ gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads)); + } + + void + GOMP_parallel_end (void) + { +- if (__builtin_expect (gomp_thread_limit_var != ULONG_MAX, 0)) ++ struct gomp_task_icv *icv = gomp_icv (false); ++ if (__builtin_expect (icv->thread_limit_var != UINT_MAX, 0)) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; +@@ -119,20 +137,98 @@ GOMP_parallel_end (void) + gomp_team_end (); + if (nthreads > 1) + { ++ /* If not nested, there is just one thread in the ++ contention group left, no need for atomicity. 
*/ ++ if (thr->ts.team == NULL) ++ thr->thread_pool->threads_busy = 1; ++ else ++ { + #ifdef HAVE_SYNC_BUILTINS +- __sync_fetch_and_add (&gomp_remaining_threads_count, +- nthreads - 1); ++ __sync_fetch_and_add (&thr->thread_pool->threads_busy, ++ 1UL - nthreads); + #else +- gomp_mutex_lock (&gomp_remaining_threads_lock); +- gomp_remaining_threads_count += nthreads - 1; +- gomp_mutex_unlock (&gomp_remaining_threads_lock); ++ gomp_mutex_lock (&gomp_managed_threads_lock); ++ thr->thread_pool->threads_busy -= nthreads - 1; ++ gomp_mutex_unlock (&gomp_managed_threads_lock); + #endif ++ } + } + } + else + gomp_team_end (); + } ++ialias (GOMP_parallel_end) ++ ++void ++GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, unsigned int flags) ++{ ++ num_threads = gomp_resolve_num_threads (num_threads, 0); ++ gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads)); ++ fn (data); ++ ialias_call (GOMP_parallel_end) (); ++} ++ ++bool ++GOMP_cancellation_point (int which) ++{ ++ if (!gomp_cancel_var) ++ return false; + ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ if (which & (GOMP_CANCEL_LOOP | GOMP_CANCEL_SECTIONS)) ++ { ++ if (team == NULL) ++ return false; ++ return team->work_share_cancelled != 0; ++ } ++ else if (which & GOMP_CANCEL_TASKGROUP) ++ { ++ if (thr->task->taskgroup && thr->task->taskgroup->cancelled) ++ return true; ++ /* FALLTHRU into the GOMP_CANCEL_PARALLEL case, ++ as #pragma omp cancel parallel also cancels all explicit ++ tasks. */ ++ } ++ if (team) ++ return gomp_team_barrier_cancelled (&team->barrier); ++ return false; ++} ++ialias (GOMP_cancellation_point) ++ ++bool ++GOMP_cancel (int which, bool do_cancel) ++{ ++ if (!gomp_cancel_var) ++ return false; ++ ++ if (!do_cancel) ++ return ialias_call (GOMP_cancellation_point) (which); ++ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ if (which & (GOMP_CANCEL_LOOP | GOMP_CANCEL_SECTIONS)) ++ { ++ /* In orphaned worksharing region, all we want to cancel ++ is current thread. */ ++ if (team != NULL) ++ team->work_share_cancelled = 1; ++ return true; ++ } ++ else if (which & GOMP_CANCEL_TASKGROUP) ++ { ++ if (thr->task->taskgroup && !thr->task->taskgroup->cancelled) ++ { ++ gomp_mutex_lock (&team->task_lock); ++ thr->task->taskgroup->cancelled = true; ++ gomp_mutex_unlock (&team->task_lock); ++ } ++ return true; ++ } ++ team->team_cancelled = 1; ++ gomp_team_barrier_cancel (team); ++ return true; ++} + + /* The public OpenMP API for thread and team related inquiries. */ + diff --git a/SPECS/gcc.spec b/SPECS/gcc.spec index 7eb2349..2c3f959 100644 --- a/SPECS/gcc.spec +++ b/SPECS/gcc.spec @@ -75,7 +75,7 @@ Summary: Various compilers (C, C++, Objective-C, Java, ...) Name: gcc Version: %{gcc_version} -Release: %{gcc_release}%{?dist} +Release: %{gcc_release}.2%{?dist} # libgcc, libgfortran, libmudflap, libgomp, libstdc++ and crtstuff have # GCC Runtime Exception. 
License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD @@ -211,6 +211,8 @@ Patch21: gcc48-pr57896.patch Patch22: gcc48-pr60272.patch Patch23: gcc48-pr60233.patch Patch24: gcc48-pr60274.patch +Patch25: gcc48-rh1121077.patch +Patch26: gcc48-pr61801.patch Patch1000: fastjar-0.97-segfault.patch Patch1001: fastjar-0.97-len1.patch @@ -780,6 +782,8 @@ rm -f libgo/go/crypto/elliptic/p224{,_test}.go %patch22 -p0 -b .pr60272~ %patch23 -p0 -b .pr60233~ %patch24 -p0 -b .pr60274~ +%patch25 -p0 -b .rh1121077~ +%patch26 -p0 -b .pr61801~ %if 0%{?_enable_debug_packages} cat > split-debuginfo.sh <<\EOF @@ -3052,6 +3056,15 @@ fi %{_prefix}/libexec/gcc/%{gcc_target_platform}/%{gcc_version}/plugin %changelog +* Wed Aug 6 2014 Jakub Jelinek 4.8.2-16.2 +- backport two further OpenMP 4.0 libgomp tasking fixes (#1121077) +- fix scheduler wrong-code with DEBUG_INSNs containing volatile ASM_OPERANDS + (#1127120, PR rtl-optimization/61801) + +* Fri Jul 18 2014 Jakub Jelinek 4.8.2-16.1 +- backport OpenMP 4.0 support to libgomp (library only; #1121077, + PR libgomp/58691) + * Mon Mar 3 2014 Jakub Jelinek 4.8.2-16 - fix up compare_exchange_* in libatomic too (PR c++/60272)
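
For reference, the libgomp changes above (taskgroup bookkeeping in GOMP_taskgroup_end, cancellable barriers, and the GOMP_cancel/GOMP_cancellation_point entry points) back the OpenMP 4.0 taskgroup and cancellation constructs named in the changelog. The sketch below is illustrative only and not part of the patch or the spec; it assumes a compiler front end with OpenMP 4.0 pragma support (the changelog notes this backport is library-only) and that cancellation is enabled at run time with OMP_CANCELLATION=true.

/* Illustrative sketch only (not part of the patch): an OpenMP 4.0
   taskgroup with task cancellation, the constructs served by the
   backported taskgroup and GOMP_cancel runtime entry points.
   Assumes an OpenMP 4.0 capable compiler and OMP_CANCELLATION=true.  */
#include <stdio.h>

int
main (void)
{
  int found = -1;

#pragma omp parallel
#pragma omp single
#pragma omp taskgroup
  {
    int i;
    for (i = 0; i < 1024; i++)
#pragma omp task firstprivate (i) shared (found)
      {
        /* Each task checks for a pending cancellation before working.  */
#pragma omp cancellation point taskgroup
        if (i == 17)
          {
            found = i;
            /* Cancel the remaining tasks of the enclosing taskgroup.  */
#pragma omp cancel taskgroup
          }
      }
  }

  printf ("found = %d\n", found);
  return 0;
}

Without OMP_CANCELLATION=true in the environment, gomp_cancel_var stays false, GOMP_cancel and GOMP_cancellation_point return false (as in the parallel.c hunk above), and all tasks simply run to completion.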