diff --git a/SOURCES/gcc48-pr61801.patch b/SOURCES/gcc48-pr61801.patch
new file mode 100644
index 0000000..b75696e
--- /dev/null
+++ b/SOURCES/gcc48-pr61801.patch
@@ -0,0 +1,58 @@
+2014-07-17  Richard Biener  <rguenther@suse.de>
+
+	PR rtl-optimization/61801
+	* sched-deps.c (sched_analyze_2): For ASM_OPERANDS and
+	ASM_INPUT don't set reg_pending_barrier if it appears in a
+	debug-insn.
+
+2014-08-06  Jakub Jelinek  <jakub@redhat.com>
+
+	PR rtl-optimization/61801
+	* gcc.target/i386/pr61801.c: Rewritten.
+
+2014-07-28  Richard Biener  <rguenther@suse.de>
+
+	PR rtl-optimization/61801
+	* gcc.target/i386/pr61801.c: Fix testcase.
+
+2014-07-28  Richard Biener  <rguenther@suse.de>
+
+	PR rtl-optimization/61801
+	* gcc.target/i386/pr61801.c: New testcase.
+
+--- gcc/sched-deps.c	(revision 212737)
++++ gcc/sched-deps.c	(revision 212738)
+@@ -2750,7 +2750,8 @@ sched_analyze_2 (struct deps_desc *deps,
+ 	   Consider for instance a volatile asm that changes the fpu rounding
+ 	   mode.  An insn should not be moved across this even if it only uses
+ 	   pseudo-regs because it might give an incorrectly rounded result.  */
+-	if (code != ASM_OPERANDS || MEM_VOLATILE_P (x))
++	if ((code != ASM_OPERANDS || MEM_VOLATILE_P (x))
++	    && !DEBUG_INSN_P (insn))
+ 	  reg_pending_barrier = TRUE_BARRIER;
+ 
+ 	/* For all ASM_OPERANDS, we must traverse the vector of input operands.
+--- gcc/testsuite/gcc.target/i386/pr61801.c	(revision 0)
++++ gcc/testsuite/gcc.target/i386/pr61801.c	(revision 213654)
+@@ -0,0 +1,21 @@
++/* PR rtl-optimization/61801 */
++/* { dg-do compile } */
++/* { dg-options "-Os -fcompare-debug" } */
++
++int a, c;
++int bar (void);
++void baz (void);
++
++void
++foo (void)
++{
++  int d;
++  if (bar ())
++    {
++      int e;
++      baz ();
++      asm volatile ("" : "=a" (e) : "0" (a), "i" (0));
++      d = e;
++    }
++  c = d;
++}
diff --git a/SOURCES/gcc48-rh1121077.patch b/SOURCES/gcc48-rh1121077.patch
new file mode 100644
index 0000000..a610217
--- /dev/null
+++ b/SOURCES/gcc48-rh1121077.patch
@@ -0,0 +1,5990 @@
+2014-08-04  Jakub Jelinek  <jakub@redhat.com>
+
+	* task.c (GOMP_taskgroup_end): If taskgroup->num_children
+	is not zero, but taskgroup->children is NULL and there are
+	any task->children, schedule those instead of waiting.
+
+2014-08-01  Jakub Jelinek  <jakub@redhat.com>
+
+	* libgomp.h (struct gomp_task_depend_entry): Add redundant_out field.
+	(struct gomp_taskwait): New type.
+	(struct gomp_task): Add taskwait and parent_depends_on, remove
+	in_taskwait and taskwait_sem fields.
+	(gomp_finish_task): Don't destroy taskwait_sem.
+	* task.c (gomp_init_task): Don't init in_taskwait, instead init
+	taskwait and parent_depends_on.
+	(GOMP_task): For if (0) tasks with depend clause that depend on
+	earlier tasks don't defer them, instead call
+	gomp_task_maybe_wait_for_dependencies to wait for the dependencies.
+	Initialize redundant_out field, for redundant out entries just
+	move them to the end of the linked list instead of removing them
+	completely, and set redundant_out flag instead of redundant.
+	(gomp_task_run_pre): Update last_parent_depends_on if scheduling
+	that task.
+	(gomp_task_run_post_handle_dependers): If parent is in
+	gomp_task_maybe_wait_for_dependencies and newly runnable task
+	is not parent_depends_on, queue it in parent->children linked
+	list after all runnable tasks with parent_depends_on set.
+	Adjust for addition of taskwait indirection.
+	(gomp_task_run_post_remove_parent): If parent is in
+	gomp_task_maybe_wait_for_dependencies and task to be removed
+	is parent_depends_on, decrement n_depend and if needed awake
+	parent.  Adjust for addition of taskwait indirection.
+	(GOMP_taskwait): Adjust for addition of taskwait indirection.
+	(gomp_task_maybe_wait_for_dependencies): New function.
+
+2013-10-14  Jakub Jelinek  <jakub@redhat.com>
+
+	* env.c (parse_bind_var): Initialize value to avoid
+	(false positive) warning.
+
+2013-10-12  Jakub Jelinek  <jakub@redhat.com>
+
+	PR libgomp/58691
+	* config/linux/proc.c (gomp_cpuset_popcount): Add unused attribute
+	to check variable.
+	(gomp_init_num_threads): Move i variable declaration into
+	#ifdef CPU_ALLOC_SIZE block.
+	* config/linux/affinity.c (gomp_affinity_init_level): Test
+	gomp_places_list_len == 0 rather than gomp_places_list == 0
+	when checking for topology reading error.
+	* team.c (gomp_team_start): Don't handle bind == omp_proc_bind_false.
+	* env.c (parse_affinity): Add ignore argument, if true, don't populate
+	gomp_places_list, only parse env var and always return false.
+	(parse_places_var): Likewise.  Don't check gomp_global_icv.bind_var.
+	(initialize_env): Always parse OMP_PLACES and GOMP_CPU_AFFINITY env
+	vars, default to OMP_PROC_BIND=true if OMP_PROC_BIND wasn't specified
+	and either of these variables were parsed correctly into a places
+	list.
+
+2013-10-11  Thomas Schwinge  <thomas@codesourcery.com>
+
+        * testsuite/libgomp.c/lib-1.c (main): Add missing error check.
+        * testsuite/libgomp.fortran/lib1.f90: Likewise.
+        * testsuite/libgomp.fortran/lib2.f: Likewise.
+        * testsuite/libgomp.fortran/lib3.f: Likewise.
+
+2013-10-11  Jakub Jelinek  <jakub@redhat.com>
+	    Tobias Burnus  <burnus@net-b.de>
+	    Richard Henderson  <rth@redhat.com>
+
+	* target.c: New file.
+	* Makefile.am (libgomp_la_SOURCES): Add target.c.
+	* Makefile.in: Regenerated.
+	* libgomp_g.h (GOMP_task): Add depend argument.
+	(GOMP_barrier_cancel, GOMP_loop_end_cancel,
+	GOMP_sections_end_cancel, GOMP_target, GOMP_target_data,
+	GOMP_target_end_data, GOMP_target_update, GOMP_teams,
+	GOMP_parallel_loop_static, GOMP_parallel_loop_dynamic,
+	GOMP_parallel_loop_guided, GOMP_parallel_loop_runtime,
+	GOMP_parallel, GOMP_cancel, GOMP_cancellation_point,
+	GOMP_taskgroup_start, GOMP_taskgroup_end,
+	GOMP_parallel_sections): New prototypes.
+	* fortran.c (omp_is_initial_device): Add ialias_redirect.
+	(omp_is_initial_device_): New function.
+	(ULP, STR1, STR2, ialias_redirect): Removed.
+	(omp_get_cancellation_, omp_get_proc_bind_, omp_set_default_device_,
+	omp_set_default_device_8_, omp_get_default_device_,
+	omp_get_num_devices_, omp_get_num_teams_, omp_get_team_num_): New
+	functions.
+	* libgomp.map (GOMP_barrier_cancel, GOMP_loop_end_cancel,
+	GOMP_sections_end_cancel, GOMP_target, GOMP_target_data,
+	GOMP_target_end_data, GOMP_target_update, GOMP_teams): Export
+	@@GOMP_4.0.
+	(omp_is_initial_device, omp_is_initial_device_, omp_get_cancellation,
+	omp_get_cancellation_, omp_get_proc_bind, omp_get_proc_bind_,
+	omp_set_default_device, omp_set_default_device_,
+	omp_set_default_device_8_, omp_get_default_device,
+	omp_get_default_device_, omp_get_num_devices, omp_get_num_devices_,
+	omp_get_num_teams, omp_get_num_teams_, omp_get_team_num,
+	omp_get_team_num_): Export @@OMP_4.0.
+	* team.c (struct gomp_thread_start_data): Add place field.
+	(gomp_thread_start): Clear thr->thread_pool and
+	thr->task before returning.  Use gomp_team_barrier_wait_final
+	instead of gomp_team_barrier_wait.  Initialize thr->place.
+	(gomp_new_team): Initialize work_shares_to_free, work_share_cancelled,
+	team_cancelled and task_queued_count fields.
+	(gomp_free_pool_helper): Clear thr->thread_pool and thr->task
+	before calling pthread_exit.
+	(gomp_free_thread): No longer static.  Use
+	gomp_managed_threads_lock instead of gomp_remaining_threads_lock.
+	(gomp_team_start): Add flags argument.  Set
+	thr->thread_pool->threads_busy to nthreads immediately after creating
+	new pool.  Use gomp_managed_threads_lock instead of
+	gomp_remaining_threads_lock.  Handle OpenMP 4.0 affinity.
+	(gomp_team_end): Use gomp_managed_threads_lock instead of
+	gomp_remaining_threads_lock.  Use gomp_team_barrier_wait_final instead
+	of gomp_team_barrier_wait.  If team->team_cancelled, call
+	gomp_fini_work_share on ws chain starting at team->work_shares_to_free
+	rather than thr->ts.work_share.
+	(initialize_team): Don't call gomp_sem_init here.
+	* sections.c (GOMP_parallel_sections_start): Adjust gomp_team_start
+	caller.
+	(GOMP_parallel_sections, GOMP_sections_end_cancel): New functions.
+	* env.c (gomp_global_icv): Add default_device_var, target_data and
+	bind_var initializers.
+	(gomp_cpu_affinity, gomp_cpu_affinity_len): Remove.
+	(gomp_bind_var_list, gomp_bind_var_list_len, gomp_places_list,
+	gomp_places_list_len): New variables.
+	(parse_bind_var, parse_one_place, parse_places_var): New functions.
+	(parse_affinity): Rewritten to construct OMP_PLACES list with unit
+	sized places.
+	(gomp_cancel_var): New global variable.
+	(parse_int): New function.
+	(handle_omp_display_env): New function.
+	(initialize_env): Use it.  Initialize default_device_var.
+	Parse OMP_CANCELLATION env var.  Use parse_bind_var to parse
+	OMP_PROC_BIND instead of parse_boolean.  Use parse_places_var for
+	OMP_PLACES parsing.  Don't call parse_affinity if OMP_PLACES has
+	been successfully parsed (and call gomp_init_affinity in that case).
+	(omp_get_cancellation, omp_get_proc_bind, omp_set_default_device,
+	omp_get_default_device, omp_get_num_devices, omp_get_num_teams,
+	omp_get_team_num, omp_is_initial_device): New functions.
+	* libgomp.h: Include stdlib.h.
+	(ialias_ulp, ialias_str1, ialias_str2, ialias_redirect, ialias_call):
+	Define.
+	(struct target_mem_desc): Forward declare.
+	(struct gomp_task_icv): Add default_device_var, target_data, bind_var
+	and thread_limit_var fields.
+	(gomp_get_num_devices): New prototype.
+	(gomp_cancel_var): New extern decl.
+	(struct gomp_team): Add work_shares_to_free, work_share_cancelled,
+	team_cancelled and task_queued_count fields.  Add comments about
+	task_{,queued_,running_}count.
+	(gomp_cancel_kind): New enum.
+	(gomp_work_share_end_cancel): New prototype.
+	(struct gomp_task): Add next_taskgroup, prev_taskgroup, taskgroup,
+	copy_ctors_done, dependers, depend_hash, depend_count, num_dependees
+	and depend fields.
+	(struct gomp_taskgroup): New type.
+	(struct gomp_task_depend_entry,
+	struct gomp_dependers_vec): New types.
+	(gomp_finish_task): Free depend_hash if non-NULL.
+	(struct gomp_team_state): Add place_partition_off
+	and place_partition_len fields.
+	(gomp_bind_var_list, gomp_bind_var_list_len, gomp_places_list,
+	gomp_places_list_len): New extern decls.
+	(struct gomp_thread): Add place field.
+	(gomp_cpu_affinity, gomp_cpu_affinity_len): Remove.
+	(gomp_init_thread_affinity): Add place argument.
+	(gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus,
+	gomp_affinity_remove_cpu, gomp_affinity_copy_place,
+	gomp_affinity_same_place, gomp_affinity_finalize_place_list,
+	gomp_affinity_init_level, gomp_affinity_print_place): New
+	prototypes.
+	(gomp_team_start): Add flags argument.
+	(gomp_thread_limit_var, gomp_remaining_threads_count,
+	gomp_remaining_threads_lock): Remove.
+	(gomp_managed_threads_lock): New variable.
+	(struct gomp_thread_pool): Add threads_busy field.
+	(gomp_free_thread): New prototype.
+	* task.c: Include hashtab.h.
+	(hash_entry_type): New typedef.
+	(htab_alloc, htab_free, htab_hash, htab_eq): New inlines.
+	(gomp_init_task): Clear dependers, depend_hash, depend_count,
+	copy_ctors_done and taskgroup fields.
+	(GOMP_task): Add depend argument, handle depend clauses.  If
+	gomp_team_barrier_cancelled or if its taskgroup has been
+	cancelled, don't queue or start new tasks.  Set copy_ctors_done
+	field if needed.  Initialize taskgroup field.  If copy_ctors_done
+	and already cancelled, don't discard the task.  If taskgroup is
+	non-NULL, enqueue the task into taskgroup queue.  Increment
+	num_children field in taskgroup.  Increment task_queued_count.
+	(gomp_task_run_pre, gomp_task_run_post_remove_parent,
+	gomp_task_run_post_remove_taskgroup): New inline functions.
+	(gomp_task_run_post_handle_depend_hash,
+	gomp_task_run_post_handle_dependers,
+	gomp_task_run_post_handle_depend): New functions.
+	(GOMP_taskwait): Use them.  If more than one new tasks
+	have been queued, wake other threads if needed.
+	(gomp_barrier_handle_tasks): Likewise.  If
+	gomp_team_barrier_cancelled, don't start any new tasks, just free
+	all tasks.
+	(GOMP_taskgroup_start, GOMP_taskgroup_end): New functions.
+	* loop.c (gomp_parallel_loop_start): Add flags argument, pass it
+	through to gomp_team_start.
+	(GOMP_parallel_loop_static_start, GOMP_parallel_loop_dynamic_start,
+	GOMP_parallel_loop_guided_start, GOMP_parallel_loop_runtime_start):
+	Adjust gomp_parallel_loop_start callers.
+	(GOMP_parallel_loop_static, GOMP_parallel_loop_dynamic,
+	GOMP_parallel_loop_guided, GOMP_parallel_loop_runtime,
+	GOMP_loop_end_cancel): New functions.
+	(GOMP_parallel_end): Add ialias_redirect.
+	* hashtab.h: New file.
+	* work.c (gomp_work_share_end, gomp_work_share_end_nowait): Set
+	team->work_shares_to_free to thr->ts.work_share before calling
+	free_work_share.
+	(gomp_work_share_end_cancel): New function.
+	* config/linux/proc.c: Include errno.h.
+	(gomp_get_cpuset_size, gomp_cpuset_size, gomp_cpusetp): New variables.
+	(gomp_cpuset_popcount): Add cpusetsize argument, use it instead of
+	sizeof (cpu_set_t) to determine number of iterations.  Fix up check
+	extern decl.  Use CPU_COUNT_S if available, or CPU_COUNT if
+	gomp_cpuset_size is sizeof (cpu_set_t).
+	(gomp_init_num_threads): Initialize gomp_cpuset_size,
+	gomp_get_cpuset_size and gomp_cpusetp here, use gomp_cpusetp instead
+	of &cpuset and pass gomp_cpuset_size instead of sizeof (cpu_set_t)
+	to pthread_getaffinity_np.  Free and clear gomp_cpusetp if it didn't
+	contain any logical CPUs.
+	(get_num_procs): Don't call pthread_getaffinity_np if gomp_cpusetp
+	is NULL.  Use gomp_cpusetp instead of &cpuset and pass
+	gomp_get_cpuset_size instead of sizeof (cpu_set_t) to
+	pthread_getaffinity_np.  Check gomp_places_list instead of
+	gomp_cpu_affinity.  Adjust gomp_cpuset_popcount caller.
+	* config/linux/bar.c (gomp_barrier_wait_end,
+	gomp_barrier_wait_last): Use BAR_* defines.
+	(gomp_team_barrier_wait_end): Likewise.  Clear BAR_CANCELLED
+	from state where needed.  Set work_share_cancelled to 0 on last
+	thread.
+	(gomp_team_barrier_wait_final, gomp_team_barrier_wait_cancel_end,
+	gomp_team_barrier_wait_cancel, gomp_team_barrier_cancel): New
+	functions.
+	* config/linux/proc.h (gomp_cpuset_popcount): Add attribute_hidden.
+	Add cpusetsize argument.
+	(gomp_cpuset_size, gomp_cpusetp): Declare.
+	* config/linux/affinity.c: Include errno.h, stdio.h and string.h.
+	(affinity_counter): Remove.
+	(CPU_ISSET_S, CPU_ZERO_S, CPU_SET_S, CPU_CLR_S): Define
+	if CPU_ALLOC_SIZE isn't defined.
+	(gomp_init_affinity): Rewritten, if gomp_places_list is NULL, try
+	to silently create OMP_PLACES=threads, if it is non-NULL afterwards,
+	bind current thread to the first place.
+	(gomp_init_thread_affinity): Rewritten.  Add place argument, just
+	pthread_setaffinity_np to gomp_places_list[place].
+	(gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus,
+	gomp_affinity_remove_cpu, gomp_affinity_copy_place,
+	gomp_affinity_same_place, gomp_affinity_finalize_place_list,
+	gomp_affinity_init_level, gomp_affinity_print_place): New functions.
+	* config/linux/bar.h (BAR_TASK_PENDING, BAR_WAS_LAST,
+	BAR_WAITING_FOR_TASK, BAR_INCR, BAR_CANCELLED): Define.
+	(gomp_barrier_t): Add awaited_final field.
+	(gomp_barrier_init): Initialize awaited_final field.
+	(gomp_team_barrier_wait_final, gomp_team_barrier_wait_cancel,
+	gomp_team_barrier_wait_cancel_end, gomp_team_barrier_cancel): New
+	prototypes.
+	(gomp_barrier_wait_start): Preserve BAR_CANCELLED bit.  Use BAR_*
+	defines.
+	(gomp_barrier_wait_cancel_start, gomp_team_barrier_wait_final_start,
+	gomp_team_barrier_cancelled): New inline functions.
+	(gomp_barrier_last_thread,
+	gomp_team_barrier_set_task_pending,
+	gomp_team_barrier_clear_task_pending,
+	gomp_team_barrier_set_waiting_for_tasks,
+	gomp_team_barrier_waiting_for_tasks,
+	gomp_team_barrier_done): Use BAR_* defines.
+	* config/posix/bar.c (gomp_barrier_init): Clear cancellable field.
+	(gomp_barrier_wait_end): Use BAR_* defines.
+	(gomp_team_barrier_wait_end): Clear BAR_CANCELLED from state.
+	Set work_share_cancelled to 0 on last thread, use __atomic_load_n.
+	Use BAR_* defines.
+	(gomp_team_barrier_wait_cancel_end, gomp_team_barrier_wait_cancel,
+	gomp_team_barrier_cancel): New functions.
+	* config/posix/affinity.c (gomp_init_thread_affinity): Add place
+	argument.
+	(gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus,
+	gomp_affinity_remove_cpu, gomp_affinity_copy_place,
+	gomp_affinity_same_place, gomp_affinity_finalize_place_list,
+	gomp_affinity_init_level, gomp_affinity_print_place): New stubs.
+	* config/posix/bar.h (BAR_TASK_PENDING, BAR_WAS_LAST,
+	BAR_WAITING_FOR_TASK, BAR_INCR, BAR_CANCELLED): Define.
+	(gomp_barrier_t): Add cancellable field.
+	(gomp_team_barrier_wait_cancel, gomp_team_barrier_wait_cancel_end,
+	gomp_team_barrier_cancel): New prototypes.
+	(gomp_barrier_wait_start): Preserve BAR_CANCELLED bit.
+	(gomp_barrier_wait_cancel_start, gomp_team_barrier_wait_final,
+	gomp_team_barrier_cancelled): New inline functions.
+	(gomp_barrier_wait_start, gomp_barrier_last_thread,
+	gomp_team_barrier_set_task_pending,
+	gomp_team_barrier_clear_task_pending,
+	gomp_team_barrier_set_waiting_for_tasks,
+	gomp_team_barrier_waiting_for_tasks,
+	gomp_team_barrier_done): Use BAR_* defines.
+	* barrier.c (GOMP_barrier_cancel): New function.
+	* parallel.c (GOMP_parallel, GOMP_cancel, GOMP_cancellation_point):
+	New functions.
+	(gomp_resolve_num_threads): Adjust for thread_limit now being in
+	icv->thread_limit_var.  Use UINT_MAX instead of ULONG_MAX as
+	infinity.  If not nested, just return minimum of max_num_threads
+	and icv->thread_limit_var and if thr->thread_pool, set threads_busy
+	to the returned value.  Otherwise, don't update atomically
+	gomp_remaining_threads_count, but instead thr->thread_pool->threads_busy.
+	(GOMP_parallel_end): Adjust for thread_limit now being in
+	icv->thread_limit_var.  Use UINT_MAX instead of ULONG_MAX as
+	infinity.  Adjust threads_busy in the pool rather than
+	gomp_remaining_threads_count.  Remember team->nthreads and call
+	gomp_team_end before adjusting threads_busy, if not nested
+	afterwards, just set it to 1 non-atomically.  Add ialias.
+	(GOMP_parallel_start): Adjust gomp_team_start caller.
+	* testsuite/libgomp.c/atomic-14.c: Add parens to make it valid.
+
+--- libgomp/Makefile.am	(revision 210461)
++++ libgomp/Makefile.am	(revision 210462)
+@@ -60,7 +60,7 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_L
+ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
+ 	iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
+ 	task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
+-	time.c fortran.c affinity.c
++	time.c fortran.c affinity.c target.c
+ 
+ nodist_noinst_HEADERS = libgomp_f.h
+ nodist_libsubinclude_HEADERS = omp.h
+--- libgomp/Makefile.in	(revision 210461)
++++ libgomp/Makefile.in	(revision 210462)
+@@ -96,7 +96,7 @@ am_libgomp_la_OBJECTS = alloc.lo barrier
+ 	error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \
+ 	parallel.lo sections.lo single.lo task.lo team.lo work.lo \
+ 	lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \
+-	fortran.lo affinity.lo
++	fortran.lo affinity.lo target.lo
+ libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
+ DEFAULT_INCLUDES = -I.@am__isrc@
+ depcomp = $(SHELL) $(top_srcdir)/../depcomp
+@@ -317,7 +317,7 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_L
+ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
+ 	iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
+ 	task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
+-	time.c fortran.c affinity.c
++	time.c fortran.c affinity.c target.c
+ 
+ nodist_noinst_HEADERS = libgomp_f.h
+ nodist_libsubinclude_HEADERS = omp.h
+@@ -474,6 +474,7 @@ distclean-compile:
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/time.Plo@am__quote@
+--- libgomp/libgomp_g.h	(revision 210461)
++++ libgomp/libgomp_g.h	(revision 210462)
+@@ -33,6 +33,7 @@
+ /* barrier.c */
+ 
+ extern void GOMP_barrier (void);
++extern bool GOMP_barrier_cancel (void);
+ 
+ /* critical.c */
+ 
+@@ -76,9 +77,22 @@ extern void GOMP_parallel_loop_guided_st
+ 					     unsigned, long, long, long, long);
+ extern void GOMP_parallel_loop_runtime_start (void (*)(void *), void *,
+ 					      unsigned, long, long, long);
++extern void GOMP_parallel_loop_static (void (*)(void *), void *,
++				       unsigned, long, long, long, long,
++				       unsigned);
++extern void GOMP_parallel_loop_dynamic (void (*)(void *), void *,
++					unsigned, long, long, long, long,
++					unsigned);
++extern void GOMP_parallel_loop_guided (void (*)(void *), void *,
++				       unsigned, long, long, long, long,
++				       unsigned);
++extern void GOMP_parallel_loop_runtime (void (*)(void *), void *,
++					unsigned, long, long, long,
++					unsigned);
+ 
+ extern void GOMP_loop_end (void);
+ extern void GOMP_loop_end_nowait (void);
++extern bool GOMP_loop_end_cancel (void);
+ 
+ /* loop_ull.c */
+ 
+@@ -157,13 +171,18 @@ extern void GOMP_ordered_end (void);
+ 
+ extern void GOMP_parallel_start (void (*) (void *), void *, unsigned);
+ extern void GOMP_parallel_end (void);
++extern void GOMP_parallel (void (*) (void *), void *, unsigned, unsigned);
++extern bool GOMP_cancel (int, bool);
++extern bool GOMP_cancellation_point (int);
+ 
+ /* task.c */
+ 
+ extern void GOMP_task (void (*) (void *), void *, void (*) (void *, void *),
+-		       long, long, bool, unsigned);
++		       long, long, bool, unsigned, void **);
+ extern void GOMP_taskwait (void);
+ extern void GOMP_taskyield (void);
++extern void GOMP_taskgroup_start (void);
++extern void GOMP_taskgroup_end (void);
+ 
+ /* sections.c */
+ 
+@@ -171,8 +190,11 @@ extern unsigned GOMP_sections_start (uns
+ extern unsigned GOMP_sections_next (void);
+ extern void GOMP_parallel_sections_start (void (*) (void *), void *,
+ 					  unsigned, unsigned);
++extern void GOMP_parallel_sections (void (*) (void *), void *,
++				    unsigned, unsigned, unsigned);
+ extern void GOMP_sections_end (void);
+ extern void GOMP_sections_end_nowait (void);
++extern bool GOMP_sections_end_cancel (void);
+ 
+ /* single.c */
+ 
+@@ -180,4 +202,15 @@ extern bool GOMP_single_start (void);
+ extern void *GOMP_single_copy_start (void);
+ extern void GOMP_single_copy_end (void *);
+ 
++/* target.c */
++
++extern void GOMP_target (int, void (*) (void *), const void *,
++			 size_t, void **, size_t *, unsigned char *);
++extern void GOMP_target_data (int, const void *,
++			      size_t, void **, size_t *, unsigned char *);
++extern void GOMP_target_end_data (void);
++extern void GOMP_target_update (int, const void *,
++				size_t, void **, size_t *, unsigned char *);
++extern void GOMP_teams (unsigned int, unsigned int);
++
+ #endif /* LIBGOMP_G_H */
+--- libgomp/fortran.c	(revision 210461)
++++ libgomp/fortran.c	(revision 210462)
+@@ -31,11 +31,6 @@
+ 
+ #ifdef HAVE_ATTRIBUTE_ALIAS
+ /* Use internal aliases if possible.  */
+-# define ULP		STR1(__USER_LABEL_PREFIX__)
+-# define STR1(x)	STR2(x)
+-# define STR2(x)	#x
+-# define ialias_redirect(fn) \
+-  extern __typeof (fn) fn __asm__ (ULP "gomp_ialias_" #fn) attribute_hidden;
+ # ifndef LIBGOMP_GNU_SYMBOL_VERSIONING
+ ialias_redirect (omp_init_lock)
+ ialias_redirect (omp_init_nest_lock)
+@@ -70,6 +65,14 @@ ialias_redirect (omp_get_ancestor_thread
+ ialias_redirect (omp_get_team_size)
+ ialias_redirect (omp_get_active_level)
+ ialias_redirect (omp_in_final)
++ialias_redirect (omp_get_cancellation)
++ialias_redirect (omp_get_proc_bind)
++ialias_redirect (omp_set_default_device)
++ialias_redirect (omp_get_default_device)
++ialias_redirect (omp_get_num_devices)
++ialias_redirect (omp_get_num_teams)
++ialias_redirect (omp_get_team_num)
++ialias_redirect (omp_is_initial_device)
+ #endif
+ 
+ #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING
+@@ -435,3 +438,57 @@ omp_in_final_ (void)
+ {
+   return omp_in_final ();
+ }
++
++int32_t
++omp_get_cancellation_ (void)
++{
++  return omp_get_cancellation ();
++}
++
++int32_t
++omp_get_proc_bind_ (void)
++{
++  return omp_get_proc_bind ();
++}
++
++void
++omp_set_default_device_ (const int32_t *device_num)
++{
++  return omp_set_default_device (*device_num);
++}
++
++void
++omp_set_default_device_8_ (const int64_t *device_num)
++{
++  return omp_set_default_device (TO_INT (*device_num));
++}
++
++int32_t
++omp_get_default_device_ (void)
++{
++  return omp_get_default_device ();
++}
++
++int32_t
++omp_get_num_devices_ (void)
++{
++  return omp_get_num_devices ();
++}
++
++int32_t
++omp_get_num_teams_ (void)
++{
++  return omp_get_num_teams ();
++}
++
++int32_t
++omp_get_team_num_ (void)
++{
++  return omp_get_team_num ();
++}
++
++int32_t
++omp_is_initial_device_ (void)
++{
++  return omp_is_initial_device ();
++}
+--- libgomp/libgomp.map	(revision 210461)
++++ libgomp/libgomp.map	(revision 210462)
+@@ -113,6 +113,27 @@ OMP_3.1 {
+ 	omp_in_final_;
+ } OMP_3.0;
+ 
++OMP_4.0 {
++  global:
++	omp_get_cancellation;
++	omp_get_cancellation_;
++	omp_get_proc_bind;
++	omp_get_proc_bind_;
++	omp_set_default_device;
++	omp_set_default_device_;
++	omp_set_default_device_8_;
++	omp_get_default_device;
++	omp_get_default_device_;
++	omp_get_num_devices;
++	omp_get_num_devices_;
++	omp_get_num_teams;
++	omp_get_num_teams_;
++	omp_get_team_num;
++	omp_get_team_num_;
++	omp_is_initial_device;
++	omp_is_initial_device_;
++} OMP_3.1;
++
+ GOMP_1.0 {
+   global:
+ 	GOMP_atomic_end;
+@@ -184,3 +205,25 @@ GOMP_3.0 {
+   global:
+ 	GOMP_taskyield;
+ } GOMP_2.0;
++
++GOMP_4.0 {
++  global:
++	GOMP_barrier_cancel;
++	GOMP_cancel;
++	GOMP_cancellation_point;
++	GOMP_loop_end_cancel;
++	GOMP_parallel_loop_dynamic;
++	GOMP_parallel_loop_guided;
++	GOMP_parallel_loop_runtime;
++	GOMP_parallel_loop_static;
++	GOMP_parallel_sections;
++	GOMP_parallel;
++	GOMP_sections_end_cancel;
++	GOMP_taskgroup_start;
++	GOMP_taskgroup_end;
++	GOMP_target;
++	GOMP_target_data;
++	GOMP_target_end_data;
++	GOMP_target_update;
++	GOMP_teams;
++} GOMP_3.0;
+--- libgomp/team.c	(revision 210461)
++++ libgomp/team.c	(revision 210462)
+@@ -53,6 +53,7 @@ struct gomp_thread_start_data
+   struct gomp_team_state ts;
+   struct gomp_task *task;
+   struct gomp_thread_pool *thread_pool;
++  unsigned int place;
+   bool nested;
+ };
+ 
+@@ -84,6 +85,7 @@ gomp_thread_start (void *xdata)
+   thr->thread_pool = data->thread_pool;
+   thr->ts = data->ts;
+   thr->task = data->task;
++  thr->place = data->place;
+ 
+   thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
+ 
+@@ -98,7 +100,7 @@ gomp_thread_start (void *xdata)
+       gomp_barrier_wait (&team->barrier);
+ 
+       local_fn (local_data);
+-      gomp_team_barrier_wait (&team->barrier);
++      gomp_team_barrier_wait_final (&team->barrier);
+       gomp_finish_task (task);
+       gomp_barrier_wait_last (&team->barrier);
+     }
+@@ -113,7 +115,7 @@ gomp_thread_start (void *xdata)
+ 	  struct gomp_task *task = thr->task;
+ 
+ 	  local_fn (local_data);
+-	  gomp_team_barrier_wait (&team->barrier);
++	  gomp_team_barrier_wait_final (&team->barrier);
+ 	  gomp_finish_task (task);
+ 
+ 	  gomp_barrier_wait (&pool->threads_dock);
+@@ -126,6 +128,8 @@ gomp_thread_start (void *xdata)
+     }
+ 
+   gomp_sem_destroy (&thr->release);
++  thr->thread_pool = NULL;
++  thr->task = NULL;
+   return NULL;
+ }
+ 
+@@ -149,6 +153,7 @@ gomp_new_team (unsigned nthreads)
+ #else
+   gomp_mutex_init (&team->work_share_list_free_lock);
+ #endif
++  team->work_shares_to_free = &team->work_shares[0];
+   gomp_init_work_share (&team->work_shares[0], false, nthreads);
+   team->work_shares[0].next_alloc = NULL;
+   team->work_share_list_free = NULL;
+@@ -167,7 +172,10 @@ gomp_new_team (unsigned nthreads)
+   gomp_mutex_init (&team->task_lock);
+   team->task_queue = NULL;
+   team->task_count = 0;
++  team->task_queued_count = 0;
+   team->task_running_count = 0;
++  team->work_share_cancelled = 0;
++  team->team_cancelled = 0;
+ 
+   return team;
+ }
+@@ -199,16 +207,19 @@ static struct gomp_thread_pool *gomp_new
+ static void
+ gomp_free_pool_helper (void *thread_pool)
+ {
++  struct gomp_thread *thr = gomp_thread ();
+   struct gomp_thread_pool *pool
+     = (struct gomp_thread_pool *) thread_pool;
+   gomp_barrier_wait_last (&pool->threads_dock);
+-  gomp_sem_destroy (&gomp_thread ()->release);
++  gomp_sem_destroy (&thr->release);
++  thr->thread_pool = NULL;
++  thr->task = NULL;
+   pthread_exit (NULL);
+ }
+ 
+ /* Free a thread pool and release its threads. */
+ 
+-static void
++void
+ gomp_free_thread (void *arg __attribute__((unused)))
+ {
+   struct gomp_thread *thr = gomp_thread ();
+@@ -236,9 +247,9 @@ gomp_free_thread (void *arg __attribute_
+ 	  __sync_fetch_and_add (&gomp_managed_threads,
+ 				1L - pool->threads_used);
+ #else
+-	  gomp_mutex_lock (&gomp_remaining_threads_lock);
++	  gomp_mutex_lock (&gomp_managed_threads_lock);
+ 	  gomp_managed_threads -= pool->threads_used - 1L;
+-	  gomp_mutex_unlock (&gomp_remaining_threads_lock);
++	  gomp_mutex_unlock (&gomp_managed_threads_lock);
+ #endif
+ 	}
+       free (pool->threads);
+@@ -259,7 +270,7 @@ gomp_free_thread (void *arg __attribute_
+ 
+ void
+ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
+-		 struct gomp_team *team)
++		 unsigned flags, struct gomp_team *team)
+ {
+   struct gomp_thread_start_data *start_data;
+   struct gomp_thread *thr, *nthr;
+@@ -270,17 +281,24 @@ gomp_team_start (void (*fn) (void *), vo
+   unsigned i, n, old_threads_used = 0;
+   pthread_attr_t thread_attr, *attr;
+   unsigned long nthreads_var;
++  char bind, bind_var;
++  unsigned int s = 0, rest = 0, p = 0, k = 0;
++  unsigned int affinity_count = 0;
++  struct gomp_thread **affinity_thr = NULL;
+ 
+   thr = gomp_thread ();
+   nested = thr->ts.team != NULL;
+   if (__builtin_expect (thr->thread_pool == NULL, 0))
+     {
+       thr->thread_pool = gomp_new_thread_pool ();
++      thr->thread_pool->threads_busy = nthreads;
+       pthread_setspecific (gomp_thread_destructor, thr);
+     }
+   pool = thr->thread_pool;
+   task = thr->task;
+   icv = task ? &task->icv : &gomp_global_icv;
++  if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
++    gomp_init_affinity ();
+ 
+   /* Always save the previous state, even if this isn't a nested team.
+      In particular, we should save any work share state from an outer
+@@ -303,14 +321,90 @@ gomp_team_start (void (*fn) (void *), vo
+   if (__builtin_expect (gomp_nthreads_var_list != NULL, 0)
+       && thr->ts.level < gomp_nthreads_var_list_len)
+     nthreads_var = gomp_nthreads_var_list[thr->ts.level];
++  bind_var = icv->bind_var;
++  if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
++    bind_var = flags & 7;
++  bind = bind_var;
++  if (__builtin_expect (gomp_bind_var_list != NULL, 0)
++      && thr->ts.level < gomp_bind_var_list_len)
++    bind_var = gomp_bind_var_list[thr->ts.level];
+   gomp_init_task (thr->task, task, icv);
+   team->implicit_task[0].icv.nthreads_var = nthreads_var;
++  team->implicit_task[0].icv.bind_var = bind_var;
+ 
+   if (nthreads == 1)
+     return;
+ 
+   i = 1;
+ 
++  if (__builtin_expect (gomp_places_list != NULL, 0))
++    {
++      /* Depending on chosen proc_bind model, set subpartition
++	 for the master thread and initialize helper variables
++	 P and optionally S, K and/or REST used by later place
++	 computation for each additional thread.  */
++      p = thr->place - 1;
++      switch (bind)
++	{
++	case omp_proc_bind_true:
++	case omp_proc_bind_close:
++	  if (nthreads > thr->ts.place_partition_len)
++	    {
++	      /* T > P.  S threads will be placed in each place,
++		 and the final REM threads placed one by one
++		 into the already occupied places.  */
++	      s = nthreads / thr->ts.place_partition_len;
++	      rest = nthreads % thr->ts.place_partition_len;
++	    }
++	  else
++	    s = 1;
++	  k = 1;
++	  break;
++	case omp_proc_bind_master:
++	  /* Each thread will be bound to master's place.  */
++	  break;
++	case omp_proc_bind_spread:
++	  if (nthreads <= thr->ts.place_partition_len)
++	    {
++	      /* T <= P.  Each subpartition will have in between s
++		 and s+1 places (subpartitions starting at or
++		 after rest will have s places, earlier s+1 places),
++		 each thread will be bound to the first place in
++		 its subpartition (except for the master thread
++		 that can be bound to another place in its
++		 subpartition).  */
++	      s = thr->ts.place_partition_len / nthreads;
++	      rest = thr->ts.place_partition_len % nthreads;
++	      rest = (s + 1) * rest + thr->ts.place_partition_off;
++	      if (p < rest)
++		{
++		  p -= (p - thr->ts.place_partition_off) % (s + 1);
++		  thr->ts.place_partition_len = s + 1;
++		}
++	      else
++		{
++		  p -= (p - rest) % s;
++		  thr->ts.place_partition_len = s;
++		}
++	      thr->ts.place_partition_off = p;
++	    }
++	  else
++	    {
++	      /* T > P.  Each subpartition will have just a single
++		 place and we'll place between s and s+1
++		 threads into each subpartition.  */
++	      s = nthreads / thr->ts.place_partition_len;
++	      rest = nthreads % thr->ts.place_partition_len;
++	      thr->ts.place_partition_off = p;
++	      thr->ts.place_partition_len = 1;
++	      k = 1;
++	    }
++	  break;
++	}
++    }
++  else
++    bind = omp_proc_bind_false;
++
+   /* We only allow the reuse of idle threads for non-nested PARALLEL
+      regions.  This appears to be implied by the semantics of
+      threadprivate variables, but perhaps that's reading too much into
+@@ -341,47 +435,244 @@ gomp_team_start (void (*fn) (void *), vo
+ 	 team will exit.  */
+       pool->threads_used = nthreads;
+ 
++      /* If necessary, expand the size of the gomp_threads array.  It is
++	 expected that changes in the number of threads are rare, thus we
++	 make no effort to expand gomp_threads_size geometrically.  */
++      if (nthreads >= pool->threads_size)
++	{
++	  pool->threads_size = nthreads + 1;
++	  pool->threads
++	    = gomp_realloc (pool->threads,
++			    pool->threads_size
++			    * sizeof (struct gomp_thread_data *));
++	}
++
+       /* Release existing idle threads.  */
+       for (; i < n; ++i)
+ 	{
+-	  nthr = pool->threads[i];
++	  unsigned int place_partition_off = thr->ts.place_partition_off;
++	  unsigned int place_partition_len = thr->ts.place_partition_len;
++	  unsigned int place = 0;
++	  if (__builtin_expect (gomp_places_list != NULL, 0))
++	    {
++	      switch (bind)
++		{
++		case omp_proc_bind_true:
++		case omp_proc_bind_close:
++		  if (k == s)
++		    {
++		      ++p;
++		      if (p == (team->prev_ts.place_partition_off
++				+ team->prev_ts.place_partition_len))
++			p = team->prev_ts.place_partition_off;
++		      k = 1;
++		      if (i == nthreads - rest)
++			s = 1;
++		    }
++		  else
++		    ++k;
++		  break;
++		case omp_proc_bind_master:
++		  break;
++		case omp_proc_bind_spread:
++		  if (k == 0)
++		    {
++		      /* T <= P.  */
++		      if (p < rest)
++			p += s + 1;
++		      else
++			p += s;
++		      if (p == (team->prev_ts.place_partition_off
++				+ team->prev_ts.place_partition_len))
++			p = team->prev_ts.place_partition_off;
++		      place_partition_off = p;
++		      if (p < rest)
++			place_partition_len = s + 1;
++		      else
++			place_partition_len = s;
++		    }
++		  else
++		    {
++		      /* T > P.  */
++		      if (k == s)
++			{
++			  ++p;
++			  if (p == (team->prev_ts.place_partition_off
++				    + team->prev_ts.place_partition_len))
++			    p = team->prev_ts.place_partition_off;
++			  k = 1;
++			  if (i == nthreads - rest)
++			    s = 1;
++			}
++		      else
++			++k;
++		      place_partition_off = p;
++		      place_partition_len = 1;
++		    }
++		  break;
++		}
++	      if (affinity_thr != NULL
++		  || (bind != omp_proc_bind_true
++		      && pool->threads[i]->place != p + 1)
++		  || pool->threads[i]->place <= place_partition_off
++		  || pool->threads[i]->place > (place_partition_off
++						+ place_partition_len))
++		{
++		  unsigned int l;
++		  if (affinity_thr == NULL)
++		    {
++		      unsigned int j;
++
++		      if (team->prev_ts.place_partition_len > 64)
++			affinity_thr
++			  = gomp_malloc (team->prev_ts.place_partition_len
++					 * sizeof (struct gomp_thread *));
++		      else
++			affinity_thr
++			  = gomp_alloca (team->prev_ts.place_partition_len
++					 * sizeof (struct gomp_thread *));
++		      memset (affinity_thr, '\0',
++			      team->prev_ts.place_partition_len
++			      * sizeof (struct gomp_thread *));
++		      for (j = i; j < old_threads_used; j++)
++			{
++			  if (pool->threads[j]->place
++			      > team->prev_ts.place_partition_off
++			      && (pool->threads[j]->place
++				  <= (team->prev_ts.place_partition_off
++				      + team->prev_ts.place_partition_len)))
++			    {
++			      l = pool->threads[j]->place - 1
++				  - team->prev_ts.place_partition_off;
++			      pool->threads[j]->data = affinity_thr[l];
++			      affinity_thr[l] = pool->threads[j];
++			    }
++			  pool->threads[j] = NULL;
++			}
++		      if (nthreads > old_threads_used)
++			memset (&pool->threads[old_threads_used],
++				'\0', ((nthreads - old_threads_used)
++				       * sizeof (struct gomp_thread *)));
++		      n = nthreads;
++		      affinity_count = old_threads_used - i;
++		    }
++		  if (affinity_count == 0)
++		    break;
++		  l = p;
++		  if (affinity_thr[l - team->prev_ts.place_partition_off]
++		      == NULL)
++		    {
++		      if (bind != omp_proc_bind_true)
++			continue;
++		      for (l = place_partition_off;
++			   l < place_partition_off + place_partition_len;
++			   l++)
++			if (affinity_thr[l - team->prev_ts.place_partition_off]
++			    != NULL)
++			  break;
++		      if (l == place_partition_off + place_partition_len)
++			continue;
++		    }
++		  nthr = affinity_thr[l - team->prev_ts.place_partition_off];
++		  affinity_thr[l - team->prev_ts.place_partition_off]
++		    = (struct gomp_thread *) nthr->data;
++		  affinity_count--;
++		  pool->threads[i] = nthr;
++		}
++	      else
++		nthr = pool->threads[i];
++	      place = p + 1;
++	    }
++	  else
++	    nthr = pool->threads[i];
+ 	  nthr->ts.team = team;
+ 	  nthr->ts.work_share = &team->work_shares[0];
+ 	  nthr->ts.last_work_share = NULL;
+ 	  nthr->ts.team_id = i;
+ 	  nthr->ts.level = team->prev_ts.level + 1;
+ 	  nthr->ts.active_level = thr->ts.active_level;
++	  nthr->ts.place_partition_off = place_partition_off;
++	  nthr->ts.place_partition_len = place_partition_len;
+ #ifdef HAVE_SYNC_BUILTINS
+ 	  nthr->ts.single_count = 0;
+ #endif
+ 	  nthr->ts.static_trip = 0;
+ 	  nthr->task = &team->implicit_task[i];
++	  nthr->place = place;
+ 	  gomp_init_task (nthr->task, task, icv);
+ 	  team->implicit_task[i].icv.nthreads_var = nthreads_var;
++	  team->implicit_task[i].icv.bind_var = bind_var;
+ 	  nthr->fn = fn;
+ 	  nthr->data = data;
+ 	  team->ordered_release[i] = &nthr->release;
+ 	}
+ 
++      if (__builtin_expect (affinity_thr != NULL, 0))
++	{
++	  /* If AFFINITY_THR is non-NULL just because we had to
++	     permute some threads in the pool, but we've managed
++	     to find exactly as many old threads as we'd find
++	     without affinity, we don't need to handle this
++	     specially anymore.  */
++	  if (nthreads <= old_threads_used
++	      ? (affinity_count == old_threads_used - nthreads)
++	      : (i == old_threads_used))
++	    {
++	      if (team->prev_ts.place_partition_len > 64)
++		free (affinity_thr);
++	      affinity_thr = NULL;
++	      affinity_count = 0;
++	    }
++	  else
++	    {
++	      i = 1;
++	      /* We are going to compute the places/subpartitions
++		 again from the beginning.  So, we need to reinitialize
++		 vars modified by the switch (bind) above inside
++		 of the loop, to the state they had after the initial
++		 switch (bind).  */
++	      switch (bind)
++		{
++		case omp_proc_bind_true:
++		case omp_proc_bind_close:
++		  if (nthreads > thr->ts.place_partition_len)
++		    /* T > P.  S has been changed, so needs
++		       to be recomputed.  */
++		    s = nthreads / thr->ts.place_partition_len;
++		  k = 1;
++		  p = thr->place - 1;
++		  break;
++		case omp_proc_bind_master:
++		  /* No vars have been changed.  */
++		  break;
++		case omp_proc_bind_spread:
++		  p = thr->ts.place_partition_off;
++		  if (k != 0)
++		    {
++		      /* T > P.  */
++		      s = nthreads / team->prev_ts.place_partition_len;
++		      k = 1;
++		    }
++		  break;
++		}
++
++	      /* Increase the barrier threshold to make sure all new
++		 threads and all the threads we're going to let die
++		 arrive before the team is released.  */
++	      if (affinity_count)
++		gomp_barrier_reinit (&pool->threads_dock,
++				     nthreads + affinity_count);
++	    }
++	}
++
+       if (i == nthreads)
+ 	goto do_release;
+ 
+-      /* If necessary, expand the size of the gomp_threads array.  It is
+-	 expected that changes in the number of threads are rare, thus we
+-	 make no effort to expand gomp_threads_size geometrically.  */
+-      if (nthreads >= pool->threads_size)
+-	{
+-	  pool->threads_size = nthreads + 1;
+-	  pool->threads
+-	    = gomp_realloc (pool->threads,
+-			    pool->threads_size
+-			    * sizeof (struct gomp_thread_data *));
+-	}
+     }
+ 
+-  if (__builtin_expect (nthreads > old_threads_used, 0))
++  if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
+     {
+-      long diff = (long) nthreads - (long) old_threads_used;
++      long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;
+ 
+       if (old_threads_used == 0)
+ 	--diff;
+@@ -389,14 +680,14 @@ gomp_team_start (void (*fn) (void *), vo
+ #ifdef HAVE_SYNC_BUILTINS
+       __sync_fetch_and_add (&gomp_managed_threads, diff);
+ #else
+-      gomp_mutex_lock (&gomp_remaining_threads_lock);
++      gomp_mutex_lock (&gomp_managed_threads_lock);
+       gomp_managed_threads += diff;
+-      gomp_mutex_unlock (&gomp_remaining_threads_lock);
++      gomp_mutex_unlock (&gomp_managed_threads_lock);
+ #endif
+     }
+ 
+   attr = &gomp_thread_attr;
+-  if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
++  if (__builtin_expect (gomp_places_list != NULL, 0))
+     {
+       size_t stacksize;
+       pthread_attr_init (&thread_attr);
+@@ -410,11 +701,78 @@ gomp_team_start (void (*fn) (void *), vo
+ 			    * (nthreads-i));
+ 
+   /* Launch new threads.  */
+-  for (; i < nthreads; ++i, ++start_data)
++  for (; i < nthreads; ++i)
+     {
+       pthread_t pt;
+       int err;
+ 
++      start_data->ts.place_partition_off = thr->ts.place_partition_off;
++      start_data->ts.place_partition_len = thr->ts.place_partition_len;
++      start_data->place = 0;
++      if (__builtin_expect (gomp_places_list != NULL, 0))
++	{
++	  switch (bind)
++	    {
++	    case omp_proc_bind_true:
++	    case omp_proc_bind_close:
++	      if (k == s)
++		{
++		  ++p;
++		  if (p == (team->prev_ts.place_partition_off
++			    + team->prev_ts.place_partition_len))
++		    p = team->prev_ts.place_partition_off;
++		  k = 1;
++		  if (i == nthreads - rest)
++		    s = 1;
++		}
++	      else
++		++k;
++	      break;
++	    case omp_proc_bind_master:
++	      break;
++	    case omp_proc_bind_spread:
++	      if (k == 0)
++		{
++		  /* T <= P.  */
++		  if (p < rest)
++		    p += s + 1;
++		  else
++		    p += s;
++		  if (p == (team->prev_ts.place_partition_off
++			    + team->prev_ts.place_partition_len))
++		    p = team->prev_ts.place_partition_off;
++		  start_data->ts.place_partition_off = p;
++		  if (p < rest)
++		    start_data->ts.place_partition_len = s + 1;
++		  else
++		    start_data->ts.place_partition_len = s;
++		}
++	      else
++		{
++		  /* T > P.  */
++		  if (k == s)
++		    {
++		      ++p;
++		      if (p == (team->prev_ts.place_partition_off
++				+ team->prev_ts.place_partition_len))
++			p = team->prev_ts.place_partition_off;
++		      k = 1;
++		      if (i == nthreads - rest)
++			s = 1;
++		    }
++		  else
++		    ++k;
++		  start_data->ts.place_partition_off = p;
++		  start_data->ts.place_partition_len = 1;
++		}
++	      break;
++	    }
++	  start_data->place = p + 1;
++	  if (affinity_thr != NULL && pool->threads[i] != NULL)
++	    continue;
++	  gomp_init_thread_affinity (attr, p);
++	}
++
+       start_data->fn = fn;
+       start_data->fn_data = data;
+       start_data->ts.team = team;
+@@ -430,18 +788,16 @@ gomp_team_start (void (*fn) (void *), vo
+       start_data->task = &team->implicit_task[i];
+       gomp_init_task (start_data->task, task, icv);
+       team->implicit_task[i].icv.nthreads_var = nthreads_var;
++      team->implicit_task[i].icv.bind_var = bind_var;
+       start_data->thread_pool = pool;
+       start_data->nested = nested;
+ 
+-      if (gomp_cpu_affinity != NULL)
+-	gomp_init_thread_affinity (attr);
+-
+-      err = pthread_create (&pt, attr, gomp_thread_start, start_data);
++      err = pthread_create (&pt, attr, gomp_thread_start, start_data++);
+       if (err != 0)
+ 	gomp_fatal ("Thread creation failed: %s", strerror (err));
+     }
+ 
+-  if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
++  if (__builtin_expect (gomp_places_list != NULL, 0))
+     pthread_attr_destroy (&thread_attr);
+ 
+  do_release:
+@@ -450,21 +806,32 @@ gomp_team_start (void (*fn) (void *), vo
+   /* Decrease the barrier threshold to match the number of threads
+      that should arrive back at the end of this team.  The extra
+      threads should be exiting.  Note that we arrange for this test
+-     to never be true for nested teams.  */
+-  if (__builtin_expect (nthreads < old_threads_used, 0))
++     to never be true for nested teams.  If AFFINITY_COUNT is non-zero,
++     the barrier as well as gomp_managed_threads was temporarily
++     set to NTHREADS + AFFINITY_COUNT.  For NTHREADS < OLD_THREADS_COUNT,
++     AFFINITY_COUNT if non-zero will be always at least
++     OLD_THREADS_COUNT - NTHREADS.  */
++  if (__builtin_expect (nthreads < old_threads_used, 0)
++      || __builtin_expect (affinity_count, 0))
+     {
+       long diff = (long) nthreads - (long) old_threads_used;
+ 
++      if (affinity_count)
++	diff = -affinity_count;
++
+       gomp_barrier_reinit (&pool->threads_dock, nthreads);
+ 
+ #ifdef HAVE_SYNC_BUILTINS
+       __sync_fetch_and_add (&gomp_managed_threads, diff);
+ #else
+-      gomp_mutex_lock (&gomp_remaining_threads_lock);
++      gomp_mutex_lock (&gomp_managed_threads_lock);
+       gomp_managed_threads += diff;
+-      gomp_mutex_unlock (&gomp_remaining_threads_lock);
++      gomp_mutex_unlock (&gomp_managed_threads_lock);
+ #endif
+     }
++  if (__builtin_expect (affinity_thr != NULL, 0)
++      && team->prev_ts.place_partition_len > 64)
++    free (affinity_thr);
+ }
+ 
+ 
+@@ -477,9 +844,26 @@ gomp_team_end (void)
+   struct gomp_thread *thr = gomp_thread ();
+   struct gomp_team *team = thr->ts.team;
+ 
+-  /* This barrier handles all pending explicit threads.  */
+-  gomp_team_barrier_wait (&team->barrier);
+-  gomp_fini_work_share (thr->ts.work_share);
++  /* This barrier handles all pending explicit threads.
++     As #pragma omp cancel parallel might get awaited count in
++     team->barrier in an inconsistent state, we need to use a different
++     counter here.  */
++  gomp_team_barrier_wait_final (&team->barrier);
++  if (__builtin_expect (team->team_cancelled, 0))
++    {
++      struct gomp_work_share *ws = team->work_shares_to_free;
++      do
++	{
++	  struct gomp_work_share *next_ws = gomp_ptrlock_get (&ws->next_ws);
++	  if (next_ws == NULL)
++	    gomp_ptrlock_set (&ws->next_ws, ws);
++	  gomp_fini_work_share (ws);
++	  ws = next_ws;
++	}
++      while (ws != NULL);
++    }
++  else
++    gomp_fini_work_share (thr->ts.work_share);
+ 
+   gomp_end_task ();
+   thr->ts = team->prev_ts;
+@@ -489,9 +873,9 @@ gomp_team_end (void)
+ #ifdef HAVE_SYNC_BUILTINS
+       __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
+ #else
+-      gomp_mutex_lock (&gomp_remaining_threads_lock);
++      gomp_mutex_lock (&gomp_managed_threads_lock);
+       gomp_managed_threads -= team->nthreads - 1L;
+-      gomp_mutex_unlock (&gomp_remaining_threads_lock);
++      gomp_mutex_unlock (&gomp_managed_threads_lock);
+ #endif
+       /* This barrier has gomp_barrier_wait_last counterparts
+ 	 and ensures the team can be safely destroyed.  */
+@@ -532,8 +916,6 @@ gomp_team_end (void)
+ static void __attribute__((constructor))
+ initialize_team (void)
+ {
+-  struct gomp_thread *thr;
+-
+ #ifndef HAVE_TLS
+   static struct gomp_thread initial_thread_tls_data;
+ 
+@@ -543,13 +925,6 @@ initialize_team (void)
+ 
+   if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0)
+     gomp_fatal ("could not create thread pool destructor.");
+-
+-#ifdef HAVE_TLS
+-  thr = &gomp_tls_data;
+-#else
+-  thr = &initial_thread_tls_data;
+-#endif
+-  gomp_sem_init (&thr->release, 0);
+ }
+ 
+ static void __attribute__((destructor))
+--- libgomp/sections.c	(revision 210461)
++++ libgomp/sections.c	(revision 210462)
+@@ -139,11 +139,27 @@ GOMP_parallel_sections_start (void (*fn)
+   num_threads = gomp_resolve_num_threads (num_threads, count);
+   team = gomp_new_team (num_threads);
+   gomp_sections_init (&team->work_shares[0], count);
+-  gomp_team_start (fn, data, num_threads, team);
++  gomp_team_start (fn, data, num_threads, 0, team);
++}
++
++ialias_redirect (GOMP_parallel_end)
++
++void
++GOMP_parallel_sections (void (*fn) (void *), void *data,
++			unsigned num_threads, unsigned count, unsigned flags)
++{
++  struct gomp_team *team;
++
++  num_threads = gomp_resolve_num_threads (num_threads, count);
++  team = gomp_new_team (num_threads);
++  gomp_sections_init (&team->work_shares[0], count);
++  gomp_team_start (fn, data, num_threads, flags, team);
++  fn (data);
++  GOMP_parallel_end ();
+ }
+ 
+ /* The GOMP_section_end* routines are called after the thread is told
+-   that all sections are complete.  This first version synchronizes
++   that all sections are complete.  The first two versions synchronize
+    all threads; the nowait version does not.  */
+ 
+ void
+@@ -152,6 +168,12 @@ GOMP_sections_end (void)
+   gomp_work_share_end ();
+ }
+ 
++bool
++GOMP_sections_end_cancel (void)
++{
++  return gomp_work_share_end_cancel ();
++}
++
+ void
+ GOMP_sections_end_nowait (void)
+ {
+--- libgomp/env.c	(revision 210461)
++++ libgomp/env.c	(revision 210462)
+@@ -29,6 +29,10 @@
+ #include "libgomp_f.h"
+ #include <ctype.h>
+ #include <stdlib.h>
++#include <stdio.h>
++#ifdef HAVE_INTTYPES_H
++# include <inttypes.h>	/* For PRIu64.  */
++#endif
+ #ifdef STRING_WITH_STRINGS
+ # include <string.h>
+ # include <strings.h>
+@@ -50,23 +54,28 @@
+ 
+ struct gomp_task_icv gomp_global_icv = {
+   .nthreads_var = 1,
++  .thread_limit_var = UINT_MAX,
+   .run_sched_var = GFS_DYNAMIC,
+   .run_sched_modifier = 1,
++  .default_device_var = 0,
+   .dyn_var = false,
+-  .nest_var = false
++  .nest_var = false,
++  .bind_var = omp_proc_bind_false,
++  .target_data = NULL
+ };
+ 
+-unsigned short *gomp_cpu_affinity;
+-size_t gomp_cpu_affinity_len;
+ unsigned long gomp_max_active_levels_var = INT_MAX;
+-unsigned long gomp_thread_limit_var = ULONG_MAX;
+-unsigned long gomp_remaining_threads_count;
++bool gomp_cancel_var = false;
+ #ifndef HAVE_SYNC_BUILTINS
+-gomp_mutex_t gomp_remaining_threads_lock;
++gomp_mutex_t gomp_managed_threads_lock;
+ #endif
+ unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
+ unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
+ unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
++char *gomp_bind_var_list;
++unsigned long gomp_bind_var_list_len;
++void **gomp_places_list;
++unsigned long gomp_places_list_len;
+ 
+ /* Parse the OMP_SCHEDULE environment variable.  */
+ 
+@@ -184,6 +193,24 @@ parse_unsigned_long (const char *name, u
+   return false;
+ }
+ 
++/* Parse a positive int environment variable.  Return true if one was
++   present and it was successfully parsed.  */
++
++static bool
++parse_int (const char *name, int *pvalue, bool allow_zero)
++{
++  unsigned long value;
++  if (!parse_unsigned_long (name, &value, allow_zero))
++    return false;
++  if (value > INT_MAX)
++    {
++      gomp_error ("Invalid value for environment variable %s", name);
++      return false;
++    }
++  *pvalue = (int) value;
++  return true;
++}
++
+ /* Parse an unsigned long list environment variable.  Return true if one was
+    present and it was successfully parsed.  */
+ 
+@@ -273,6 +300,416 @@ parse_unsigned_long_list (const char *na
+   return false;
+ }
+ 
++/* Parse environment variable set to a boolean or list of omp_proc_bind_t
++   enum values.  Return true if one was present and it was successfully
++   parsed.  */
++
++static bool
++parse_bind_var (const char *name, char *p1stvalue,
++		char **pvalues, unsigned long *pnvalues)
++{
++  char *env;
++  char value = omp_proc_bind_false, *values = NULL;
++  int i;
++  static struct proc_bind_kinds
++  {
++    const char name[7];
++    const char len;
++    omp_proc_bind_t kind;
++  } kinds[] =
++  {
++    { "false", 5, omp_proc_bind_false },
++    { "true", 4, omp_proc_bind_true },
++    { "master", 6, omp_proc_bind_master },
++    { "close", 5, omp_proc_bind_close },
++    { "spread", 6, omp_proc_bind_spread }
++  };
++
++  env = getenv (name);
++  if (env == NULL)
++    return false;
++
++  while (isspace ((unsigned char) *env))
++    ++env;
++  if (*env == '\0')
++    goto invalid;
++
++  for (i = 0; i < 5; i++)
++    if (strncasecmp (env, kinds[i].name, kinds[i].len) == 0)
++      {
++	value = kinds[i].kind;
++	env += kinds[i].len;
++	break;
++      }
++  if (i == 5)
++    goto invalid;
++
++  while (isspace ((unsigned char) *env))
++    ++env;
++  if (*env != '\0')
++    {
++      if (*env == ',')
++	{
++	  unsigned long nvalues = 0, nalloced = 0;
++
++	  if (value == omp_proc_bind_false
++	      || value == omp_proc_bind_true)
++	    goto invalid;
++
++	  do
++	    {
++	      env++;
++	      if (nvalues == nalloced)
++		{
++		  char *n;
++		  nalloced = nalloced ? nalloced * 2 : 16;
++		  n = realloc (values, nalloced);
++		  if (n == NULL)
++		    {
++		      free (values);
++		      gomp_error ("Out of memory while trying to parse"
++				  " environment variable %s", name);
++		      return false;
++		    }
++		  values = n;
++		  if (nvalues == 0)
++		    values[nvalues++] = value;
++		}
++
++	      while (isspace ((unsigned char) *env))
++		++env;
++	      if (*env == '\0')
++		goto invalid;
++
++	      for (i = 2; i < 5; i++)
++		if (strncasecmp (env, kinds[i].name, kinds[i].len) == 0)
++		  {
++		    value = kinds[i].kind;
++		    env += kinds[i].len;
++		    break;
++		  }
++	      if (i == 5)
++		goto invalid;
++
++	      values[nvalues++] = value;
++	      while (isspace ((unsigned char) *env))
++		++env;
++	      if (*env == '\0')
++		break;
++	      if (*env != ',')
++		goto invalid;
++	    }
++	  while (1);
++	  *p1stvalue = values[0];
++	  *pvalues = values;
++	  *pnvalues = nvalues;
++	  return true;
++	}
++      goto invalid;
++    }
++
++  *p1stvalue = value;
++  return true;
++
++ invalid:
++  free (values);
++  gomp_error ("Invalid value for environment variable %s", name);
++  return false;
++}
++
++static bool
++parse_one_place (char **envp, bool *negatep, unsigned long *lenp,
++		 long *stridep)
++{
++  char *env = *envp, *start;
++  void *p = gomp_places_list ? gomp_places_list[gomp_places_list_len] : NULL;
++  unsigned long len = 1;
++  long stride = 1;
++  int pass;
++  bool any_negate = false;
++  *negatep = false;
++  while (isspace ((unsigned char) *env))
++    ++env;
++  if (*env == '!')
++    {
++      *negatep = true;
++      ++env;
++      while (isspace ((unsigned char) *env))
++	++env;
++    }
++  if (*env != '{')
++    return false;
++  ++env;
++  while (isspace ((unsigned char) *env))
++    ++env;
++  start = env;
++  for (pass = 0; pass < (any_negate ? 2 : 1); pass++)
++    {
++      env = start;
++      do
++	{
++	  unsigned long this_num, this_len = 1;
++	  long this_stride = 1;
++	  bool this_negate = (*env == '!');
++	  if (this_negate)
++	    {
++	      if (gomp_places_list)
++		any_negate = true;
++	      ++env;
++	      while (isspace ((unsigned char) *env))
++		++env;
++	    }
++
++	  errno = 0;
++	  this_num = strtoul (env, &env, 10);
++	  if (errno)
++	    return false;
++	  while (isspace ((unsigned char) *env))
++	    ++env;
++	  if (*env == ':')
++	    {
++	      ++env;
++	      while (isspace ((unsigned char) *env))
++		++env;
++	      errno = 0;
++	      this_len = strtoul (env, &env, 10);
++	      if (errno || this_len == 0)
++		return false;
++	      while (isspace ((unsigned char) *env))
++		++env;
++	      if (*env == ':')
++		{
++		  ++env;
++		  while (isspace ((unsigned char) *env))
++		    ++env;
++		  errno = 0;
++		  this_stride = strtol (env, &env, 10);
++		  if (errno)
++		    return false;
++		  while (isspace ((unsigned char) *env))
++		    ++env;
++		}
++	    }
++	  if (this_negate && this_len != 1)
++	    return false;
++	  if (gomp_places_list && pass == this_negate)
++	    {
++	      if (this_negate)
++		{
++		  if (!gomp_affinity_remove_cpu (p, this_num))
++		    return false;
++		}
++	      else if (!gomp_affinity_add_cpus (p, this_num, this_len,
++						this_stride, false))
++		return false;
++	    }
++	  if (*env == '}')
++	    break;
++	  if (*env != ',')
++	    return false;
++	  ++env;
++	}
++      while (1);
++    }
++
++  ++env;
++  while (isspace ((unsigned char) *env))
++    ++env;
++  if (*env == ':')
++    {
++      ++env;
++      while (isspace ((unsigned char) *env))
++	++env;
++      errno = 0;
++      len = strtoul (env, &env, 10);
++      if (errno || len == 0 || len >= 65536)
++	return false;
++      while (isspace ((unsigned char) *env))
++	++env;
++      if (*env == ':')
++	{
++	  ++env;
++	  while (isspace ((unsigned char) *env))
++	    ++env;
++	  errno = 0;
++	  stride = strtol (env, &env, 10);
++	  if (errno)
++	    return false;
++	  while (isspace ((unsigned char) *env))
++	    ++env;
++	}
++    }
++  if (*negatep && len != 1)
++    return false;
++  *envp = env;
++  *lenp = len;
++  *stridep = stride;
++  return true;
++}
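++
++/* Illustrative note (not part of the upstream change): a single place as
++   parsed above has the form "[!]{ num[:len[:stride]], ... }[:len[:stride]]";
++   e.g. "{0:4}" is the four CPUs 0-3, and "{0}:8:2" expands to the places
++   {0},{2},...,{14}.  A '!' before the braces asks for an identical place
++   to be removed from the list, while '!' before a number inside the braces
++   removes that CPU from the place being built.  */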
++
++static bool
++parse_places_var (const char *name, bool ignore)
++{
++  char *env = getenv (name), *end;
++  bool any_negate = false;
++  int level = 0;
++  unsigned long count = 0;
++  if (env == NULL)
++    return false;
++
++  while (isspace ((unsigned char) *env))
++    ++env;
++  if (*env == '\0')
++    goto invalid;
++
++  if (strncasecmp (env, "threads", 7) == 0)
++    {
++      env += 7;
++      level = 1;
++    }
++  else if (strncasecmp (env, "cores", 5) == 0)
++    {
++      env += 5;
++      level = 2;
++    }
++  else if (strncasecmp (env, "sockets", 7) == 0)
++    {
++      env += 7;
++      level = 3;
++    }
++  if (level)
++    {
++      count = ULONG_MAX;
++      while (isspace ((unsigned char) *env))
++	++env;
++      if (*env != '\0')
++	{
++	  if (*env++ != '(')
++	    goto invalid;
++	  while (isspace ((unsigned char) *env))
++	    ++env;
++
++	  errno = 0;
++	  count = strtoul (env, &end, 10);
++	  if (errno)
++	    goto invalid;
++	  env = end;
++	  while (isspace ((unsigned char) *env))
++	    ++env;
++	  if (*env != ')')
++	    goto invalid;
++	  ++env;
++	  while (isspace ((unsigned char) *env))
++	    ++env;
++	  if (*env != '\0')
++	    goto invalid;
++	}
++
++      if (ignore)
++	return false;
++
++      return gomp_affinity_init_level (level, count, false);
++    }
++
++  count = 0;
++  end = env;
++  do
++    {
++      bool negate;
++      unsigned long len;
++      long stride;
++      if (!parse_one_place (&end, &negate, &len, &stride))
++	goto invalid;
++      if (negate)
++	{
++	  if (!any_negate)
++	    count++;
++	  any_negate = true;
++	}
++      else
++	count += len;
++      if (count > 65536)
++	goto invalid;
++      if (*end == '\0')
++	break;
++      if (*end != ',')
++	goto invalid;
++      end++;
++    }
++  while (1);
++
++  if (ignore)
++    return false;
++
++  gomp_places_list_len = 0;
++  gomp_places_list = gomp_affinity_alloc (count, false);
++  if (gomp_places_list == NULL)
++    return false;
++
++  do
++    {
++      bool negate;
++      unsigned long len;
++      long stride;
++      gomp_affinity_init_place (gomp_places_list[gomp_places_list_len]);
++      if (!parse_one_place (&env, &negate, &len, &stride))
++	goto invalid;
++      if (negate)
++	{
++	  void *p;
++	  for (count = 0; count < gomp_places_list_len; count++)
++	    if (gomp_affinity_same_place
++			(gomp_places_list[count],
++			 gomp_places_list[gomp_places_list_len]))
++	      break;
++	  if (count == gomp_places_list_len)
++	    {
++	      gomp_error ("Trying to remove a non-existing place from list "
++			  "of places");
++	      goto invalid;
++	    }
++	  p = gomp_places_list[count];
++	  memmove (&gomp_places_list[count],
++		   &gomp_places_list[count + 1],
++		   (gomp_places_list_len - count - 1) * sizeof (void *));
++	  --gomp_places_list_len;
++	  gomp_places_list[gomp_places_list_len] = p;
++	}
++      else if (len == 1)
++	++gomp_places_list_len;
++      else
++	{
++	  for (count = 0; count < len - 1; count++)
++	    if (!gomp_affinity_copy_place
++			(gomp_places_list[gomp_places_list_len + count + 1],
++			 gomp_places_list[gomp_places_list_len + count],
++			 stride))
++	      goto invalid;
++	  gomp_places_list_len += len;
++	}
++      if (*env == '\0')
++	break;
++      env++;
++    }
++  while (1);
++
++  if (gomp_places_list_len == 0)
++    {
++      gomp_error ("All places have been removed");
++      goto invalid;
++    }
++  if (!gomp_affinity_finalize_place_list (false))
++    goto invalid;
++  return true;
++
++ invalid:
++  free (gomp_places_list);
++  gomp_places_list = NULL;
++  gomp_places_list_len = 0;
++  gomp_error ("Invalid value for environment variable %s", name);
++  return false;
++}
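++
++/* Illustrative examples (not part of the upstream change): values accepted
++   by the parser above include the abstract names OMP_PLACES=threads,
++   OMP_PLACES="cores(4)" and OMP_PLACES="sockets(2)", as well as explicit
++   lists such as OMP_PLACES="{0,1},{2,3}" or OMP_PLACES="{0}:8:2".  */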
++
+ /* Parse the OMP_STACKSIZE environment varible.  Return true if one was
+    present and it was successfully parsed.  */
+ 
+@@ -478,86 +915,95 @@ parse_wait_policy (void)
+    present and it was successfully parsed.  */
+ 
+ static bool
+-parse_affinity (void)
++parse_affinity (bool ignore)
+ {
+-  char *env, *end;
++  char *env, *end, *start;
++  int pass;
+   unsigned long cpu_beg, cpu_end, cpu_stride;
+-  unsigned short *cpus = NULL;
+-  size_t allocated = 0, used = 0, needed;
++  size_t count = 0, needed;
+ 
+   env = getenv ("GOMP_CPU_AFFINITY");
+   if (env == NULL)
+     return false;
+ 
+-  do
++  start = env;
++  for (pass = 0; pass < 2; pass++)
+     {
+-      while (*env == ' ' || *env == '\t')
+-	env++;
+-
+-      cpu_beg = strtoul (env, &end, 0);
+-      cpu_end = cpu_beg;
+-      cpu_stride = 1;
+-      if (env == end || cpu_beg >= 65536)
+-	goto invalid;
++      env = start;
++      if (pass == 1)
++	{
++	  if (ignore)
++	    return false;
+ 
+-      env = end;
+-      if (*env == '-')
++	  gomp_places_list_len = 0;
++	  gomp_places_list = gomp_affinity_alloc (count, true);
++	  if (gomp_places_list == NULL)
++	    return false;
++	}
++      do
+ 	{
+-	  cpu_end = strtoul (++env, &end, 0);
+-	  if (env == end || cpu_end >= 65536 || cpu_end < cpu_beg)
++	  while (isspace ((unsigned char) *env))
++	    ++env;
++
++	  errno = 0;
++	  cpu_beg = strtoul (env, &end, 0);
++	  if (errno || cpu_beg >= 65536)
+ 	    goto invalid;
++	  cpu_end = cpu_beg;
++	  cpu_stride = 1;
+ 
+ 	  env = end;
+-	  if (*env == ':')
++	  if (*env == '-')
+ 	    {
+-	      cpu_stride = strtoul (++env, &end, 0);
+-	      if (env == end || cpu_stride == 0 || cpu_stride >= 65536)
++	      errno = 0;
++	      cpu_end = strtoul (++env, &end, 0);
++	      if (errno || cpu_end >= 65536 || cpu_end < cpu_beg)
+ 		goto invalid;
+ 
+ 	      env = end;
+-	    }
+-	}
++	      if (*env == ':')
++		{
++		  errno = 0;
++		  cpu_stride = strtoul (++env, &end, 0);
++		  if (errno || cpu_stride == 0 || cpu_stride >= 65536)
++		    goto invalid;
+ 
+-      needed = (cpu_end - cpu_beg) / cpu_stride + 1;
+-      if (used + needed >= allocated)
+-	{
+-	  unsigned short *new_cpus;
++		  env = end;
++		}
++	    }
+ 
+-	  if (allocated < 64)
+-	    allocated = 64;
+-	  if (allocated > needed)
+-	    allocated <<= 1;
++	  needed = (cpu_end - cpu_beg) / cpu_stride + 1;
++	  if (pass == 0)
++	    count += needed;
+ 	  else
+-	    allocated += 2 * needed;
+-	  new_cpus = realloc (cpus, allocated * sizeof (unsigned short));
+-	  if (new_cpus == NULL)
+ 	    {
+-	      free (cpus);
+-	      gomp_error ("not enough memory to store GOMP_CPU_AFFINITY list");
+-	      return false;
++	      while (needed--)
++		{
++		  void *p = gomp_places_list[gomp_places_list_len];
++		  gomp_affinity_init_place (p);
++		  if (gomp_affinity_add_cpus (p, cpu_beg, 1, 0, true))
++		    ++gomp_places_list_len;
++		  cpu_beg += cpu_stride;
++		}
+ 	    }
+ 
+-	  cpus = new_cpus;
+-	}
++	  while (isspace ((unsigned char) *env))
++	    ++env;
+ 
+-      while (needed--)
+-	{
+-	  cpus[used++] = cpu_beg;
+-	  cpu_beg += cpu_stride;
++	  if (*env == ',')
++	    env++;
++	  else if (*env == '\0')
++	    break;
+ 	}
+-
+-      while (*env == ' ' || *env == '\t')
+-	env++;
+-
+-      if (*env == ',')
+-	env++;
+-      else if (*env == '\0')
+-	break;
++      while (1);
+     }
+-  while (1);
+ 
+-  gomp_cpu_affinity = cpus;
+-  gomp_cpu_affinity_len = used;
++  if (gomp_places_list_len == 0)
++    {
++      free (gomp_places_list);
++      gomp_places_list = NULL;
++      return false;
++    }
+   return true;
+ 
+  invalid:
+@@ -565,12 +1011,160 @@ parse_affinity (void)
+   return false;
+ }
+ 
++
++static void
++handle_omp_display_env (unsigned long stacksize, int wait_policy)
++{
++  const char *env;
++  bool display = false;
++  bool verbose = false;
++  int i;
++
++  env = getenv ("OMP_DISPLAY_ENV");
++  if (env == NULL)
++    return;
++
++  while (isspace ((unsigned char) *env))
++    ++env;
++  if (strncasecmp (env, "true", 4) == 0)
++    {
++      display = true;
++      env += 4;
++    }
++  else if (strncasecmp (env, "false", 5) == 0)
++    {
++      display = false;
++      env += 5;
++    }
++  else if (strncasecmp (env, "verbose", 7) == 0)
++    {
++      display = true;
++      verbose = true;
++      env += 7;
++    }
++  else
++    env = "X";
++  while (isspace ((unsigned char) *env))
++    ++env;
++  if (*env != '\0')
++    gomp_error ("Invalid value for environment variable OMP_DISPLAY_ENV");
++
++  if (!display)
++    return;
++
++  fputs ("\nOPENMP DISPLAY ENVIRONMENT BEGIN\n", stderr);
++
++  fputs ("  _OPENMP = '201307'\n", stderr);
++  fprintf (stderr, "  OMP_DYNAMIC = '%s'\n",
++	   gomp_global_icv.dyn_var ? "TRUE" : "FALSE");
++  fprintf (stderr, "  OMP_NESTED = '%s'\n",
++	   gomp_global_icv.nest_var ? "TRUE" : "FALSE");
++
++  fprintf (stderr, "  OMP_NUM_THREADS = '%lu", gomp_global_icv.nthreads_var);
++  for (i = 1; i < gomp_nthreads_var_list_len; i++)
++    fprintf (stderr, ",%lu", gomp_nthreads_var_list[i]);
++  fputs ("'\n", stderr);
++
++  fprintf (stderr, "  OMP_SCHEDULE = '");
++  switch (gomp_global_icv.run_sched_var)
++    {
++    case GFS_RUNTIME:
++      fputs ("RUNTIME", stderr);
++      break;
++    case GFS_STATIC:
++      fputs ("STATIC", stderr);
++      break;
++    case GFS_DYNAMIC:
++      fputs ("DYNAMIC", stderr);
++      break;
++    case GFS_GUIDED:
++      fputs ("GUIDED", stderr);
++      break;
++    case GFS_AUTO:
++      fputs ("AUTO", stderr);
++      break;
++    }
++  fputs ("'\n", stderr);
++
++  fputs ("  OMP_PROC_BIND = '", stderr);
++  switch (gomp_global_icv.bind_var)
++    {
++    case omp_proc_bind_false:
++      fputs ("FALSE", stderr);
++      break;
++    case omp_proc_bind_true:
++      fputs ("TRUE", stderr);
++      break;
++    case omp_proc_bind_master:
++      fputs ("MASTER", stderr);
++      break;
++    case omp_proc_bind_close:
++      fputs ("CLOSE", stderr);
++      break;
++    case omp_proc_bind_spread:
++      fputs ("SPREAD", stderr);
++      break;
++    }
++  for (i = 1; i < gomp_bind_var_list_len; i++)
++    switch (gomp_bind_var_list[i])
++      {
++      case omp_proc_bind_master:
++	fputs (",MASTER", stderr);
++	break;
++      case omp_proc_bind_close:
++	fputs (",CLOSE", stderr);
++	break;
++      case omp_proc_bind_spread:
++	fputs (",SPREAD", stderr);
++	break;
++      }
++  fputs ("'\n", stderr);
++  fputs ("  OMP_PLACES = '", stderr);
++  for (i = 0; i < gomp_places_list_len; i++)
++    {
++      fputs ("{", stderr);
++      gomp_affinity_print_place (gomp_places_list[i]);
++      fputs (i + 1 == gomp_places_list_len ? "}" : "},", stderr);
++    }
++  fputs ("'\n", stderr);
++
++  fprintf (stderr, "  OMP_STACKSIZE = '%lu'\n", stacksize);
++
++  /* GOMP's default value is actually neither active nor passive.  */
++  fprintf (stderr, "  OMP_WAIT_POLICY = '%s'\n",
++	   wait_policy > 0 ? "ACTIVE" : "PASSIVE");
++  fprintf (stderr, "  OMP_THREAD_LIMIT = '%u'\n",
++	   gomp_global_icv.thread_limit_var);
++  fprintf (stderr, "  OMP_MAX_ACTIVE_LEVELS = '%lu'\n",
++	   gomp_max_active_levels_var);
++
++  fprintf (stderr, "  OMP_CANCELLATION = '%s'\n",
++	   gomp_cancel_var ? "TRUE" : "FALSE");
++  fprintf (stderr, "  OMP_DEFAULT_DEVICE = '%d'\n",
++	   gomp_global_icv.default_device_var);
++
++  if (verbose)
++    {
++      fputs ("  GOMP_CPU_AFFINITY = ''\n", stderr);
++      fprintf (stderr, "  GOMP_STACKSIZE = '%lu'\n", stacksize);
++#ifdef HAVE_INTTYPES_H
++      fprintf (stderr, "  GOMP_SPINCOUNT = '%"PRIu64"'\n",
++	       (uint64_t) gomp_spin_count_var);
++#else
++      fprintf (stderr, "  GOMP_SPINCOUNT = '%lu'\n",
++	       (unsigned long) gomp_spin_count_var);
++#endif
++    }
++
++  fputs ("OPENMP DISPLAY ENVIRONMENT END\n", stderr);
++}
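++
++/* Usage note (not part of the upstream change): initialize_env below runs
++   as a constructor, so setting OMP_DISPLAY_ENV=true (or =verbose) in the
++   environment makes any program linked against libgomp print the block
++   above to stderr once at startup.  */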
++
++
+ static void __attribute__((constructor))
+ initialize_env (void)
+ {
+-  unsigned long stacksize;
++  unsigned long thread_limit_var, stacksize;
+   int wait_policy;
+-  bool bind_var = false;
+ 
+   /* Do a compile time check that mkomp_h.pl did good job.  */
+   omp_check_defines ();
+@@ -578,14 +1172,17 @@ initialize_env (void)
+   parse_schedule ();
+   parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var);
+   parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var);
+-  parse_boolean ("OMP_PROC_BIND", &bind_var);
++  parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var);
++  parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true);
+   parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var,
+ 		       true);
+-  parse_unsigned_long ("OMP_THREAD_LIMIT", &gomp_thread_limit_var, false);
+-  if (gomp_thread_limit_var != ULONG_MAX)
+-    gomp_remaining_threads_count = gomp_thread_limit_var - 1;
++  if (parse_unsigned_long ("OMP_THREAD_LIMIT", &thread_limit_var, false))
++    {
++      gomp_global_icv.thread_limit_var
++	= thread_limit_var > INT_MAX ? UINT_MAX : thread_limit_var;
++    }
+ #ifndef HAVE_SYNC_BUILTINS
+-  gomp_mutex_init (&gomp_remaining_threads_lock);
++  gomp_mutex_init (&gomp_managed_threads_lock);
+ #endif
+   gomp_init_num_threads ();
+   gomp_available_cpus = gomp_global_icv.nthreads_var;
+@@ -594,7 +1191,34 @@ initialize_env (void)
+ 				 &gomp_nthreads_var_list,
+ 				 &gomp_nthreads_var_list_len))
+     gomp_global_icv.nthreads_var = gomp_available_cpus;
+-  if (parse_affinity () || bind_var)
++  bool ignore = false;
++  if (parse_bind_var ("OMP_PROC_BIND",
++		      &gomp_global_icv.bind_var,
++		      &gomp_bind_var_list,
++		      &gomp_bind_var_list_len)
++      && gomp_global_icv.bind_var == omp_proc_bind_false)
++    ignore = true;
++  /* Make sure OMP_PLACES and GOMP_CPU_AFFINITY env vars are always
++     parsed if present in the environment.  If OMP_PROC_BIND was set
++     explictly to false, don't populate places list though.  If places
++     list was successfully set from OMP_PLACES, only parse but don't process
++     GOMP_CPU_AFFINITY.  If OMP_PROC_BIND was not set in the environment,
++     default to OMP_PROC_BIND=true if OMP_PLACES or GOMP_CPU_AFFINITY
++     was successfully parsed into a places list, otherwise to
++     OMP_PROC_BIND=false.  */
++  if (parse_places_var ("OMP_PLACES", ignore))
++    {
++      if (gomp_global_icv.bind_var == omp_proc_bind_false)
++	gomp_global_icv.bind_var = true;
++      ignore = true;
++    }
++  if (parse_affinity (ignore))
++    {
++      if (gomp_global_icv.bind_var == omp_proc_bind_false)
++	gomp_global_icv.bind_var = true;
++      ignore = true;
++    }
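++  /* Illustrative examples (comment only): OMP_PLACES=cores with
++     OMP_PROC_BIND unset behaves as if OMP_PROC_BIND=true had been given,
++     while OMP_PROC_BIND=false causes OMP_PLACES and GOMP_CPU_AFFINITY to
++     be parsed only for error checking, leaving no places list behind.  */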
++  if (gomp_global_icv.bind_var != omp_proc_bind_false)
+     gomp_init_affinity ();
+   wait_policy = parse_wait_policy ();
+   if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
+@@ -645,6 +1269,8 @@ initialize_env (void)
+       if (err != 0)
+ 	gomp_error ("Stack size change failed: %s", strerror (err));
+     }
++
++  handle_omp_display_env (stacksize, wait_policy);
+ }
+ 
+ 
+@@ -728,7 +1354,8 @@ omp_get_max_threads (void)
+ int
+ omp_get_thread_limit (void)
+ {
+-  return gomp_thread_limit_var > INT_MAX ? INT_MAX : gomp_thread_limit_var;
++  struct gomp_task_icv *icv = gomp_icv (false);
++  return icv->thread_limit_var > INT_MAX ? INT_MAX : icv->thread_limit_var;
+ }
+ 
+ void
+@@ -744,6 +1371,60 @@ omp_get_max_active_levels (void)
+   return gomp_max_active_levels_var;
+ }
+ 
++int
++omp_get_cancellation (void)
++{
++  return gomp_cancel_var;
++}
++
++omp_proc_bind_t
++omp_get_proc_bind (void)
++{
++  struct gomp_task_icv *icv = gomp_icv (false);
++  return icv->bind_var;
++}
++
++void
++omp_set_default_device (int device_num)
++{
++  struct gomp_task_icv *icv = gomp_icv (true);
++  icv->default_device_var = device_num >= 0 ? device_num : 0;
++}
++
++int
++omp_get_default_device (void)
++{
++  struct gomp_task_icv *icv = gomp_icv (false);
++  return icv->default_device_var;
++}
++
++int
++omp_get_num_devices (void)
++{
++  return gomp_get_num_devices ();
++}
++
++int
++omp_get_num_teams (void)
++{
++  /* Hardcoded to 1 on host, MIC, HSAIL?  Maybe variable on PTX.  */
++  return 1;
++}
++
++int
++omp_get_team_num (void)
++{
++  /* Hardcoded to 0 on host, MIC, HSAIL?  Maybe variable on PTX.  */
++  return 0;
++}
++
++int
++omp_is_initial_device (void)
++{
++  /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX.  */
++  return 1;
++}
++
+ ialias (omp_set_dynamic)
+ ialias (omp_set_nested)
+ ialias (omp_set_num_threads)
+@@ -755,3 +1436,11 @@ ialias (omp_get_max_threads)
+ ialias (omp_get_thread_limit)
+ ialias (omp_set_max_active_levels)
+ ialias (omp_get_max_active_levels)
++ialias (omp_get_cancellation)
++ialias (omp_get_proc_bind)
++ialias (omp_set_default_device)
++ialias (omp_get_default_device)
++ialias (omp_get_num_devices)
++ialias (omp_get_num_teams)
++ialias (omp_get_team_num)
++ialias (omp_is_initial_device)
+--- libgomp/libgomp.h	(revision 210461)
++++ libgomp/libgomp.h	(revision 213654)
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005-2013 Free Software Foundation, Inc.
++/* Copyright (C) 2005-2014 Free Software Foundation, Inc.
+    Contributed by Richard Henderson <rth@redhat.com>.
+ 
+    This file is part of the GNU OpenMP Library (libgomp).
+@@ -39,6 +39,7 @@
+ 
+ #include <pthread.h>
+ #include <stdbool.h>
++#include <stdlib.h>
+ 
+ #ifdef HAVE_ATTRIBUTE_VISIBILITY
+ # pragma GCC visibility push(hidden)
+@@ -201,6 +202,10 @@ struct gomp_team_state
+   /* Active nesting level.  Only active parallel regions are counted.  */
+   unsigned active_level;
+ 
++  /* Place-partition-var, offset and length into gomp_places_list array.  */
++  unsigned place_partition_off;
++  unsigned place_partition_len;
++
+ #ifdef HAVE_SYNC_BUILTINS
+   /* Number of single stmts encountered.  */
+   unsigned long single_count;
+@@ -214,30 +219,40 @@ struct gomp_team_state
+   unsigned long static_trip;
+ };
+ 
+-/* These are the OpenMP 3.0 Internal Control Variables described in
++struct target_mem_desc;
++
++/* These are the OpenMP 4.0 Internal Control Variables described in
+    section 2.3.1.  Those described as having one copy per task are
+    stored within the structure; those described as having one copy
+    for the whole program are (naturally) global variables.  */
+-
++   
+ struct gomp_task_icv
+ {
+   unsigned long nthreads_var;
+   enum gomp_schedule_type run_sched_var;
+   int run_sched_modifier;
++  int default_device_var;
++  unsigned int thread_limit_var;
+   bool dyn_var;
+   bool nest_var;
++  char bind_var;
++  /* Internal ICV.  */
++  struct target_mem_desc *target_data;
+ };
+ 
+ extern struct gomp_task_icv gomp_global_icv;
+-extern unsigned long gomp_thread_limit_var;
+-extern unsigned long gomp_remaining_threads_count;
+ #ifndef HAVE_SYNC_BUILTINS
+-extern gomp_mutex_t gomp_remaining_threads_lock;
++extern gomp_mutex_t gomp_managed_threads_lock;
+ #endif
+ extern unsigned long gomp_max_active_levels_var;
++extern bool gomp_cancel_var;
+ extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
+ extern unsigned long gomp_available_cpus, gomp_managed_threads;
+ extern unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
++extern char *gomp_bind_var_list;
++extern unsigned long gomp_bind_var_list_len;
++extern void **gomp_places_list;
++extern unsigned long gomp_places_list_len;
+ 
+ enum gomp_task_kind
+ {
+@@ -247,6 +262,39 @@ enum gomp_task_kind
+   GOMP_TASK_TIED
+ };
+ 
++struct gomp_task;
++struct gomp_taskgroup;
++struct htab;
++
++struct gomp_task_depend_entry
++{
++  void *addr;
++  struct gomp_task_depend_entry *next;
++  struct gomp_task_depend_entry *prev;
++  struct gomp_task *task;
++  bool is_in;
++  bool redundant;
++  bool redundant_out;
++};
++
++struct gomp_dependers_vec
++{
++  size_t n_elem;
++  size_t allocated;
++  struct gomp_task *elem[];
++};
++
++/* Used when in GOMP_taskwait or in gomp_task_maybe_wait_for_dependencies.  */
++
++struct gomp_taskwait
++{
++  bool in_taskwait;
++  bool in_depend_wait;
++  size_t n_depend;
++  struct gomp_task *last_parent_depends_on;
++  gomp_sem_t taskwait_sem;
++};
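++
++/* A gomp_taskwait lives on the waiting task's stack in GOMP_taskwait and
++   in gomp_task_maybe_wait_for_dependencies and is published through
++   task->taskwait for the duration of the wait.  */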
++
+ /* This structure describes a "task" to be run by a thread.  */
+ 
+ struct gomp_task
+@@ -257,14 +305,33 @@ struct gomp_task
+   struct gomp_task *prev_child;
+   struct gomp_task *next_queue;
+   struct gomp_task *prev_queue;
++  struct gomp_task *next_taskgroup;
++  struct gomp_task *prev_taskgroup;
++  struct gomp_taskgroup *taskgroup;
++  struct gomp_dependers_vec *dependers;
++  struct htab *depend_hash;
++  struct gomp_taskwait *taskwait;
++  size_t depend_count;
++  size_t num_dependees;
+   struct gomp_task_icv icv;
+   void (*fn) (void *);
+   void *fn_data;
+   enum gomp_task_kind kind;
+-  bool in_taskwait;
+   bool in_tied_task;
+   bool final_task;
+-  gomp_sem_t taskwait_sem;
++  bool copy_ctors_done;
++  bool parent_depends_on;
++  struct gomp_task_depend_entry depend[];
++};
++
++struct gomp_taskgroup
++{
++  struct gomp_taskgroup *prev;
++  struct gomp_task *children;
++  bool in_taskgroup_wait;
++  bool cancelled;
++  gomp_sem_t taskgroup_sem;
++  size_t num_children;
+ };
+ 
+ /* This structure describes a "team" of threads.  These are the threads
+@@ -293,6 +360,12 @@ struct gomp_team
+      of the threads in the team.  */
+   gomp_sem_t **ordered_release;
+ 
++  /* List of work shares on which gomp_fini_work_share hasn't been
++     called yet.  If the team hasn't been cancelled, this should be
++     equal to each thr->ts.work_share, but otherwise it can be a possibly
++     long list of workshares.  */
++  struct gomp_work_share *work_shares_to_free;
++
+   /* List of gomp_work_share structs chained through next_free fields.
+      This is populated and taken off only by the first thread in the
+      team encountering a new work sharing construct, in a critical
+@@ -324,8 +397,20 @@ struct gomp_team
+ 
+   gomp_mutex_t task_lock;
+   struct gomp_task *task_queue;
+-  int task_count;
+-  int task_running_count;
++  /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team.  */
++  unsigned int task_count;
++  /* Number of GOMP_TASK_WAITING tasks currently waiting to be scheduled.  */
++  unsigned int task_queued_count;
++  /* Number of GOMP_TASK_{WAITING,TIED} tasks currently running
++     directly in gomp_barrier_handle_tasks; tasks spawned
++     from e.g. GOMP_taskwait or GOMP_taskgroup_end don't count, even when
++     that is called from a task run from gomp_barrier_handle_tasks.
++     task_running_count should always be <= team->nthreads,
++     and if the current task isn't in_tied_task, then it will be
++     even < team->nthreads.  */
++  unsigned int task_running_count;
++  int work_share_cancelled;
++  int team_cancelled;
+ 
+   /* This array contains structures for implicit tasks.  */
+   struct gomp_task implicit_task[];
+@@ -350,7 +435,11 @@ struct gomp_thread
+   /* This semaphore is used for ordered loops.  */
+   gomp_sem_t release;
+ 
+-  /* user pthread thread pool */
++  /* Place this thread is bound to plus one, or zero if not bound
++     to any place.  */
++  unsigned int place;
++
++  /* User pthread thread pool */
+   struct gomp_thread_pool *thread_pool;
+ };
+ 
+@@ -363,11 +452,23 @@ struct gomp_thread_pool
+   unsigned threads_size;
+   unsigned threads_used;
+   struct gomp_team *last_team;
++  /* Number of threads running in this contention group.  */
++  unsigned long threads_busy;
+ 
+   /* This barrier holds and releases threads waiting in threads.  */
+   gomp_barrier_t threads_dock;
+ };
+ 
++enum gomp_cancel_kind
++{
++  GOMP_CANCEL_PARALLEL = 1,
++  GOMP_CANCEL_LOOP = 2,
++  GOMP_CANCEL_FOR = GOMP_CANCEL_LOOP,
++  GOMP_CANCEL_DO = GOMP_CANCEL_LOOP,
++  GOMP_CANCEL_SECTIONS = 4,
++  GOMP_CANCEL_TASKGROUP = 8
++};
++
+ /* ... and here is that TLS data.  */
+ 
+ #ifdef HAVE_TLS
+@@ -402,17 +503,22 @@ static inline struct gomp_task_icv *gomp
+ /* The attributes to be used during thread creation.  */
+ extern pthread_attr_t gomp_thread_attr;
+ 
+-/* Other variables.  */
+-
+-extern unsigned short *gomp_cpu_affinity;
+-extern size_t gomp_cpu_affinity_len;
+-
+ /* Function prototypes.  */
+ 
+ /* affinity.c */
+ 
+ extern void gomp_init_affinity (void);
+-extern void gomp_init_thread_affinity (pthread_attr_t *);
++extern void gomp_init_thread_affinity (pthread_attr_t *, unsigned int);
++extern void **gomp_affinity_alloc (unsigned long, bool);
++extern void gomp_affinity_init_place (void *);
++extern bool gomp_affinity_add_cpus (void *, unsigned long, unsigned long,
++				    long, bool);
++extern bool gomp_affinity_remove_cpu (void *, unsigned long);
++extern bool gomp_affinity_copy_place (void *, void *, long);
++extern bool gomp_affinity_same_place (void *, void *);
++extern bool gomp_affinity_finalize_place_list (bool);
++extern bool gomp_affinity_init_level (int, unsigned long, bool);
++extern void gomp_affinity_print_place (void *);
+ 
+ /* alloc.c */
+ 
+@@ -486,15 +592,21 @@ extern void gomp_barrier_handle_tasks (g
+ static void inline
+ gomp_finish_task (struct gomp_task *task)
+ {
+-  gomp_sem_destroy (&task->taskwait_sem);
++  if (__builtin_expect (task->depend_hash != NULL, 0))
++    free (task->depend_hash);
+ }
+ 
+ /* team.c */
+ 
+ extern struct gomp_team *gomp_new_team (unsigned);
+ extern void gomp_team_start (void (*) (void *), void *, unsigned,
+-			     struct gomp_team *);
++			     unsigned, struct gomp_team *);
+ extern void gomp_team_end (void);
++extern void gomp_free_thread (void *);
++
++/* target.c */
++
++extern int gomp_get_num_devices (void);
+ 
+ /* work.c */
+ 
+@@ -502,6 +614,7 @@ extern void gomp_init_work_share (struct
+ extern void gomp_fini_work_share (struct gomp_work_share *);
+ extern bool gomp_work_share_start (bool);
+ extern void gomp_work_share_end (void);
++extern bool gomp_work_share_end_cancel (void);
+ extern void gomp_work_share_end_nowait (void);
+ 
+ static inline void
+@@ -524,6 +637,26 @@ gomp_work_share_init_done (void)
+ #define _LIBGOMP_OMP_LOCK_DEFINED 1
+ #include "omp.h.in"
+ 
++typedef enum omp_proc_bind_t
++{
++  omp_proc_bind_false = 0,
++  omp_proc_bind_true = 1,
++  omp_proc_bind_master = 2,
++  omp_proc_bind_close = 3,
++  omp_proc_bind_spread = 4
++} omp_proc_bind_t;
++
++extern int omp_get_cancellation (void) __GOMP_NOTHROW;
++extern omp_proc_bind_t omp_get_proc_bind (void) __GOMP_NOTHROW;
++
++extern void omp_set_default_device (int) __GOMP_NOTHROW;
++extern int omp_get_default_device (void) __GOMP_NOTHROW;
++extern int omp_get_num_devices (void) __GOMP_NOTHROW;
++extern int omp_get_num_teams (void) __GOMP_NOTHROW;
++extern int omp_get_team_num (void) __GOMP_NOTHROW;
++
++extern int omp_is_initial_device (void) __GOMP_NOTHROW;
++
+ #if !defined (HAVE_ATTRIBUTE_VISIBILITY) \
+     || !defined (HAVE_ATTRIBUTE_ALIAS) \
+     || !defined (HAVE_AS_SYMVER_DIRECTIVE) \
+@@ -580,11 +713,19 @@ extern int gomp_test_nest_lock_25 (omp_n
+ #endif
+ 
+ #ifdef HAVE_ATTRIBUTE_ALIAS
++# define ialias_ulp	ialias_str1(__USER_LABEL_PREFIX__)
++# define ialias_str1(x)	ialias_str2(x)
++# define ialias_str2(x)	#x
+ # define ialias(fn) \
+   extern __typeof (fn) gomp_ialias_##fn \
+     __attribute__ ((alias (#fn))) attribute_hidden;
++# define ialias_redirect(fn) \
++  extern __typeof (fn) fn __asm__ (ialias_ulp "gomp_ialias_" #fn) attribute_hidden;
++# define ialias_call(fn) gomp_ialias_ ## fn
+ #else
+ # define ialias(fn)
++# define ialias_redirect(fn)
++# define ialias_call(fn) fn
+ #endif
+ 
+ #endif /* LIBGOMP_H */
+--- libgomp/task.c	(revision 210461)
++++ libgomp/task.c	(revision 213654)
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2007-2013 Free Software Foundation, Inc.
++/* Copyright (C) 2007-2014 Free Software Foundation, Inc.
+    Contributed by Richard Henderson <rth@redhat.com>.
+ 
+    This file is part of the GNU OpenMP Library (libgomp).
+@@ -29,6 +29,33 @@
+ #include <stdlib.h>
+ #include <string.h>
+ 
++typedef struct gomp_task_depend_entry *hash_entry_type;
++
++static inline void *
++htab_alloc (size_t size)
++{
++  return gomp_malloc (size);
++}
++
++static inline void
++htab_free (void *ptr)
++{
++  free (ptr);
++}
++
++#include "hashtab.h"
++
++static inline hashval_t
++htab_hash (hash_entry_type element)
++{
++  return hash_pointer (element->addr);
++}
++
++static inline bool
++htab_eq (hash_entry_type x, hash_entry_type y)
++{
++  return x->addr == y->addr;
++}
+ 
+ /* Create a new task data structure.  */
+ 
+@@ -39,11 +66,16 @@ gomp_init_task (struct gomp_task *task,
+   task->parent = parent_task;
+   task->icv = *prev_icv;
+   task->kind = GOMP_TASK_IMPLICIT;
+-  task->in_taskwait = false;
++  task->taskwait = NULL;
+   task->in_tied_task = false;
+   task->final_task = false;
++  task->copy_ctors_done = false;
++  task->parent_depends_on = false;
+   task->children = NULL;
+-  gomp_sem_init (&task->taskwait_sem, 0);
++  task->taskgroup = NULL;
++  task->dependers = NULL;
++  task->depend_hash = NULL;
++  task->depend_count = 0;
+ }
+ 
+ /* Clean up a task, after completing it.  */
+@@ -72,13 +104,16 @@ gomp_clear_parent (struct gomp_task *chi
+     while (task != children);
+ }
+ 
++static void gomp_task_maybe_wait_for_dependencies (void **depend);
++
+ /* Called when encountering an explicit task directive.  If IF_CLAUSE is
+    false, then we must not delay in executing the task.  If UNTIED is true,
+    then the task may be executed by any member of the team.  */
+ 
+ void
+ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
+-	   long arg_size, long arg_align, bool if_clause, unsigned flags)
++	   long arg_size, long arg_align, bool if_clause, unsigned flags,
++	   void **depend)
+ {
+   struct gomp_thread *thr = gomp_thread ();
+   struct gomp_team *team = thr->ts.team;
+@@ -94,17 +129,35 @@ GOMP_task (void (*fn) (void *), void *da
+     flags &= ~1;
+ #endif
+ 
++  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
++  if (team
++      && (gomp_team_barrier_cancelled (&team->barrier)
++	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
++    return;
++
+   if (!if_clause || team == NULL
+       || (thr->task && thr->task->final_task)
+       || team->task_count > 64 * team->nthreads)
+     {
+       struct gomp_task task;
+ 
++      /* If there are depend clauses and earlier deferred sibling tasks
++	 with depend clauses, check whether there is a dependency.  If there
++	 is, we need to wait for those tasks.  There is no need to handle
++	 depend clauses for non-deferred tasks other than this, because
++	 the parent task is suspended until the child task finishes and thus
++	 it can't start further child tasks.  */
++      if ((flags & 8) && thr->task && thr->task->depend_hash)
++	gomp_task_maybe_wait_for_dependencies (depend);
++
+       gomp_init_task (&task, thr->task, gomp_icv (false));
+       task.kind = GOMP_TASK_IFFALSE;
+       task.final_task = (thr->task && thr->task->final_task) || (flags & 2);
+       if (thr->task)
+-	task.in_tied_task = thr->task->in_tied_task;
++	{
++	  task.in_tied_task = thr->task->in_tied_task;
++	  task.taskgroup = thr->task->taskgroup;
++	}
+       thr->task = &task;
+       if (__builtin_expect (cpyfn != NULL, 0))
+ 	{
+@@ -137,27 +190,174 @@ GOMP_task (void (*fn) (void *), void *da
+     {
+       struct gomp_task *task;
+       struct gomp_task *parent = thr->task;
++      struct gomp_taskgroup *taskgroup = parent->taskgroup;
+       char *arg;
+       bool do_wake;
++      size_t depend_size = 0;
+ 
+-      task = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1);
+-      arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1)
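++      /* Layout of the DEPEND array as consumed below: depend[0] holds the
++	 total number of depend clauses, depend[1] the number of out/inout
++	 clauses (which come first), and depend[2 + i] the i-th address.  */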
++      if (flags & 8)
++	depend_size = ((uintptr_t) depend[0]
++		       * sizeof (struct gomp_task_depend_entry));
++      task = gomp_malloc (sizeof (*task) + depend_size
++			  + arg_size + arg_align - 1);
++      arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
+ 		      & ~(uintptr_t) (arg_align - 1));
+       gomp_init_task (task, parent, gomp_icv (false));
+       task->kind = GOMP_TASK_IFFALSE;
+       task->in_tied_task = parent->in_tied_task;
++      task->taskgroup = taskgroup;
+       thr->task = task;
+       if (cpyfn)
+-	cpyfn (arg, data);
++	{
++	  cpyfn (arg, data);
++	  task->copy_ctors_done = true;
++	}
+       else
+ 	memcpy (arg, data, arg_size);
+       thr->task = parent;
+       task->kind = GOMP_TASK_WAITING;
+       task->fn = fn;
+       task->fn_data = arg;
+-      task->in_tied_task = true;
+       task->final_task = (flags & 2) >> 1;
+       gomp_mutex_lock (&team->task_lock);
++      /* If parallel or taskgroup has been cancelled, don't start new
++	 tasks.  */
++      if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
++			     || (taskgroup && taskgroup->cancelled))
++			    && !task->copy_ctors_done, 0))
++	{
++	  gomp_mutex_unlock (&team->task_lock);
++	  gomp_finish_task (task);
++	  free (task);
++	  return;
++	}
++      if (taskgroup)
++	taskgroup->num_children++;
++      if (depend_size)
++	{
++	  size_t ndepend = (uintptr_t) depend[0];
++	  size_t nout = (uintptr_t) depend[1];
++	  size_t i;
++	  hash_entry_type ent;
++
++	  task->depend_count = ndepend;
++	  task->num_dependees = 0;
++	  if (parent->depend_hash == NULL)
++	    parent->depend_hash
++	      = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
++	  for (i = 0; i < ndepend; i++)
++	    {
++	      task->depend[i].addr = depend[2 + i];
++	      task->depend[i].next = NULL;
++	      task->depend[i].prev = NULL;
++	      task->depend[i].task = task;
++	      task->depend[i].is_in = i >= nout;
++	      task->depend[i].redundant = false;
++	      task->depend[i].redundant_out = false;
++
++	      hash_entry_type *slot
++		= htab_find_slot (&parent->depend_hash, &task->depend[i],
++				  INSERT);
++	      hash_entry_type out = NULL, last = NULL;
++	      if (*slot)
++		{
++		  /* If multiple depend clauses of this task name the
++		     same address, all but the first one are redundant.
++		     As inout/out come first, if any of them is
++		     inout/out, it will win, which is the right
++		     semantics.  */
++		  if ((*slot)->task == task)
++		    {
++		      task->depend[i].redundant = true;
++		      continue;
++		    }
++		  for (ent = *slot; ent; ent = ent->next)
++		    {
++		      if (ent->redundant_out)
++			break;
++
++		      last = ent;
++
++		      /* depend(in:...) doesn't depend on earlier
++			 depend(in:...).  */
++		      if (i >= nout && ent->is_in)
++			continue;
++
++		      if (!ent->is_in)
++			out = ent;
++
++		      struct gomp_task *tsk = ent->task;
++		      if (tsk->dependers == NULL)
++			{
++			  tsk->dependers
++			    = gomp_malloc (sizeof (struct gomp_dependers_vec)
++					   + 6 * sizeof (struct gomp_task *));
++			  tsk->dependers->n_elem = 1;
++			  tsk->dependers->allocated = 6;
++			  tsk->dependers->elem[0] = task;
++			  task->num_dependees++;
++			  continue;
++			}
++		      /* We already have some other dependency on tsk
++			 from earlier depend clause.  */
++		      else if (tsk->dependers->n_elem
++			       && (tsk->dependers->elem[tsk->dependers->n_elem
++							- 1]
++				   == task))
++			continue;
++		      else if (tsk->dependers->n_elem
++			       == tsk->dependers->allocated)
++			{
++			  tsk->dependers->allocated
++			    = tsk->dependers->allocated * 2 + 2;
++			  tsk->dependers
++			    = gomp_realloc (tsk->dependers,
++					    sizeof (struct gomp_dependers_vec)
++					    + (tsk->dependers->allocated
++					       * sizeof (struct gomp_task *)));
++			}
++		      tsk->dependers->elem[tsk->dependers->n_elem++] = task;
++		      task->num_dependees++;
++		    }
++		  task->depend[i].next = *slot;
++		  (*slot)->prev = &task->depend[i];
++		}
++	      *slot = &task->depend[i];
++
++	      /* There is no need to store more than one depend({,in}out:)
++		 task per address in the hash table chain for the purpose
++		 of creation of deferred tasks, because each out
++		 depends on all earlier outs, thus it is enough to record
++		 just the last depend({,in}out:).  For depend(in:), we need
++		 to keep all of the previous ones not terminated yet, because
++		 a later depend({,in}out:) might need to depend on all of
++		 them.  So, if the new task's clause is depend({,in}out:),
++		 we know there is at most one other depend({,in}out:) clause
++		 in the list (out).  For non-deferred tasks we want to see
++		 all outs, so they are moved to the end of the chain,
++		 after first redundant_out entry all following entries
++		 should be redundant_out.  */
++	      if (!task->depend[i].is_in && out)
++		{
++		  if (out != last)
++		    {
++		      out->next->prev = out->prev;
++		      out->prev->next = out->next;
++		      out->next = last->next;
++		      out->prev = last;
++		      last->next = out;
++		      if (out->next)
++			out->next->prev = out;
++		    }
++		  out->redundant_out = true;
++		}
++	    }
++	  if (task->num_dependees)
++	    {
++	      gomp_mutex_unlock (&team->task_lock);
++	      return;
++	    }
++	}
+       if (parent->children)
+ 	{
+ 	  task->next_child = parent->children;
+@@ -171,6 +371,22 @@ GOMP_task (void (*fn) (void *), void *da
+ 	  task->prev_child = task;
+ 	}
+       parent->children = task;
++      if (taskgroup)
++	{
++	  if (taskgroup->children)
++	    {
++	      task->next_taskgroup = taskgroup->children;
++	      task->prev_taskgroup = taskgroup->children->prev_taskgroup;
++	      task->next_taskgroup->prev_taskgroup = task;
++	      task->prev_taskgroup->next_taskgroup = task;
++	    }
++	  else
++	    {
++	      task->next_taskgroup = task;
++	      task->prev_taskgroup = task;
++	    }
++	  taskgroup->children = task;
++	}
+       if (team->task_queue)
+ 	{
+ 	  task->next_queue = team->task_queue;
+@@ -185,6 +401,7 @@ GOMP_task (void (*fn) (void *), void *da
+ 	  team->task_queue = task;
+ 	}
+       ++team->task_count;
++      ++team->task_queued_count;
+       gomp_team_barrier_set_task_pending (&team->barrier);
+       do_wake = team->task_running_count + !parent->in_tied_task
+ 		< team->nthreads;
+@@ -194,6 +411,265 @@ GOMP_task (void (*fn) (void *), void *da
+     }
+ }
+ 
++static inline bool
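++/* Prepare CHILD_TASK to be run: advance the parent's and taskgroup's
++   children pointers past it if needed, remove it from the team's task
++   queue and mark it tied.  Return true if the task should be cancelled
++   instead, i.e. its team or taskgroup was cancelled before any copy
++   constructors had run.  */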
++gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
++		   struct gomp_taskgroup *taskgroup, struct gomp_team *team)
++{
++  if (parent)
++    {
++      if (parent->children == child_task)
++	parent->children = child_task->next_child;
++      if (__builtin_expect (child_task->parent_depends_on, 0)
++	  && parent->taskwait->last_parent_depends_on == child_task)
++	{
++	  if (child_task->prev_child->kind == GOMP_TASK_WAITING
++	      && child_task->prev_child->parent_depends_on)
++	    parent->taskwait->last_parent_depends_on = child_task->prev_child;
++	  else
++	    parent->taskwait->last_parent_depends_on = NULL;
++	}
++    }
++  if (taskgroup && taskgroup->children == child_task)
++    taskgroup->children = child_task->next_taskgroup;
++  child_task->prev_queue->next_queue = child_task->next_queue;
++  child_task->next_queue->prev_queue = child_task->prev_queue;
++  if (team->task_queue == child_task)
++    {
++      if (child_task->next_queue != child_task)
++	team->task_queue = child_task->next_queue;
++      else
++	team->task_queue = NULL;
++    }
++  child_task->kind = GOMP_TASK_TIED;
++  if (--team->task_queued_count == 0)
++    gomp_team_barrier_clear_task_pending (&team->barrier);
++  if ((gomp_team_barrier_cancelled (&team->barrier)
++       || (taskgroup && taskgroup->cancelled))
++      && !child_task->copy_ctors_done)
++    return true;
++  return false;
++}
++
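++/* Remove CHILD_TASK's non-redundant depend entries from the parent's
++   dependency hash table now that the task has finished.  */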
++static void
++gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task)
++{
++  struct gomp_task *parent = child_task->parent;
++  size_t i;
++
++  for (i = 0; i < child_task->depend_count; i++)
++    if (!child_task->depend[i].redundant)
++      {
++	if (child_task->depend[i].next)
++	  child_task->depend[i].next->prev = child_task->depend[i].prev;
++	if (child_task->depend[i].prev)
++	  child_task->depend[i].prev->next = child_task->depend[i].next;
++	else
++	  {
++	    hash_entry_type *slot
++	      = htab_find_slot (&parent->depend_hash, &child_task->depend[i],
++				NO_INSERT);
++	    if (*slot != &child_task->depend[i])
++	      abort ();
++	    if (child_task->depend[i].next)
++	      *slot = child_task->depend[i].next;
++	    else
++	      htab_clear_slot (parent->depend_hash, slot);
++	  }
++      }
++}
++
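++/* Walk CHILD_TASK's dependers vector; every task whose last outstanding
++   dependency this was is queued on its parent's children list, its
++   taskgroup and the team's task queue, waking waiters as needed.  Return
++   the number of tasks that became runnable.  */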
++static size_t
++gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
++				     struct gomp_team *team)
++{
++  struct gomp_task *parent = child_task->parent;
++  size_t i, count = child_task->dependers->n_elem, ret = 0;
++  for (i = 0; i < count; i++)
++    {
++      struct gomp_task *task = child_task->dependers->elem[i];
++      if (--task->num_dependees != 0)
++	continue;
++
++      struct gomp_taskgroup *taskgroup = task->taskgroup;
++      if (parent)
++	{
++	  if (parent->children)
++	    {
++	      /* If parent is in gomp_task_maybe_wait_for_dependencies
++		 and it doesn't need to wait for this task, put it after
++		 all ready to run tasks it needs to wait for.  */
++	      if (parent->taskwait && parent->taskwait->last_parent_depends_on
++		  && !task->parent_depends_on)
++		{
++		  struct gomp_task *last_parent_depends_on
++		    = parent->taskwait->last_parent_depends_on;
++		  task->next_child = last_parent_depends_on->next_child;
++		  task->prev_child = last_parent_depends_on;
++		}
++	      else
++		{
++		  task->next_child = parent->children;
++		  task->prev_child = parent->children->prev_child;
++		  parent->children = task;
++		}
++	      task->next_child->prev_child = task;
++	      task->prev_child->next_child = task;
++	    }
++	  else
++	    {
++	      task->next_child = task;
++	      task->prev_child = task;
++	      parent->children = task;
++	    }
++	  if (parent->taskwait)
++	    {
++	      if (parent->taskwait->in_taskwait)
++		{
++		  parent->taskwait->in_taskwait = false;
++		  gomp_sem_post (&parent->taskwait->taskwait_sem);
++		}
++	      else if (parent->taskwait->in_depend_wait)
++		{
++		  parent->taskwait->in_depend_wait = false;
++		  gomp_sem_post (&parent->taskwait->taskwait_sem);
++		}
++	      if (parent->taskwait->last_parent_depends_on == NULL
++		  && task->parent_depends_on)
++		parent->taskwait->last_parent_depends_on = task;
++	    }
++	}
++      if (taskgroup)
++	{
++	  if (taskgroup->children)
++	    {
++	      task->next_taskgroup = taskgroup->children;
++	      task->prev_taskgroup = taskgroup->children->prev_taskgroup;
++	      task->next_taskgroup->prev_taskgroup = task;
++	      task->prev_taskgroup->next_taskgroup = task;
++	    }
++	  else
++	    {
++	      task->next_taskgroup = task;
++	      task->prev_taskgroup = task;
++	    }
++	  taskgroup->children = task;
++	  if (taskgroup->in_taskgroup_wait)
++	    {
++	      taskgroup->in_taskgroup_wait = false;
++	      gomp_sem_post (&taskgroup->taskgroup_sem);
++	    }
++	}
++      if (team->task_queue)
++	{
++	  task->next_queue = team->task_queue;
++	  task->prev_queue = team->task_queue->prev_queue;
++	  task->next_queue->prev_queue = task;
++	  task->prev_queue->next_queue = task;
++	}
++      else
++	{
++	  task->next_queue = task;
++	  task->prev_queue = task;
++	  team->task_queue = task;
++	}
++      ++team->task_count;
++      ++team->task_queued_count;
++      ++ret;
++    }
++  free (child_task->dependers);
++  child_task->dependers = NULL;
++  if (ret > 1)
++    gomp_team_barrier_set_task_pending (&team->barrier);
++  return ret;
++}
++
++static inline size_t
++gomp_task_run_post_handle_depend (struct gomp_task *child_task,
++				  struct gomp_team *team)
++{
++  if (child_task->depend_count == 0)
++    return 0;
++
++  /* If parent is gone already, the hash table is freed and nothing
++     will use the hash table anymore, no need to remove anything from it.  */
++  if (child_task->parent != NULL)
++    gomp_task_run_post_handle_depend_hash (child_task);
++
++  if (child_task->dependers == NULL)
++    return 0;
++
++  return gomp_task_run_post_handle_dependers (child_task, team);
++}
++
++static inline void
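++/* Unlink CHILD_TASK from its parent's children list.  If the parent is
++   blocked in a taskwait or a dependency wait and this was the last task
++   it was waiting for, wake it.  */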
++gomp_task_run_post_remove_parent (struct gomp_task *child_task)
++{
++  struct gomp_task *parent = child_task->parent;
++  if (parent == NULL)
++    return;
++  if (__builtin_expect (child_task->parent_depends_on, 0)
++      && --parent->taskwait->n_depend == 0
++      && parent->taskwait->in_depend_wait)
++    {
++      parent->taskwait->in_depend_wait = false;
++      gomp_sem_post (&parent->taskwait->taskwait_sem);
++    }
++  child_task->prev_child->next_child = child_task->next_child;
++  child_task->next_child->prev_child = child_task->prev_child;
++  if (parent->children != child_task)
++    return;
++  if (child_task->next_child != child_task)
++    parent->children = child_task->next_child;
++  else
++    {
++      /* We access task->children in GOMP_taskwait
++	 outside of the task lock mutex region, so
++	 need a release barrier here to ensure memory
++	 written by child_task->fn above is flushed
++	 before the NULL is written.  */
++      __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE);
++      if (parent->taskwait && parent->taskwait->in_taskwait)
++	{
++	  parent->taskwait->in_taskwait = false;
++	  gomp_sem_post (&parent->taskwait->taskwait_sem);
++	}
++    }
++}
++
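++/* Unlink CHILD_TASK from its taskgroup, decrement the taskgroup's child
++   count and wake a thread blocked in GOMP_taskgroup_end once the
++   taskgroup's children list becomes empty.  */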
++static inline void
++gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task)
++{
++  struct gomp_taskgroup *taskgroup = child_task->taskgroup;
++  if (taskgroup == NULL)
++    return;
++  child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup;
++  child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup;
++  if (taskgroup->num_children > 1)
++    --taskgroup->num_children;
++  else
++    {
++      /* We access taskgroup->num_children in GOMP_taskgroup_end
++	 outside of the task lock mutex region, so
++	 need a release barrier here to ensure memory
++	 written by child_task->fn above is flushed
++	 before the NULL is written.  */
++      __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
++    }
++  if (taskgroup->children != child_task)
++    return;
++  if (child_task->next_taskgroup != child_task)
++    taskgroup->children = child_task->next_taskgroup;
++  else
++    {
++      taskgroup->children = NULL;
++      if (taskgroup->in_taskgroup_wait)
++	{
++	  taskgroup->in_taskgroup_wait = false;
++	  gomp_sem_post (&taskgroup->taskgroup_sem);
++	}
++    }
++}
++
+ void
+ gomp_barrier_handle_tasks (gomp_barrier_state_t state)
+ {
+@@ -202,6 +678,7 @@ gomp_barrier_handle_tasks (gomp_barrier_
+   struct gomp_task *task = thr->task;
+   struct gomp_task *child_task = NULL;
+   struct gomp_task *to_free = NULL;
++  int do_wake = 0;
+ 
+   gomp_mutex_lock (&team->task_lock);
+   if (gomp_barrier_last_thread (state))
+@@ -218,26 +695,31 @@ gomp_barrier_handle_tasks (gomp_barrier_
+ 
+   while (1)
+     {
++      bool cancelled = false;
+       if (team->task_queue != NULL)
+ 	{
+-	  struct gomp_task *parent;
+-
+ 	  child_task = team->task_queue;
+-	  parent = child_task->parent;
+-	  if (parent && parent->children == child_task)
+-	    parent->children = child_task->next_child;
+-	  child_task->prev_queue->next_queue = child_task->next_queue;
+-	  child_task->next_queue->prev_queue = child_task->prev_queue;
+-	  if (child_task->next_queue != child_task)
+-	    team->task_queue = child_task->next_queue;
+-	  else
+-	    team->task_queue = NULL;
+-	  child_task->kind = GOMP_TASK_TIED;
++	  cancelled = gomp_task_run_pre (child_task, child_task->parent,
++					 child_task->taskgroup, team);
++	  if (__builtin_expect (cancelled, 0))
++	    {
++	      if (to_free)
++		{
++		  gomp_finish_task (to_free);
++		  free (to_free);
++		  to_free = NULL;
++		}
++	      goto finish_cancelled;
++	    }
+ 	  team->task_running_count++;
+-	  if (team->task_count == team->task_running_count)
+-	    gomp_team_barrier_clear_task_pending (&team->barrier);
++	  child_task->in_tied_task = true;
+ 	}
+       gomp_mutex_unlock (&team->task_lock);
++      if (do_wake)
++	{
++	  gomp_team_barrier_wake (&team->barrier, do_wake);
++	  do_wake = 0;
++	}
+       if (to_free)
+ 	{
+ 	  gomp_finish_task (to_free);
+@@ -255,33 +737,22 @@ gomp_barrier_handle_tasks (gomp_barrier_
+       gomp_mutex_lock (&team->task_lock);
+       if (child_task)
+ 	{
+-	  struct gomp_task *parent = child_task->parent;
+-	  if (parent)
+-	    {
+-	      child_task->prev_child->next_child = child_task->next_child;
+-	      child_task->next_child->prev_child = child_task->prev_child;
+-	      if (parent->children == child_task)
+-		{
+-		  if (child_task->next_child != child_task)
+-		    parent->children = child_task->next_child;
+-		  else
+-		    {
+-		      /* We access task->children in GOMP_taskwait
+-			 outside of the task lock mutex region, so
+-			 need a release barrier here to ensure memory
+-			 written by child_task->fn above is flushed
+-			 before the NULL is written.  */
+-		      __atomic_store_n (&parent->children, NULL,
+-					MEMMODEL_RELEASE);
+-		      if (parent->in_taskwait)
+-			gomp_sem_post (&parent->taskwait_sem);
+-		    }
+-		}
+-	    }
++	 finish_cancelled:;
++	  size_t new_tasks
++	    = gomp_task_run_post_handle_depend (child_task, team);
++	  gomp_task_run_post_remove_parent (child_task);
+ 	  gomp_clear_parent (child_task->children);
++	  gomp_task_run_post_remove_taskgroup (child_task);
+ 	  to_free = child_task;
+ 	  child_task = NULL;
+-	  team->task_running_count--;
++	  if (!cancelled)
++	    team->task_running_count--;
++	  if (new_tasks > 1)
++	    {
++	      do_wake = team->nthreads - team->task_running_count;
++	      if (do_wake > new_tasks)
++		do_wake = new_tasks;
++	    }
+ 	  if (--team->task_count == 0
+ 	      && gomp_team_barrier_waiting_for_tasks (&team->barrier))
+ 	    {
+@@ -304,9 +775,11 @@ GOMP_taskwait (void)
+   struct gomp_task *task = thr->task;
+   struct gomp_task *child_task = NULL;
+   struct gomp_task *to_free = NULL;
++  struct gomp_taskwait taskwait;
++  int do_wake = 0;
+ 
+   /* The acquire barrier on load of task->children here synchronizes
+-     with the write of a NULL in gomp_barrier_handle_tasks.  It is
++     with the write of a NULL in gomp_task_run_post_remove_parent.  It is
+      not necessary that we synchronize with other non-NULL writes at
+      this point, but we must ensure that all writes to memory by a
+      child thread task work function are seen before we exit from
+@@ -315,42 +788,60 @@ GOMP_taskwait (void)
+       || __atomic_load_n (&task->children, MEMMODEL_ACQUIRE) == NULL)
+     return;
+ 
++  memset (&taskwait, 0, sizeof (taskwait));
+   gomp_mutex_lock (&team->task_lock);
+   while (1)
+     {
++      bool cancelled = false;
+       if (task->children == NULL)
+ 	{
++	  bool destroy_taskwait = task->taskwait != NULL;
++	  task->taskwait = NULL;
+ 	  gomp_mutex_unlock (&team->task_lock);
+ 	  if (to_free)
+ 	    {
+ 	      gomp_finish_task (to_free);
+ 	      free (to_free);
+ 	    }
++	  if (destroy_taskwait)
++	    gomp_sem_destroy (&taskwait.taskwait_sem);
+ 	  return;
+ 	}
+       if (task->children->kind == GOMP_TASK_WAITING)
+ 	{
+ 	  child_task = task->children;
+-	  task->children = child_task->next_child;
+-	  child_task->prev_queue->next_queue = child_task->next_queue;
+-	  child_task->next_queue->prev_queue = child_task->prev_queue;
+-	  if (team->task_queue == child_task)
++	  cancelled
++	    = gomp_task_run_pre (child_task, task, child_task->taskgroup,
++				 team);
++	  if (__builtin_expect (cancelled, 0))
+ 	    {
+-	      if (child_task->next_queue != child_task)
+-		team->task_queue = child_task->next_queue;
+-	      else
+-		team->task_queue = NULL;
++	      if (to_free)
++		{
++		  gomp_finish_task (to_free);
++		  free (to_free);
++		  to_free = NULL;
++		}
++	      goto finish_cancelled;
+ 	    }
+-	  child_task->kind = GOMP_TASK_TIED;
+-	  team->task_running_count++;
+-	  if (team->task_count == team->task_running_count)
+-	    gomp_team_barrier_clear_task_pending (&team->barrier);
+ 	}
+       else
+-	/* All tasks we are waiting for are already running
+-	   in other threads.  Wait for them.  */
+-	task->in_taskwait = true;
++	{
++	  /* All tasks we are waiting for are already running
++	     in other threads.  Wait for them.  */
++	  if (task->taskwait == NULL)
++	    {
++	      taskwait.in_depend_wait = false;
++	      gomp_sem_init (&taskwait.taskwait_sem, 0);
++	      task->taskwait = &taskwait;
++	    }
++	  taskwait.in_taskwait = true;
++	}
+       gomp_mutex_unlock (&team->task_lock);
++      if (do_wake)
++	{
++	  gomp_team_barrier_wake (&team->barrier, do_wake);
++	  do_wake = 0;
++	}
+       if (to_free)
+ 	{
+ 	  gomp_finish_task (to_free);
+@@ -364,14 +855,178 @@ GOMP_taskwait (void)
+ 	  thr->task = task;
+ 	}
+       else
++	gomp_sem_wait (&taskwait.taskwait_sem);
++      gomp_mutex_lock (&team->task_lock);
++      if (child_task)
+ 	{
+-	  gomp_sem_wait (&task->taskwait_sem);
+-	  task->in_taskwait = false;
++	 finish_cancelled:;
++	  size_t new_tasks
++	    = gomp_task_run_post_handle_depend (child_task, team);
++	  child_task->prev_child->next_child = child_task->next_child;
++	  child_task->next_child->prev_child = child_task->prev_child;
++	  if (task->children == child_task)
++	    {
++	      if (child_task->next_child != child_task)
++		task->children = child_task->next_child;
++	      else
++		task->children = NULL;
++	    }
++	  gomp_clear_parent (child_task->children);
++	  gomp_task_run_post_remove_taskgroup (child_task);
++	  to_free = child_task;
++	  child_task = NULL;
++	  team->task_count--;
++	  if (new_tasks > 1)
++	    {
++	      do_wake = team->nthreads - team->task_running_count
++			- !task->in_tied_task;
++	      if (do_wake > new_tasks)
++		do_wake = new_tasks;
++	    }
++	}
++    }
++}
++
++/* This is like GOMP_taskwait, but we only wait for tasks that the
++   upcoming task depends on.  */
++
++static void
++gomp_task_maybe_wait_for_dependencies (void **depend)
++{
++  struct gomp_thread *thr = gomp_thread ();
++  struct gomp_task *task = thr->task;
++  struct gomp_team *team = thr->ts.team;
++  struct gomp_task_depend_entry elem, *ent = NULL;
++  struct gomp_taskwait taskwait;
++  struct gomp_task *last_parent_depends_on = NULL;
++  size_t ndepend = (uintptr_t) depend[0];
++  size_t nout = (uintptr_t) depend[1];
++  size_t i;
++  size_t num_awaited = 0;
++  struct gomp_task *child_task = NULL;
++  struct gomp_task *to_free = NULL;
++  int do_wake = 0;
++
++  gomp_mutex_lock (&team->task_lock);
++  for (i = 0; i < ndepend; i++)
++    {
++      elem.addr = depend[i + 2];
++      ent = htab_find (task->depend_hash, &elem);
++      for (; ent; ent = ent->next)
++	if (i >= nout && ent->is_in)
++	  continue;
++	else
++	  {
++	    struct gomp_task *tsk = ent->task;
++	    if (!tsk->parent_depends_on)
++	      {
++		tsk->parent_depends_on = true;
++		++num_awaited;
++		if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING)
++		  {
++		    /* If a task we need to wait for is not already
++		       running and is ready to be scheduled, move it
++		       to front, so that we run it as soon as possible.  */
++		    if (last_parent_depends_on)
++		      {
++			tsk->prev_child->next_child = tsk->next_child;
++			tsk->next_child->prev_child = tsk->prev_child;
++			tsk->prev_child = last_parent_depends_on;
++			tsk->next_child = last_parent_depends_on->next_child;
++			tsk->prev_child->next_child = tsk;
++			tsk->next_child->prev_child = tsk;
++		      }
++		    else if (tsk != task->children)
++		      {
++			tsk->prev_child->next_child = tsk->next_child;
++			tsk->next_child->prev_child = tsk->prev_child;
++			tsk->prev_child = task->children;
++			tsk->next_child = task->children->next_child;
++			task->children = tsk;
++			tsk->prev_child->next_child = tsk;
++			tsk->next_child->prev_child = tsk;
++		      }
++		    last_parent_depends_on = tsk;
++		  }
++	      }
++	  }
++    }
++  if (num_awaited == 0)
++    {
++      gomp_mutex_unlock (&team->task_lock);
++      return;
++    }
++
++  memset (&taskwait, 0, sizeof (taskwait));
++  taskwait.n_depend = num_awaited;
++  taskwait.last_parent_depends_on = last_parent_depends_on;
++  gomp_sem_init (&taskwait.taskwait_sem, 0);
++  task->taskwait = &taskwait;
++
++  while (1)
++    {
++      bool cancelled = false;
++      if (taskwait.n_depend == 0)
++	{
++	  task->taskwait = NULL;
++	  gomp_mutex_unlock (&team->task_lock);
++	  if (to_free)
++	    {
++	      gomp_finish_task (to_free);
++	      free (to_free);
++	    }
++	  gomp_sem_destroy (&taskwait.taskwait_sem);
+ 	  return;
+ 	}
++      if (task->children->kind == GOMP_TASK_WAITING)
++	{
++	  child_task = task->children;
++	  cancelled
++	    = gomp_task_run_pre (child_task, task, child_task->taskgroup,
++				 team);
++	  if (__builtin_expect (cancelled, 0))
++	    {
++	      if (to_free)
++		{
++		  gomp_finish_task (to_free);
++		  free (to_free);
++		  to_free = NULL;
++		}
++	      goto finish_cancelled;
++	    }
++	}
++      else
++	/* All tasks we are waiting for are already running
++	   in other threads.  Wait for them.  */
++	taskwait.in_depend_wait = true;
++      gomp_mutex_unlock (&team->task_lock);
++      if (do_wake)
++	{
++	  gomp_team_barrier_wake (&team->barrier, do_wake);
++	  do_wake = 0;
++	}
++      if (to_free)
++	{
++	  gomp_finish_task (to_free);
++	  free (to_free);
++	  to_free = NULL;
++	}
++      if (child_task)
++	{
++	  thr->task = child_task;
++	  child_task->fn (child_task->fn_data);
++	  thr->task = task;
++	}
++      else
++	gomp_sem_wait (&taskwait.taskwait_sem);
+       gomp_mutex_lock (&team->task_lock);
+       if (child_task)
+ 	{
++	 finish_cancelled:;
++	  size_t new_tasks
++	    = gomp_task_run_post_handle_depend (child_task, team);
++	  if (child_task->parent_depends_on)
++	    --taskwait.n_depend;
+ 	  child_task->prev_child->next_child = child_task->next_child;
+ 	  child_task->next_child->prev_child = child_task->prev_child;
+ 	  if (task->children == child_task)
+@@ -382,10 +1037,17 @@ GOMP_taskwait (void)
+ 		task->children = NULL;
+ 	    }
+ 	  gomp_clear_parent (child_task->children);
++	  gomp_task_run_post_remove_taskgroup (child_task);
+ 	  to_free = child_task;
+ 	  child_task = NULL;
+ 	  team->task_count--;
+-	  team->task_running_count--;
++	  if (new_tasks > 1)
++	    {
++	      do_wake = team->nthreads - team->task_running_count
++			- !task->in_tied_task;
++	      if (do_wake > new_tasks)
++		do_wake = new_tasks;
++	    }
+ 	}
+     }
+ }
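The depend vector decoded at the top of gomp_task_maybe_wait_for_dependencies packs the clause count in depend[0], the number of out/inout entries in depend[1] and the depended-on addresses from depend[2] onwards, and the function then blocks only on sibling tasks registered under those addresses. A user-level sketch of the case it serves (illustrative only, not part of the patch; the variables are invented): the undeferred consumer below has to wait for the producer of x but not for the unrelated task.

#include <stdio.h>

int
main (void)
{
  int x = 0, y = 0;
  #pragma omp parallel
  #pragma omp single
  {
    #pragma omp task depend(out: x)        /* deferred producer of x */
      x = 1;
    #pragma omp task depend(out: y)        /* unrelated task, need not finish first */
      y = 2;
    #pragma omp task if (0) depend(in: x)  /* undeferred: only x's producer is awaited */
      printf ("x = %d\n", x);              /* always prints x = 1 */
  }
  return 0;
}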
+@@ -398,6 +1060,151 @@ GOMP_taskyield (void)
+   /* Nothing at the moment.  */
+ }
+ 
++void
++GOMP_taskgroup_start (void)
++{
++  struct gomp_thread *thr = gomp_thread ();
++  struct gomp_team *team = thr->ts.team;
++  struct gomp_task *task = thr->task;
++  struct gomp_taskgroup *taskgroup;
++
++  /* If team is NULL, all tasks are executed as
++     GOMP_TASK_IFFALSE tasks and thus all children tasks of
++     taskgroup and their descendant tasks will be finished
++     by the time GOMP_taskgroup_end is called.  */
++  if (team == NULL)
++    return;
++  taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup));
++  taskgroup->prev = task->taskgroup;
++  taskgroup->children = NULL;
++  taskgroup->in_taskgroup_wait = false;
++  taskgroup->cancelled = false;
++  taskgroup->num_children = 0;
++  gomp_sem_init (&taskgroup->taskgroup_sem, 0);
++  task->taskgroup = taskgroup;
++}
++
++void
++GOMP_taskgroup_end (void)
++{
++  struct gomp_thread *thr = gomp_thread ();
++  struct gomp_team *team = thr->ts.team;
++  struct gomp_task *task = thr->task;
++  struct gomp_taskgroup *taskgroup;
++  struct gomp_task *child_task = NULL;
++  struct gomp_task *to_free = NULL;
++  int do_wake = 0;
++
++  if (team == NULL)
++    return;
++  taskgroup = task->taskgroup;
++
++  /* The acquire barrier on load of taskgroup->num_children here
++     synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup.
++     It is not necessary that we synchronize with other non-0 writes at
++     this point, but we must ensure that all writes to memory by a
++     child thread task work function are seen before we exit from
++     GOMP_taskgroup_end.  */
++  if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
++    goto finish;
++
++  gomp_mutex_lock (&team->task_lock);
++  while (1)
++    {
++      bool cancelled = false;
++      if (taskgroup->children == NULL)
++	{
++	  if (taskgroup->num_children)
++	    {
++	      if (task->children == NULL)
++		goto do_wait;
++	      child_task = task->children;
++            }
++          else
++	    {
++	      gomp_mutex_unlock (&team->task_lock);
++	      if (to_free)
++		{
++		  gomp_finish_task (to_free);
++		  free (to_free);
++		}
++	      goto finish;
++	    }
++	}
++      else
++	child_task = taskgroup->children;
++      if (child_task->kind == GOMP_TASK_WAITING)
++	{
++	  cancelled
++	    = gomp_task_run_pre (child_task, child_task->parent, taskgroup,
++				 team);
++	  if (__builtin_expect (cancelled, 0))
++	    {
++	      if (to_free)
++		{
++		  gomp_finish_task (to_free);
++		  free (to_free);
++		  to_free = NULL;
++		}
++	      goto finish_cancelled;
++	    }
++	}
++      else
++	{
++	  child_task = NULL;
++	 do_wait:
++	  /* All tasks we are waiting for are already running
++	     in other threads.  Wait for them.  */
++	  taskgroup->in_taskgroup_wait = true;
++	}
++      gomp_mutex_unlock (&team->task_lock);
++      if (do_wake)
++	{
++	  gomp_team_barrier_wake (&team->barrier, do_wake);
++	  do_wake = 0;
++	}
++      if (to_free)
++	{
++	  gomp_finish_task (to_free);
++	  free (to_free);
++	  to_free = NULL;
++	}
++      if (child_task)
++	{
++	  thr->task = child_task;
++	  child_task->fn (child_task->fn_data);
++	  thr->task = task;
++	}
++      else
++	gomp_sem_wait (&taskgroup->taskgroup_sem);
++      gomp_mutex_lock (&team->task_lock);
++      if (child_task)
++	{
++	 finish_cancelled:;
++	  size_t new_tasks
++	    = gomp_task_run_post_handle_depend (child_task, team);
++	  gomp_task_run_post_remove_parent (child_task);
++	  gomp_clear_parent (child_task->children);
++	  gomp_task_run_post_remove_taskgroup (child_task);
++	  to_free = child_task;
++	  child_task = NULL;
++	  team->task_count--;
++	  if (new_tasks > 1)
++	    {
++	      do_wake = team->nthreads - team->task_running_count
++			- !task->in_tied_task;
++	      if (do_wake > new_tasks)
++		do_wake = new_tasks;
++	    }
++	}
++    }
++
++ finish:
++  task->taskgroup = taskgroup->prev;
++  gomp_sem_destroy (&taskgroup->taskgroup_sem);
++  free (taskgroup);
++}
++
+ int
+ omp_in_final (void)
+ {
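GOMP_taskgroup_start and GOMP_taskgroup_end back the OpenMP 4.0 taskgroup construct: the end call returns only after the group's child tasks and all of their descendants have completed, which is why it spins on taskgroup->children and taskgroup->num_children above. A user-level sketch (illustrative, not part of the patch; the function and variables are invented):

int
taskgroup_sum (void)
{
  int sum = 0, i;
  #pragma omp parallel
  #pragma omp single
  {
    #pragma omp taskgroup                  /* GOMP_taskgroup_start */
    {
      for (i = 0; i < 4; i++)
        {
          #pragma omp task firstprivate(i) shared(sum)
          {
            #pragma omp atomic
            sum += i;
          }
        }
    }                                      /* GOMP_taskgroup_end waits here */
  }
  return sum;                              /* always 0 + 1 + 2 + 3 = 6 */
}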
+--- libgomp/testsuite/libgomp.fortran/lib3.f	(revision 210461)
++++ libgomp/testsuite/libgomp.fortran/lib3.f	(revision 210462)
+@@ -66,6 +66,7 @@ C$OMP END PARALLEL
+ C$OMP PARALLEL REDUCTION (.OR.:L) IF (.TRUE.)
+       L = .NOT. OMP_IN_PARALLEL ()
+ C$OMP END PARALLEL
++      IF (L) CALL ABORT
+ 
+       E = OMP_GET_WTIME ()
+       IF (D .GT. E) CALL ABORT
+--- libgomp/testsuite/libgomp.fortran/lib1.f90	(revision 210461)
++++ libgomp/testsuite/libgomp.fortran/lib1.f90	(revision 210462)
+@@ -66,6 +66,7 @@
+ !$omp parallel reduction (.or.:l) if (.true.)
+   l = .not. omp_in_parallel ()
+ !$omp end parallel
++  if (l) call abort
+ 
+   e = omp_get_wtime ()
+   if (d .gt. e) call abort
+--- libgomp/testsuite/libgomp.fortran/lib2.f	(revision 210461)
++++ libgomp/testsuite/libgomp.fortran/lib2.f	(revision 210462)
+@@ -66,6 +66,7 @@ C$OMP END PARALLEL
+ C$OMP PARALLEL REDUCTION (.OR.:L) IF (.TRUE.)
+       L = .NOT. OMP_IN_PARALLEL ()
+ C$OMP END PARALLEL
++      IF (L) CALL ABORT
+ 
+       E = OMP_GET_WTIME ()
+       IF (D .GT. E) CALL ABORT
+--- libgomp/testsuite/libgomp.c/atomic-14.c	(revision 210461)
++++ libgomp/testsuite/libgomp.c/atomic-14.c	(revision 210462)
+@@ -16,7 +16,7 @@ main ()
+   #pragma omp atomic update
+     x = x + 7;
+   #pragma omp atomic
+-    x = x + 7 + 6;
++    x = x + (7 + 6);
+   #pragma omp atomic update
+     x = x + 2 * 3;
+   #pragma omp atomic
+@@ -65,7 +65,7 @@ main ()
+   if (v != -8)
+     abort ();
+   #pragma omp atomic
+-    x = x * -4 / 2;
++    x = x * (-4 / 2);
+   #pragma omp atomic read
+     v = x;
+   if (v != 16)
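The added parentheses matter because an atomic update statement must have the shape x = x op expr (or x = expr op x) with expr not referencing x; by C associativity x = x + 7 + 6 means x = (x + 7) + 6 and x = x * -4 / 2 means x = (x * -4) / 2, so only the parenthesized forms remain valid atomic updates under the tightened OpenMP 4.0 checking. A compiling sketch of the distinction (illustrative, not part of the testsuite):

int x;

void
atomic_forms (void)
{
  #pragma omp atomic
    x = x + (7 + 6);   /* expr is (7 + 6): a conforming atomic update */
  #pragma omp atomic
    x = x * (-4 / 2);  /* expr is (-4 / 2): a conforming atomic update */
  /* x = x + 7 + 6; would be x = (x + 7) + 6, which no longer matches the
     x = x op expr form and is why the test now parenthesizes.  */
}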
+--- libgomp/testsuite/libgomp.c/lib-1.c	(revision 210461)
++++ libgomp/testsuite/libgomp.c/lib-1.c	(revision 210462)
+@@ -85,6 +85,8 @@ main (void)
+   l = ! omp_in_parallel ();
+ #pragma omp parallel reduction (|:l) if (1)
+   l = ! omp_in_parallel ();
++  if (l)
++    abort ();
+ 
+   e = omp_get_wtime ();
+   if (d > e)
+--- libgomp/loop.c	(revision 210461)
++++ libgomp/loop.c	(revision 210462)
+@@ -439,14 +439,14 @@ static void
+ gomp_parallel_loop_start (void (*fn) (void *), void *data,
+ 			  unsigned num_threads, long start, long end,
+ 			  long incr, enum gomp_schedule_type sched,
+-			  long chunk_size)
++			  long chunk_size, unsigned int flags)
+ {
+   struct gomp_team *team;
+ 
+   num_threads = gomp_resolve_num_threads (num_threads, 0);
+   team = gomp_new_team (num_threads);
+   gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size);
+-  gomp_team_start (fn, data, num_threads, team);
++  gomp_team_start (fn, data, num_threads, flags, team);
+ }
+ 
+ void
+@@ -455,7 +455,7 @@ GOMP_parallel_loop_static_start (void (*
+ 				 long incr, long chunk_size)
+ {
+   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+-			    GFS_STATIC, chunk_size);
++			    GFS_STATIC, chunk_size, 0);
+ }
+ 
+ void
+@@ -464,7 +464,7 @@ GOMP_parallel_loop_dynamic_start (void (
+ 				  long incr, long chunk_size)
+ {
+   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+-			    GFS_DYNAMIC, chunk_size);
++			    GFS_DYNAMIC, chunk_size, 0);
+ }
+ 
+ void
+@@ -473,7 +473,7 @@ GOMP_parallel_loop_guided_start (void (*
+ 				 long incr, long chunk_size)
+ {
+   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+-			    GFS_GUIDED, chunk_size);
++			    GFS_GUIDED, chunk_size, 0);
+ }
+ 
+ void
+@@ -483,11 +483,59 @@ GOMP_parallel_loop_runtime_start (void (
+ {
+   struct gomp_task_icv *icv = gomp_icv (false);
+   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+-			    icv->run_sched_var, icv->run_sched_modifier);
++			    icv->run_sched_var, icv->run_sched_modifier, 0);
++}
++
++ialias_redirect (GOMP_parallel_end)
++
++void
++GOMP_parallel_loop_static (void (*fn) (void *), void *data,
++			   unsigned num_threads, long start, long end,
++			   long incr, long chunk_size, unsigned flags)
++{
++  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
++			    GFS_STATIC, chunk_size, flags);
++  fn (data);
++  GOMP_parallel_end ();
++}
++
++void
++GOMP_parallel_loop_dynamic (void (*fn) (void *), void *data,
++			    unsigned num_threads, long start, long end,
++			    long incr, long chunk_size, unsigned flags)
++{
++  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
++			    GFS_DYNAMIC, chunk_size, flags);
++  fn (data);
++  GOMP_parallel_end ();
++}
++
++void
++GOMP_parallel_loop_guided (void (*fn) (void *), void *data,
++			  unsigned num_threads, long start, long end,
++			  long incr, long chunk_size, unsigned flags)
++{
++  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
++			    GFS_GUIDED, chunk_size, flags);
++  fn (data);
++  GOMP_parallel_end ();
++}
++
++void
++GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
++			    unsigned num_threads, long start, long end,
++			    long incr, unsigned flags)
++{
++  struct gomp_task_icv *icv = gomp_icv (false);
++  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
++			    icv->run_sched_var, icv->run_sched_modifier,
++			    flags);
++  fn (data);
++  GOMP_parallel_end ();
+ }
+ 
+ /* The GOMP_loop_end* routines are called after the thread is told that
+-   all loop iterations are complete.  This first version synchronizes
++   all loop iterations are complete.  The first two versions synchronize
+    all threads; the nowait version does not.  */
+ 
+ void
+@@ -496,6 +544,12 @@ GOMP_loop_end (void)
+   gomp_work_share_end ();
+ }
+ 
++bool
++GOMP_loop_end_cancel (void)
++{
++  return gomp_work_share_end_cancel ();
++}
++
+ void
+ GOMP_loop_end_nowait (void)
+ {
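The new combined GOMP_parallel_loop_* entry points start the team, run the outlined function in the encountering thread as well, and then call GOMP_parallel_end, all in one call, carrying the extra flags argument threaded through gomp_parallel_loop_start above. As an illustrative sketch (not part of the patch; the function is invented), this is the kind of combined construct they serve; with OpenMP 4.0-era GCC it is lowered to a single GOMP_parallel_loop_dynamic call rather than the older start/end pair.

void
scale (double *a, long n)
{
  long i;
  #pragma omp parallel for schedule(dynamic, 16)
  for (i = 0; i < n; i++)
    a[i] *= 2.0;
}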
+--- libgomp/hashtab.h	(revision 0)
++++ libgomp/hashtab.h	(revision 210462)
+@@ -0,0 +1,443 @@
++/* An expandable hash tables datatype.
++   Copyright (C) 1999-2013
++   Free Software Foundation, Inc.
++   Contributed by Vladimir Makarov <vmakarov@cygnus.com>.
++
++This program is free software; you can redistribute it and/or modify
++it under the terms of the GNU General Public License as published by
++the Free Software Foundation; either version 2 of the License, or
++(at your option) any later version.
++
++This program is distributed in the hope that it will be useful,
++but WITHOUT ANY WARRANTY; without even the implied warranty of
++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License
++along with this program; if not, write to the Free Software
++Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
++
++/* The hash table code copied from include/hashtab.[hc] and adjusted,
++   so that the hash table entries are in the flexible array at the end
++   of the control structure, no callbacks are used and the elements in the
++   table are of the hash_entry_type type.
++   Before including this file, define hash_entry_type type and
++   htab_alloc and htab_free functions.  After including it, define
++   htab_hash and htab_eq inline functions.   */
++
++/* This package implements basic hash table functionality.  It is possible
++   to search for an entry, create an entry and destroy an entry.
++
++   Elements in the table are generic pointers.
++
++   The size of the table is not fixed; if the occupancy of the table
++   grows too high the hash table will be expanded.
++
++   The abstract data implementation is based on generalized Algorithm D
++   from Knuth's book "The art of computer programming".  Hash table is
++   expanded by creation of new hash table and transferring elements from
++   the old table to the new table.  */
++
++/* The type for a hash code.  */
++typedef unsigned int hashval_t;
++
++static inline hashval_t htab_hash (hash_entry_type);
++static inline bool htab_eq (hash_entry_type, hash_entry_type);
++
++/* This macro defines reserved value for empty table entry.  */
++
++#define HTAB_EMPTY_ENTRY    ((hash_entry_type) 0)
++
++/* This macro defines reserved value for table entry which contained
++   a deleted element. */
++
++#define HTAB_DELETED_ENTRY  ((hash_entry_type) 1)
++
++/* Hash tables are of the following type.  The structure
++   (implementation) of this type is not needed for using the hash
++   tables.  All work with hash table should be executed only through
++   functions mentioned below.  The size of this structure is subject to
++   change.  */
++
++struct htab {
++  /* Current size (in entries) of the hash table.  */
++  size_t size;
++
++  /* Current number of elements including also deleted elements.  */
++  size_t n_elements;
++
++  /* Current number of deleted elements in the table.  */
++  size_t n_deleted;
++
++  /* Current size (in entries) of the hash table, as an index into the
++     table of primes.  */
++  unsigned int size_prime_index;
++
++  /* Table itself.  */
++  hash_entry_type entries[];
++};
++
++typedef struct htab *htab_t;
++
++/* An enum saying whether we insert into the hash table or not.  */
++enum insert_option {NO_INSERT, INSERT};
++
++/* Table of primes and multiplicative inverses.
++
++   Note that these are not minimally reduced inverses.  Unlike when generating
++   code to divide by a constant, we want to be able to use the same algorithm
++   all the time.  All of these inverses (are implied to) have bit 32 set.
++
++   For the record, the function that computed the table is in
++   libiberty/hashtab.c.  */
++
++struct prime_ent
++{
++  hashval_t prime;
++  hashval_t inv;
++  hashval_t inv_m2;	/* inverse of prime-2 */
++  hashval_t shift;
++};
++
++static struct prime_ent const prime_tab[] = {
++  {          7, 0x24924925, 0x9999999b, 2 },
++  {         13, 0x3b13b13c, 0x745d1747, 3 },
++  {         31, 0x08421085, 0x1a7b9612, 4 },
++  {         61, 0x0c9714fc, 0x15b1e5f8, 5 },
++  {        127, 0x02040811, 0x0624dd30, 6 },
++  {        251, 0x05197f7e, 0x073260a5, 7 },
++  {        509, 0x01824366, 0x02864fc8, 8 },
++  {       1021, 0x00c0906d, 0x014191f7, 9 },
++  {       2039, 0x0121456f, 0x0161e69e, 10 },
++  {       4093, 0x00300902, 0x00501908, 11 },
++  {       8191, 0x00080041, 0x00180241, 12 },
++  {      16381, 0x000c0091, 0x00140191, 13 },
++  {      32749, 0x002605a5, 0x002a06e6, 14 },
++  {      65521, 0x000f00e2, 0x00110122, 15 },
++  {     131071, 0x00008001, 0x00018003, 16 },
++  {     262139, 0x00014002, 0x0001c004, 17 },
++  {     524287, 0x00002001, 0x00006001, 18 },
++  {    1048573, 0x00003001, 0x00005001, 19 },
++  {    2097143, 0x00004801, 0x00005801, 20 },
++  {    4194301, 0x00000c01, 0x00001401, 21 },
++  {    8388593, 0x00001e01, 0x00002201, 22 },
++  {   16777213, 0x00000301, 0x00000501, 23 },
++  {   33554393, 0x00001381, 0x00001481, 24 },
++  {   67108859, 0x00000141, 0x000001c1, 25 },
++  {  134217689, 0x000004e1, 0x00000521, 26 },
++  {  268435399, 0x00000391, 0x000003b1, 27 },
++  {  536870909, 0x00000019, 0x00000029, 28 },
++  { 1073741789, 0x0000008d, 0x00000095, 29 },
++  { 2147483647, 0x00000003, 0x00000007, 30 },
++  /* Avoid "decimal constant so large it is unsigned" for 4294967291.  */
++  { 0xfffffffb, 0x00000006, 0x00000008, 31 }
++};
++
++/* The following function returns an index into the above table of the
++   nearest prime number which is greater than N, and near a power of two. */
++
++static unsigned int
++higher_prime_index (unsigned long n)
++{
++  unsigned int low = 0;
++  unsigned int high = sizeof(prime_tab) / sizeof(prime_tab[0]);
++
++  while (low != high)
++    {
++      unsigned int mid = low + (high - low) / 2;
++      if (n > prime_tab[mid].prime)
++	low = mid + 1;
++      else
++	high = mid;
++    }
++
++  /* If we've run out of primes, abort.  */
++  if (n > prime_tab[low].prime)
++    abort ();
++
++  return low;
++}
++
++/* Return the current size of given hash table.  */
++
++static inline size_t
++htab_size (htab_t htab)
++{
++  return htab->size;
++}
++
++/* Return the current number of elements in given hash table. */
++
++static inline size_t
++htab_elements (htab_t htab)
++{
++  return htab->n_elements - htab->n_deleted;
++}
++
++/* Return X % Y.  */
++
++static inline hashval_t
++htab_mod_1 (hashval_t x, hashval_t y, hashval_t inv, int shift)
++{
++  /* The multiplicative inverses computed above are for 32-bit types, and
++     requires that we be able to compute a highpart multiply.  */
++  if (sizeof (hashval_t) * __CHAR_BIT__ <= 32)
++    {
++      hashval_t t1, t2, t3, t4, q, r;
++
++      t1 = ((unsigned long long)x * inv) >> 32;
++      t2 = x - t1;
++      t3 = t2 >> 1;
++      t4 = t1 + t3;
++      q  = t4 >> shift;
++      r  = x - (q * y);
++
++      return r;
++    }
++
++  /* Otherwise just use the native division routines.  */
++  return x % y;
++}
++
++/* Compute the primary hash for HASH given HTAB's current size.  */
++
++static inline hashval_t
++htab_mod (hashval_t hash, htab_t htab)
++{
++  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
++  return htab_mod_1 (hash, p->prime, p->inv, p->shift);
++}
++
++/* Compute the secondary hash for HASH given HTAB's current size.  */
++
++static inline hashval_t
++htab_mod_m2 (hashval_t hash, htab_t htab)
++{
++  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
++  return 1 + htab_mod_1 (hash, p->prime - 2, p->inv_m2, p->shift);
++}
++
++/* Create hash table of size SIZE.  */
++
++static htab_t
++htab_create (size_t size)
++{
++  htab_t result;
++  unsigned int size_prime_index;
++
++  size_prime_index = higher_prime_index (size);
++  size = prime_tab[size_prime_index].prime;
++
++  result = (htab_t) htab_alloc (sizeof (struct htab)
++				+ size * sizeof (hash_entry_type));
++  result->size = size;
++  result->n_elements = 0;
++  result->n_deleted = 0;
++  result->size_prime_index = size_prime_index;
++  memset (result->entries, 0, size * sizeof (hash_entry_type));
++  return result;
++}
++
++/* Similar to htab_find_slot, but without several unwanted side effects:
++    - Does not call htab_eq when it finds an existing entry.
++    - Does not change the count of elements in the hash table.
++   This function also assumes there are no deleted entries in the table.
++   HASH is the hash value for the element to be inserted.  */
++
++static hash_entry_type *
++find_empty_slot_for_expand (htab_t htab, hashval_t hash)
++{
++  hashval_t index = htab_mod (hash, htab);
++  size_t size = htab_size (htab);
++  hash_entry_type *slot = htab->entries + index;
++  hashval_t hash2;
++
++  if (*slot == HTAB_EMPTY_ENTRY)
++    return slot;
++  else if (*slot == HTAB_DELETED_ENTRY)
++    abort ();
++
++  hash2 = htab_mod_m2 (hash, htab);
++  for (;;)
++    {
++      index += hash2;
++      if (index >= size)
++	index -= size;
++
++      slot = htab->entries + index;
++      if (*slot == HTAB_EMPTY_ENTRY)
++	return slot;
++      else if (*slot == HTAB_DELETED_ENTRY)
++	abort ();
++    }
++}
++
++/* The following function changes size of memory allocated for the
++   entries and repeatedly inserts the table elements.  The occupancy
++   of the table after the call will be about 50%.  Naturally the hash
++   table must already exist.  Remember also that the place of the
++   table entries is changed.  */
++
++static htab_t
++htab_expand (htab_t htab)
++{
++  htab_t nhtab;
++  hash_entry_type *olimit;
++  hash_entry_type *p;
++  size_t osize, elts;
++
++  osize = htab->size;
++  olimit = htab->entries + osize;
++  elts = htab_elements (htab);
++
++  /* Resize only when table after removal of unused elements is either
++     too full or too empty.  */
++  if (elts * 2 > osize || (elts * 8 < osize && osize > 32))
++    nhtab = htab_create (elts * 2);
++  else
++    nhtab = htab_create (osize - 1);
++  nhtab->n_elements = htab->n_elements - htab->n_deleted;
++
++  p = htab->entries;
++  do
++    {
++      hash_entry_type x = *p;
++
++      if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY)
++	*find_empty_slot_for_expand (nhtab, htab_hash (x)) = x;
++
++      p++;
++    }
++  while (p < olimit);
++
++  htab_free (htab);
++  return nhtab;
++}
++
++/* This function searches for a hash table entry equal to the given
++   element.  It cannot be used to insert or delete an element.  */
++
++static hash_entry_type
++htab_find (htab_t htab, const hash_entry_type element)
++{
++  hashval_t index, hash2, hash = htab_hash (element);
++  size_t size;
++  hash_entry_type entry;
++
++  size = htab_size (htab);
++  index = htab_mod (hash, htab);
++
++  entry = htab->entries[index];
++  if (entry == HTAB_EMPTY_ENTRY
++      || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
++    return entry;
++
++  hash2 = htab_mod_m2 (hash, htab);
++  for (;;)
++    {
++      index += hash2;
++      if (index >= size)
++	index -= size;
++
++      entry = htab->entries[index];
++      if (entry == HTAB_EMPTY_ENTRY
++	  || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
++	return entry;
++    }
++}
++
++/* This function searches for a hash table slot containing an entry
++   equal to the given element.  To delete an entry, call this with
++   insert=NO_INSERT, then call htab_clear_slot on the slot returned
++   (possibly after doing some checks).  To insert an entry, call this
++   with insert=INSERT, then write the value you want into the returned
++   slot.  */
++
++static hash_entry_type *
++htab_find_slot (htab_t *htabp, const hash_entry_type element,
++		enum insert_option insert)
++{
++  hash_entry_type *first_deleted_slot;
++  hashval_t index, hash2, hash = htab_hash (element);
++  size_t size;
++  hash_entry_type entry;
++  htab_t htab = *htabp;
++
++  size = htab_size (htab);
++  if (insert == INSERT && size * 3 <= htab->n_elements * 4)
++    {
++      htab = *htabp = htab_expand (htab);
++      size = htab_size (htab);
++    }
++
++  index = htab_mod (hash, htab);
++
++  first_deleted_slot = NULL;
++
++  entry = htab->entries[index];
++  if (entry == HTAB_EMPTY_ENTRY)
++    goto empty_entry;
++  else if (entry == HTAB_DELETED_ENTRY)
++    first_deleted_slot = &htab->entries[index];
++  else if (htab_eq (entry, element))
++    return &htab->entries[index];
++
++  hash2 = htab_mod_m2 (hash, htab);
++  for (;;)
++    {
++      index += hash2;
++      if (index >= size)
++	index -= size;
++
++      entry = htab->entries[index];
++      if (entry == HTAB_EMPTY_ENTRY)
++	goto empty_entry;
++      else if (entry == HTAB_DELETED_ENTRY)
++	{
++	  if (!first_deleted_slot)
++	    first_deleted_slot = &htab->entries[index];
++	}
++      else if (htab_eq (entry, element))
++	return &htab->entries[index];
++    }
++
++ empty_entry:
++  if (insert == NO_INSERT)
++    return NULL;
++
++  if (first_deleted_slot)
++    {
++      htab->n_deleted--;
++      *first_deleted_slot = HTAB_EMPTY_ENTRY;
++      return first_deleted_slot;
++    }
++
++  htab->n_elements++;
++  return &htab->entries[index];
++}
++
++/* This function clears a specified slot in a hash table.  It is
++   useful when you've already done the lookup and don't want to do it
++   again.  */
++
++static inline void
++htab_clear_slot (htab_t htab, hash_entry_type *slot)
++{
++  if (slot < htab->entries || slot >= htab->entries + htab_size (htab)
++      || *slot == HTAB_EMPTY_ENTRY || *slot == HTAB_DELETED_ENTRY)
++    abort ();
++
++  *slot = HTAB_DELETED_ENTRY;
++  htab->n_deleted++;
++}
++
++/* Returns a hash code for pointer P. Simplified version of evahash */
++
++static inline hashval_t
++hash_pointer (const void *p)
++{
++  uintptr_t v = (uintptr_t) p;
++  if (sizeof (v) > sizeof (hashval_t))
++    v ^= v >> (sizeof (uintptr_t) / 2 * __CHAR_BIT__);
++  return v;
++}
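As the comment at the top of this header says, the includer defines hash_entry_type, htab_alloc and htab_free before including it and supplies htab_hash and htab_eq afterwards; task.c uses it this way for the depend-clause hash, keying entries by address. A minimal stand-alone sketch of that protocol (hypothetical test program, not libgomp code):

#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>

struct entry { void *addr; int value; };
typedef struct entry *hash_entry_type;

static void *
htab_alloc (size_t size)
{
  void *p = malloc (size);
  if (p == NULL)
    abort ();
  return p;
}

static void
htab_free (void *ptr)
{
  free (ptr);
}

#include "hashtab.h"

static inline hashval_t
htab_hash (hash_entry_type element)
{
  return hash_pointer (element->addr);
}

static inline bool
htab_eq (hash_entry_type x, hash_entry_type y)
{
  return x->addr == y->addr;
}

int
main (void)
{
  int dummy;
  struct entry stored = { &dummy, 42 };
  struct entry key = { &dummy, 0 };
  htab_t tab = htab_create (4);

  /* Insertion: locate (or create) the slot, then fill it in.  */
  hash_entry_type *slot = htab_find_slot (&tab, &stored, INSERT);
  *slot = &stored;

  /* Lookup by address only, the way task.c probes depend_hash.  */
  hash_entry_type found = htab_find (tab, &key);
  return (found != NULL && found->value == 42) ? 0 : 1;
}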
+--- libgomp/work.c	(revision 210461)
++++ libgomp/work.c	(revision 210462)
+@@ -221,7 +221,10 @@ gomp_work_share_end (void)
+   if (gomp_barrier_last_thread (bstate))
+     {
+       if (__builtin_expect (thr->ts.last_work_share != NULL, 1))
+-	free_work_share (team, thr->ts.last_work_share);
++	{
++	  team->work_shares_to_free = thr->ts.work_share;
++	  free_work_share (team, thr->ts.last_work_share);
++	}
+     }
+ 
+   gomp_team_barrier_wait_end (&team->barrier, bstate);
+@@ -229,6 +232,32 @@ gomp_work_share_end (void)
+ }
+ 
+ /* The current thread is done with its current work sharing construct.
++   This version implies a cancellable barrier at the end of the work-share.  */
++
++bool
++gomp_work_share_end_cancel (void)
++{
++  struct gomp_thread *thr = gomp_thread ();
++  struct gomp_team *team = thr->ts.team;
++  gomp_barrier_state_t bstate;
++
++  /* Cancellable work sharing constructs cannot be orphaned.  */
++  bstate = gomp_barrier_wait_cancel_start (&team->barrier);
++
++  if (gomp_barrier_last_thread (bstate))
++    {
++      if (__builtin_expect (thr->ts.last_work_share != NULL, 1))
++	{
++	  team->work_shares_to_free = thr->ts.work_share;
++	  free_work_share (team, thr->ts.last_work_share);
++	}
++    }
++  thr->ts.last_work_share = NULL;
++
++  return gomp_team_barrier_wait_cancel_end (&team->barrier, bstate);
++}
++
++/* The current thread is done with its current work sharing construct.
+    This version does NOT imply a barrier at the end of the work-share.  */
+ 
+ void
+@@ -259,6 +288,9 @@ gomp_work_share_end_nowait (void)
+ #endif
+ 
+   if (completed == team->nthreads)
+-    free_work_share (team, thr->ts.last_work_share);
++    {
++      team->work_shares_to_free = thr->ts.work_share;
++      free_work_share (team, thr->ts.last_work_share);
++    }
+   thr->ts.last_work_share = NULL;
+ }
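gomp_work_share_end_cancel ends a work share over a cancellable barrier and reports whether the construct was cancelled; it is what the new GOMP_loop_end_cancel in loop.c above calls into. A user-level sketch of the cancellation it supports (illustrative, not part of the patch; the function is invented, and libgomp only acts on it when the OMP_CANCELLATION environment variable is set to true):

int
has_negative (const double *a, long n)
{
  long i;
  int found = 0;
  #pragma omp parallel for shared(found)
  for (i = 0; i < n; i++)
    {
      if (a[i] < 0)
        {
          #pragma omp atomic write
          found = 1;
          #pragma omp cancel for            /* request cancelling the loop */
        }
      #pragma omp cancellation point for    /* other threads notice it here */
    }
  return found;
}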
+--- libgomp/config/linux/proc.c	(revision 210461)
++++ libgomp/config/linux/proc.c	(revision 210462)
+@@ -30,6 +30,7 @@
+ #endif
+ #include "libgomp.h"
+ #include "proc.h"
++#include <errno.h>
+ #include <stdlib.h>
+ #include <unistd.h>
+ #ifdef HAVE_GETLOADAVG
+@@ -39,19 +40,28 @@
+ #endif
+ 
+ #ifdef HAVE_PTHREAD_AFFINITY_NP
++unsigned long gomp_cpuset_size;
++static unsigned long gomp_get_cpuset_size;
++cpu_set_t *gomp_cpusetp;
++
+ unsigned long
+-gomp_cpuset_popcount (cpu_set_t *cpusetp)
++gomp_cpuset_popcount (unsigned long cpusetsize, cpu_set_t *cpusetp)
+ {
+-#ifdef CPU_COUNT
+-  /* glibc 2.6 and above provide a macro for this.  */
+-  return CPU_COUNT (cpusetp);
++#ifdef CPU_COUNT_S
++  /* glibc 2.7 and above provide a macro for this.  */
++  return CPU_COUNT_S (cpusetsize, cpusetp);
+ #else
++#ifdef CPU_COUNT
++  if (cpusetsize == sizeof (cpu_set_t))
++    /* glibc 2.6 and above provide a macro for this.  */
++    return CPU_COUNT (cpusetp);
++#endif
+   size_t i;
+   unsigned long ret = 0;
+-  extern int check[sizeof (cpusetp->__bits[0]) == sizeof (unsigned long int)];
++  extern int check[sizeof (cpusetp->__bits[0]) == sizeof (unsigned long int)
++		   ? 1 : -1] __attribute__((unused));
+ 
+-  (void) check;
+-  for (i = 0; i < sizeof (*cpusetp) / sizeof (cpusetp->__bits[0]); i++)
++  for (i = 0; i < cpusetsize / sizeof (cpusetp->__bits[0]); i++)
+     {
+       unsigned long int mask = cpusetp->__bits[i];
+       if (mask == 0)
+@@ -70,16 +80,63 @@ void
+ gomp_init_num_threads (void)
+ {
+ #ifdef HAVE_PTHREAD_AFFINITY_NP
+-  cpu_set_t cpuset;
++#if defined (_SC_NPROCESSORS_CONF) && defined (CPU_ALLOC_SIZE)
++  gomp_cpuset_size = sysconf (_SC_NPROCESSORS_CONF);
++  gomp_cpuset_size = CPU_ALLOC_SIZE (gomp_cpuset_size);
++#else
++  gomp_cpuset_size = sizeof (cpu_set_t);
++#endif
+ 
+-  if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset), &cpuset) == 0)
++  gomp_cpusetp = (cpu_set_t *) gomp_malloc (gomp_cpuset_size);
++  do
+     {
+-      /* Count only the CPUs this process can use.  */
+-      gomp_global_icv.nthreads_var = gomp_cpuset_popcount (&cpuset);
+-      if (gomp_global_icv.nthreads_var == 0)
+-	gomp_global_icv.nthreads_var = 1;
+-      return;
++      int ret = pthread_getaffinity_np (pthread_self (), gomp_cpuset_size,
++					gomp_cpusetp);
++      if (ret == 0)
++	{
++	  /* Count only the CPUs this process can use.  */
++	  gomp_global_icv.nthreads_var
++	    = gomp_cpuset_popcount (gomp_cpuset_size, gomp_cpusetp);
++	  if (gomp_global_icv.nthreads_var == 0)
++	    break;
++	  gomp_get_cpuset_size = gomp_cpuset_size;
++#ifdef CPU_ALLOC_SIZE
++	  unsigned long i;
++	  for (i = gomp_cpuset_size * 8; i; i--)
++	    if (CPU_ISSET_S (i - 1, gomp_cpuset_size, gomp_cpusetp))
++	      break;
++	  gomp_cpuset_size = CPU_ALLOC_SIZE (i);
++#endif
++	  return;
++	}
++      if (ret != EINVAL)
++	break;
++#ifdef CPU_ALLOC_SIZE
++      if (gomp_cpuset_size < sizeof (cpu_set_t))
++	gomp_cpuset_size = sizeof (cpu_set_t);
++      else
++	gomp_cpuset_size = gomp_cpuset_size * 2;
++      if (gomp_cpuset_size < 8 * sizeof (cpu_set_t))
++	gomp_cpusetp
++	  = (cpu_set_t *) gomp_realloc (gomp_cpusetp, gomp_cpuset_size);
++      else
++	{
++	  /* Avoid gomp_fatal if too large memory allocation would be
++	     requested, e.g. kernel returning EINVAL all the time.  */
++	  void *p = realloc (gomp_cpusetp, gomp_cpuset_size);
++	  if (p == NULL)
++	    break;
++	  gomp_cpusetp = (cpu_set_t *) p;
++	}
++#else
++      break;
++#endif
+     }
++  while (1);
++  gomp_cpuset_size = 0;
++  gomp_global_icv.nthreads_var = 1;
++  free (gomp_cpusetp);
++  gomp_cpusetp = NULL;
+ #endif
+ #ifdef _SC_NPROCESSORS_ONLN
+   gomp_global_icv.nthreads_var = sysconf (_SC_NPROCESSORS_ONLN);
+@@ -90,15 +147,14 @@ static int
+ get_num_procs (void)
+ {
+ #ifdef HAVE_PTHREAD_AFFINITY_NP
+-  cpu_set_t cpuset;
+-
+-  if (gomp_cpu_affinity == NULL)
++  if (gomp_places_list == NULL)
+     {
+       /* Count only the CPUs this process can use.  */
+-      if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset),
+-				  &cpuset) == 0)
++      if (gomp_cpusetp
++	  && pthread_getaffinity_np (pthread_self (), gomp_get_cpuset_size,
++				     gomp_cpusetp) == 0)
+ 	{
+-	  int ret = gomp_cpuset_popcount (&cpuset);
++	  int ret = gomp_cpuset_popcount (gomp_get_cpuset_size, gomp_cpusetp);
+ 	  return ret != 0 ? ret : 1;
+ 	}
+     }
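The rework above sizes the affinity mask dynamically (CPU_ALLOC_SIZE based on _SC_NPROCESSORS_CONF, retrying and growing on EINVAL) rather than assuming a fixed cpu_set_t, so counting usable CPUs keeps working on machines with more CPUs than the static set covers. A stand-alone sketch of the same glibc interfaces (hypothetical test program, not libgomp code):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int
main (void)
{
  long nconf = sysconf (_SC_NPROCESSORS_CONF);
  if (nconf < 1)
    nconf = 1;
  size_t setsize = CPU_ALLOC_SIZE (nconf);
  cpu_set_t *set = CPU_ALLOC (nconf);

  if (set == NULL)
    return 1;
  CPU_ZERO_S (setsize, set);
  if (pthread_getaffinity_np (pthread_self (), setsize, set) == 0)
    /* Count only the CPUs this process may run on, as get_num_procs does.  */
    printf ("usable CPUs: %d of %ld configured\n",
            CPU_COUNT_S (setsize, set), nconf);
  CPU_FREE (set);
  return 0;
}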
+--- libgomp/config/linux/bar.c	(revision 210461)
++++ libgomp/config/linux/bar.c	(revision 210462)
+@@ -33,11 +33,11 @@
+ void
+ gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+ {
+-  if (__builtin_expect ((state & 1) != 0, 0))
++  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+     {
+       /* Next time we'll be awaiting TOTAL threads again.  */
+       bar->awaited = bar->total;
+-      __atomic_store_n (&bar->generation, bar->generation + 4,
++      __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
+ 			MEMMODEL_RELEASE);
+       futex_wake ((int *) &bar->generation, INT_MAX);
+     }
+@@ -66,7 +66,7 @@ void
+ gomp_barrier_wait_last (gomp_barrier_t *bar)
+ {
+   gomp_barrier_state_t state = gomp_barrier_wait_start (bar);
+-  if (state & 1)
++  if (state & BAR_WAS_LAST)
+     gomp_barrier_wait_end (bar, state);
+ }
+ 
+@@ -81,40 +81,43 @@ gomp_team_barrier_wait_end (gomp_barrier
+ {
+   unsigned int generation, gen;
+ 
+-  if (__builtin_expect ((state & 1) != 0, 0))
++  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+     {
+       /* Next time we'll be awaiting TOTAL threads again.  */
+       struct gomp_thread *thr = gomp_thread ();
+       struct gomp_team *team = thr->ts.team;
+ 
+       bar->awaited = bar->total;
++      team->work_share_cancelled = 0;
+       if (__builtin_expect (team->task_count, 0))
+ 	{
+ 	  gomp_barrier_handle_tasks (state);
+-	  state &= ~1;
++	  state &= ~BAR_WAS_LAST;
+ 	}
+       else
+ 	{
+-	  __atomic_store_n (&bar->generation, state + 3, MEMMODEL_RELEASE);
++	  state &= ~BAR_CANCELLED;
++	  state += BAR_INCR - BAR_WAS_LAST;
++	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+ 	  futex_wake ((int *) &bar->generation, INT_MAX);
+ 	  return;
+ 	}
+     }
+ 
+   generation = state;
++  state &= ~BAR_CANCELLED;
+   do
+     {
+       do_wait ((int *) &bar->generation, generation);
+       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+-      if (__builtin_expect (gen & 1, 0))
++      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+ 	{
+ 	  gomp_barrier_handle_tasks (state);
+ 	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+ 	}
+-      if ((gen & 2) != 0)
+-	generation |= 2;
++      generation |= gen & BAR_WAITING_FOR_TASK;
+     }
+-  while (gen != state + 4);
++  while (gen != state + BAR_INCR);
+ }
+ 
+ void
+@@ -122,3 +125,86 @@ gomp_team_barrier_wait (gomp_barrier_t *
+ {
+   gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+ }
++
++void
++gomp_team_barrier_wait_final (gomp_barrier_t *bar)
++{
++  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
++  if (__builtin_expect (state & BAR_WAS_LAST, 0))
++    bar->awaited_final = bar->total;
++  gomp_team_barrier_wait_end (bar, state);
++}
++
++bool
++gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
++				   gomp_barrier_state_t state)
++{
++  unsigned int generation, gen;
++
++  if (__builtin_expect (state & BAR_WAS_LAST, 0))
++    {
++      /* Next time we'll be awaiting TOTAL threads again.  */
++      /* BAR_CANCELLED should never be set in state here, because
++	 cancellation means that at least one of the threads has been
++	 cancelled, thus on a cancellable barrier we should never see
++	 all threads to arrive.  */
++      struct gomp_thread *thr = gomp_thread ();
++      struct gomp_team *team = thr->ts.team;
++
++      bar->awaited = bar->total;
++      team->work_share_cancelled = 0;
++      if (__builtin_expect (team->task_count, 0))
++	{
++	  gomp_barrier_handle_tasks (state);
++	  state &= ~BAR_WAS_LAST;
++	}
++      else
++	{
++	  state += BAR_INCR - BAR_WAS_LAST;
++	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
++	  futex_wake ((int *) &bar->generation, INT_MAX);
++	  return false;
++	}
++    }
++
++  if (__builtin_expect (state & BAR_CANCELLED, 0))
++    return true;
++
++  generation = state;
++  do
++    {
++      do_wait ((int *) &bar->generation, generation);
++      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
++      if (__builtin_expect (gen & BAR_CANCELLED, 0))
++	return true;
++      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
++	{
++	  gomp_barrier_handle_tasks (state);
++	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
++	}
++      generation |= gen & BAR_WAITING_FOR_TASK;
++    }
++  while (gen != state + BAR_INCR);
++
++  return false;
++}
++
++bool
++gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
++{
++  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
++}
++
++void
++gomp_team_barrier_cancel (struct gomp_team *team)
++{
++  gomp_mutex_lock (&team->task_lock);
++  if (team->barrier.generation & BAR_CANCELLED)
++    {
++      gomp_mutex_unlock (&team->task_lock);
++      return;
++    }
++  team->barrier.generation |= BAR_CANCELLED;
++  gomp_mutex_unlock (&team->task_lock);
++  futex_wake ((int *) &team->barrier.generation, INT_MAX);
++}
+--- libgomp/config/linux/proc.h	(revision 210461)
++++ libgomp/config/linux/proc.h	(revision 210462)
+@@ -28,7 +28,10 @@
+ #include <sched.h>
+ 
+ #ifdef HAVE_PTHREAD_AFFINITY_NP
+-extern unsigned long gomp_cpuset_popcount (cpu_set_t *);
++extern unsigned long gomp_cpuset_size attribute_hidden;
++extern cpu_set_t *gomp_cpusetp attribute_hidden;
++extern unsigned long gomp_cpuset_popcount (unsigned long, cpu_set_t *)
++     attribute_hidden;
+ #endif
+ 
+ #endif /* GOMP_PROC_H */
+--- libgomp/config/linux/affinity.c	(revision 210461)
++++ libgomp/config/linux/affinity.c	(revision 210462)
+@@ -29,90 +29,327 @@
+ #endif
+ #include "libgomp.h"
+ #include "proc.h"
++#include <errno.h>
+ #include <stdlib.h>
++#include <stdio.h>
++#include <string.h>
+ #include <unistd.h>
+ 
+ #ifdef HAVE_PTHREAD_AFFINITY_NP
+ 
+-static unsigned int affinity_counter;
++#ifndef CPU_ALLOC_SIZE
++#define CPU_ISSET_S(idx, size, set) CPU_ISSET(idx, set)
++#define CPU_ZERO_S(size, set) CPU_ZERO(set)
++#define CPU_SET_S(idx, size, set) CPU_SET(idx, set)
++#define CPU_CLR_S(idx, size, set) CPU_CLR(idx, set)
++#endif
+ 
+ void
+ gomp_init_affinity (void)
+ {
+-  cpu_set_t cpuset, cpusetnew;
+-  size_t idx, widx;
+-  unsigned long cpus = 0;
+-
+-  if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset), &cpuset))
+-    {
+-      gomp_error ("could not get CPU affinity set");
+-      free (gomp_cpu_affinity);
+-      gomp_cpu_affinity = NULL;
+-      gomp_cpu_affinity_len = 0;
+-      return;
+-    }
+-
+-  CPU_ZERO (&cpusetnew);
+-  if (gomp_cpu_affinity_len == 0)
+-    {
+-      unsigned long count = gomp_cpuset_popcount (&cpuset);
+-      if (count >= 65536)
+-	count = 65536;
+-      gomp_cpu_affinity = malloc (count * sizeof (unsigned short));
+-      if (gomp_cpu_affinity == NULL)
++  if (gomp_places_list == NULL)
++    {
++      if (!gomp_affinity_init_level (1, ULONG_MAX, true))
++	return;
++    }
++
++  struct gomp_thread *thr = gomp_thread ();
++  pthread_setaffinity_np (pthread_self (), gomp_cpuset_size,
++			  (cpu_set_t *) gomp_places_list[0]);
++  thr->place = 1;
++  thr->ts.place_partition_off = 0;
++  thr->ts.place_partition_len = gomp_places_list_len;
++}
++
++void
++gomp_init_thread_affinity (pthread_attr_t *attr, unsigned int place)
++{
++  pthread_attr_setaffinity_np (attr, gomp_cpuset_size,
++			       (cpu_set_t *) gomp_places_list[place]);
++}
++
++void **
++gomp_affinity_alloc (unsigned long count, bool quiet)
++{
++  unsigned long i;
++  void **ret;
++  char *p;
++
++  if (gomp_cpusetp == NULL)
++    {
++      if (!quiet)
++	gomp_error ("Could not get CPU affinity set");
++      return NULL;
++    }
++
++  ret = malloc (count * sizeof (void *) + count * gomp_cpuset_size);
++  if (ret == NULL)
++    {
++      if (!quiet)
++	gomp_error ("Out of memory trying to allocate places list");
++      return NULL;
++    }
++
++  p = (char *) (ret + count);
++  for (i = 0; i < count; i++, p += gomp_cpuset_size)
++    ret[i] = p;
++  return ret;
++}
++
++void
++gomp_affinity_init_place (void *p)
++{
++  cpu_set_t *cpusetp = (cpu_set_t *) p;
++  CPU_ZERO_S (gomp_cpuset_size, cpusetp);
++}
++
++bool
++gomp_affinity_add_cpus (void *p, unsigned long num,
++			unsigned long len, long stride, bool quiet)
++{
++  cpu_set_t *cpusetp = (cpu_set_t *) p;
++  unsigned long max = 8 * gomp_cpuset_size;
++  for (;;)
++    {
++      if (num >= max)
+ 	{
+-	  gomp_error ("not enough memory to store CPU affinity list");
+-	  return;
++	  if (!quiet)
++	    gomp_error ("Logical CPU number %lu out of range", num);
++	  return false;
+ 	}
+-      for (widx = idx = 0; widx < count && idx < 65536; idx++)
+-	if (CPU_ISSET (idx, &cpuset))
++      CPU_SET_S (num, gomp_cpuset_size, cpusetp);
++      if (--len == 0)
++	return true;
++      if ((stride < 0 && num + stride > num)
++	  || (stride > 0 && num + stride < num))
++	{
++	  if (!quiet)
++	    gomp_error ("Logical CPU number %lu+%ld out of range",
++			num, stride);
++	  return false;
++	}
++      num += stride;
++    }
++}
++
++bool
++gomp_affinity_remove_cpu (void *p, unsigned long num)
++{
++  cpu_set_t *cpusetp = (cpu_set_t *) p;
++  if (num >= 8 * gomp_cpuset_size)
++    {
++      gomp_error ("Logical CPU number %lu out of range", num);
++      return false;
++    }
++  if (!CPU_ISSET_S (num, gomp_cpuset_size, cpusetp))
++    {
++      gomp_error ("Logical CPU %lu to be removed is not in the set", num);
++      return false;
++    }
++  CPU_CLR_S (num, gomp_cpuset_size, cpusetp);
++  return true;
++}
++
++bool
++gomp_affinity_copy_place (void *p, void *q, long stride)
++{
++  unsigned long i, max = 8 * gomp_cpuset_size;
++  cpu_set_t *destp = (cpu_set_t *) p;
++  cpu_set_t *srcp = (cpu_set_t *) q;
++
++  CPU_ZERO_S (gomp_cpuset_size, destp);
++  for (i = 0; i < max; i++)
++    if (CPU_ISSET_S (i, gomp_cpuset_size, srcp))
++      {
++	if ((stride < 0 && i + stride > i)
++	    || (stride > 0 && (i + stride < i || i + stride >= max)))
++	  {
++	    gomp_error ("Logical CPU number %lu+%ld out of range", i, stride);
++	    return false;
++	  }
++	CPU_SET_S (i + stride, gomp_cpuset_size, destp);
++      }
++  return true;
++}
++
++bool
++gomp_affinity_same_place (void *p, void *q)
++{
++#ifdef CPU_EQUAL_S
++  return CPU_EQUAL_S (gomp_cpuset_size, (cpu_set_t *) p, (cpu_set_t *) q);
++#else
++  return memcmp (p, q, gomp_cpuset_size) == 0;
++#endif
++}
++
++bool
++gomp_affinity_finalize_place_list (bool quiet)
++{
++  unsigned long i, j;
++
++  for (i = 0, j = 0; i < gomp_places_list_len; i++)
++    {
++      cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[i];
++      bool nonempty = false;
++#ifdef CPU_AND_S
++      CPU_AND_S (gomp_cpuset_size, cpusetp, cpusetp, gomp_cpusetp);
++      nonempty = gomp_cpuset_popcount (gomp_cpuset_size, cpusetp) != 0;
++#else
++      unsigned long k, max = gomp_cpuset_size / sizeof (cpusetp->__bits[0]);
++      for (k = 0; k < max; k++)
++	if ((cpusetp->__bits[k] &= gomp_cpusetp->__bits[k]) != 0)
++	  nonempty = true;
++#endif
++      if (nonempty)
++	gomp_places_list[j++] = gomp_places_list[i];
++    }
++
++  if (j == 0)
++    {
++      if (!quiet)
++	gomp_error ("None of the places contain usable logical CPUs");
++      return false;
++    }
++  else if (j < gomp_places_list_len)
++    {
++      if (!quiet)
++	gomp_error ("Number of places reduced from %ld to %ld because some "
++		    "places didn't contain any usable logical CPUs",
++		    gomp_places_list_len, j);
++      gomp_places_list_len = j;
++    }
++  return true;
++}
++
++bool
++gomp_affinity_init_level (int level, unsigned long count, bool quiet)
++{
++  unsigned long i, max = 8 * gomp_cpuset_size;
++
++  if (gomp_cpusetp)
++    {
++      unsigned long maxcount
++	= gomp_cpuset_popcount (gomp_cpuset_size, gomp_cpusetp);
++      if (count > maxcount)
++	count = maxcount;
++    }
++  gomp_places_list = gomp_affinity_alloc (count, quiet);
++  gomp_places_list_len = 0;
++  if (gomp_places_list == NULL)
++    return false;
++  /* SMT (threads).  */
++  if (level == 1)
++    {
++      for (i = 0; i < max && gomp_places_list_len < count; i++)
++	if (CPU_ISSET_S (i, gomp_cpuset_size, gomp_cpusetp))
+ 	  {
+-	    cpus++;
+-	    gomp_cpu_affinity[widx++] = idx;
++	    gomp_affinity_init_place (gomp_places_list[gomp_places_list_len]);
++	    gomp_affinity_add_cpus (gomp_places_list[gomp_places_list_len],
++				    i, 1, 0, true);
++	    ++gomp_places_list_len;
+ 	  }
++      return true;
+     }
+   else
+-    for (widx = idx = 0; idx < gomp_cpu_affinity_len; idx++)
+-      if (gomp_cpu_affinity[idx] < CPU_SETSIZE
+-	  && CPU_ISSET (gomp_cpu_affinity[idx], &cpuset))
++    {
++      char name[sizeof ("/sys/devices/system/cpu/cpu/topology/"
++			"thread_siblings_list") + 3 * sizeof (unsigned long)];
++      size_t prefix_len = sizeof ("/sys/devices/system/cpu/cpu") - 1;
++      cpu_set_t *copy = gomp_alloca (gomp_cpuset_size);
++      FILE *f;
++      char *line = NULL;
++      size_t linelen = 0;
++
++      memcpy (name, "/sys/devices/system/cpu/cpu", prefix_len);
++      memcpy (copy, gomp_cpusetp, gomp_cpuset_size);
++      for (i = 0; i < max && gomp_places_list_len < count; i++)
++	if (CPU_ISSET_S (i, gomp_cpuset_size, copy))
++	  {
++	    sprintf (name + prefix_len, "%lu/topology/%s_siblings_list",
++		     i, level == 2 ? "thread" : "core");
++	    f = fopen (name, "r");
++	    if (f != NULL)
++	      {
++		if (getline (&line, &linelen, f) > 0)
++		  {
++		    char *p = line;
++		    bool seen_i = false;
++		    void *pl = gomp_places_list[gomp_places_list_len];
++		    gomp_affinity_init_place (pl);
++		    while (*p && *p != '\n')
++		      {
++			unsigned long first, last;
++			errno = 0;
++			first = strtoul (p, &p, 10);
++			if (errno)
++			  break;
++			last = first;
++			if (*p == '-')
++			  {
++			    errno = 0;
++			    last = strtoul (p + 1, &p, 10);
++			    if (errno || last < first)
++			      break;
++			  }
++			for (; first <= last; first++)
++			  if (CPU_ISSET_S (first, gomp_cpuset_size, copy)
++			      && gomp_affinity_add_cpus (pl, first, 1, 0,
++							 true))
++			    {
++			      CPU_CLR_S (first, gomp_cpuset_size, copy);
++			      if (first == i)
++				seen_i = true;
++			    }
++			if (*p == ',')
++			  ++p;
++		      }
++		    if (seen_i)
++		      gomp_places_list_len++;
++		  }
++		fclose (f);
++	      }
++	  }
++      if (gomp_places_list_len == 0)
+ 	{
+-	  if (! CPU_ISSET (gomp_cpu_affinity[idx], &cpusetnew))
+-	    {
+-	      cpus++;
+-	      CPU_SET (gomp_cpu_affinity[idx], &cpusetnew);
+-	    }
+-	  gomp_cpu_affinity[widx++] = gomp_cpu_affinity[idx];
++	  if (!quiet)
++	    gomp_error ("Error reading %s topology",
++			level == 2 ? "core" : "socket");
++	  free (gomp_places_list);
++	  gomp_places_list = NULL;
++	  return false;
+ 	}
+-
+-  if (widx == 0)
+-    {
+-      gomp_error ("no CPUs left for affinity setting");
+-      free (gomp_cpu_affinity);
+-      gomp_cpu_affinity = NULL;
+-      gomp_cpu_affinity_len = 0;
+-      return;
+-    }
+-
+-  gomp_cpu_affinity_len = widx;
+-  if (cpus < gomp_available_cpus)
+-    gomp_available_cpus = cpus;
+-  CPU_ZERO (&cpuset);
+-  CPU_SET (gomp_cpu_affinity[0], &cpuset);
+-  pthread_setaffinity_np (pthread_self (), sizeof (cpuset), &cpuset);
+-  affinity_counter = 1;
++      return true;
++    }
++  return false;
+ }
+ 
+ void
+-gomp_init_thread_affinity (pthread_attr_t *attr)
++gomp_affinity_print_place (void *p)
+ {
+-  unsigned int cpu;
+-  cpu_set_t cpuset;
++  unsigned long i, max = 8 * gomp_cpuset_size, len;
++  cpu_set_t *cpusetp = (cpu_set_t *) p;
++  bool notfirst = false;
+ 
+-  cpu = __atomic_fetch_add (&affinity_counter, 1, MEMMODEL_RELAXED);
+-  cpu %= gomp_cpu_affinity_len;
+-  CPU_ZERO (&cpuset);
+-  CPU_SET (gomp_cpu_affinity[cpu], &cpuset);
+-  pthread_attr_setaffinity_np (attr, sizeof (cpu_set_t), &cpuset);
++  for (i = 0, len = 0; i < max; i++)
++    if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp))
++      {
++	if (len == 0)
++	  {
++	    if (notfirst)
++	      fputc (',', stderr);
++	    notfirst = true;
++	    fprintf (stderr, "%lu", i);
++	  }
++	++len;
++      }
++    else
++      {
++	if (len > 1)
++	  fprintf (stderr, ":%lu", len);
++	len = 0;
++      }
++  if (len > 1)
++    fprintf (stderr, ":%lu", len);
+ }
+ 
+ #else
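gomp_affinity_init_level above builds core and socket places by reading /sys/devices/system/cpu/cpu<N>/topology/thread_siblings_list or core_siblings_list, whose contents are comma-separated ranges such as 0-3,8-11. A stand-alone sketch of the same strtoul parsing loop (hypothetical helper and test, not libgomp code):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Print each CPU named by a siblings_list string such as "0-3,8-11".  */
static void
walk_siblings_list (const char *line)
{
  const char *p = line;
  while (*p && *p != '\n')
    {
      char *end;
      unsigned long first, last;

      errno = 0;
      first = strtoul (p, &end, 10);
      if (errno || end == p)
        break;
      p = end;
      last = first;
      if (*p == '-')
        {
          errno = 0;
          last = strtoul (p + 1, &end, 10);
          if (errno || last < first)
            break;
          p = end;
        }
      for (; first <= last; first++)
        printf ("cpu %lu\n", first);
      if (*p == ',')
        ++p;
    }
}

int
main (void)
{
  walk_siblings_list ("0-3,8-11\n");
  return 0;
}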
+--- libgomp/config/linux/bar.h	(revision 210461)
++++ libgomp/config/linux/bar.h	(revision 210462)
+@@ -38,13 +38,25 @@ typedef struct
+   unsigned total __attribute__((aligned (64)));
+   unsigned generation;
+   unsigned awaited __attribute__((aligned (64)));
++  unsigned awaited_final;
+ } gomp_barrier_t;
++
+ typedef unsigned int gomp_barrier_state_t;
+ 
++/* The generation field contains a counter in the high bits, with a few
++   low bits dedicated to flags.  Note that TASK_PENDING and WAS_LAST can
++   share space because WAS_LAST is never stored back to generation.  */
++#define BAR_TASK_PENDING	1
++#define BAR_WAS_LAST		1
++#define BAR_WAITING_FOR_TASK	2
++#define BAR_CANCELLED		4
++#define BAR_INCR		8
++
+ static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
+ {
+   bar->total = count;
+   bar->awaited = count;
++  bar->awaited_final = count;
+   bar->generation = 0;
+ }
+ 
+@@ -62,27 +74,55 @@ extern void gomp_barrier_wait (gomp_barr
+ extern void gomp_barrier_wait_last (gomp_barrier_t *);
+ extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
+ extern void gomp_team_barrier_wait (gomp_barrier_t *);
++extern void gomp_team_barrier_wait_final (gomp_barrier_t *);
+ extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
+ 					gomp_barrier_state_t);
++extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
++extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
++					       gomp_barrier_state_t);
+ extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
++struct gomp_team;
++extern void gomp_team_barrier_cancel (struct gomp_team *);
+ 
+ static inline gomp_barrier_state_t
+ gomp_barrier_wait_start (gomp_barrier_t *bar)
+ {
+-  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) & ~3;
++  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
++  ret &= -BAR_INCR | BAR_CANCELLED;
+   /* A memory barrier is needed before exiting from the various forms
+      of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section
+      2.8.6 flush Construct, which says there is an implicit flush during
+      a barrier region.  This is a convenient place to add the barrier,
+      so we use MEMMODEL_ACQ_REL here rather than MEMMODEL_ACQUIRE.  */
+-  ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0;
++  if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0)
++    ret |= BAR_WAS_LAST;
++  return ret;
++}
++
++static inline gomp_barrier_state_t
++gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
++{
++  return gomp_barrier_wait_start (bar);
++}
++
++/* This is like gomp_barrier_wait_start, except it decrements
++   bar->awaited_final rather than bar->awaited and should be used
++   for the gomp_team_end barrier only.  */
++static inline gomp_barrier_state_t
++gomp_barrier_wait_final_start (gomp_barrier_t *bar)
++{
++  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
++  ret &= -BAR_INCR | BAR_CANCELLED;
++  /* See above gomp_barrier_wait_start comment.  */
++  if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_ACQ_REL) == 0)
++    ret |= BAR_WAS_LAST;
+   return ret;
+ }
+ 
+ static inline bool
+ gomp_barrier_last_thread (gomp_barrier_state_t state)
+ {
+-  return state & 1;
++  return state & BAR_WAS_LAST;
+ }
+ 
+ /* All the inlines below must be called with team->task_lock
+@@ -91,31 +131,37 @@ gomp_barrier_last_thread (gomp_barrier_s
+ static inline void
+ gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
+ {
+-  bar->generation |= 1;
++  bar->generation |= BAR_TASK_PENDING;
+ }
+ 
+ static inline void
+ gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
+ {
+-  bar->generation &= ~1;
++  bar->generation &= ~BAR_TASK_PENDING;
+ }
+ 
+ static inline void
+ gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
+ {
+-  bar->generation |= 2;
++  bar->generation |= BAR_WAITING_FOR_TASK;
+ }
+ 
+ static inline bool
+ gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
+ {
+-  return (bar->generation & 2) != 0;
++  return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
++}
++
++static inline bool
++gomp_team_barrier_cancelled (gomp_barrier_t *bar)
++{
++  return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
+ }
+ 
+ static inline void
+ gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
+ {
+-  bar->generation = (state & ~3) + 4;
++  bar->generation = (state & -BAR_INCR) + BAR_INCR;
+ }
+ 
+ #endif /* GOMP_BARRIER_H */
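The BAR_* macros replace the bare 1, 2, 3 and 4 constants used before: the generation counter advances in steps of BAR_INCR while the low bits carry the task-pending, waiting-for-task and cancelled flags, and BAR_WAS_LAST can share bit 0 because it is never stored back into the generation word. A minimal sketch of the bit arithmetic the barrier code relies on (hypothetical test mirroring the expressions above):

#include <assert.h>

#define BAR_TASK_PENDING	1
#define BAR_WAS_LAST		1
#define BAR_WAITING_FOR_TASK	2
#define BAR_CANCELLED		4
#define BAR_INCR		8

int
main (void)
{
  unsigned int generation = 5 * BAR_INCR | BAR_CANCELLED | BAR_TASK_PENDING;

  /* gomp_barrier_wait_start keeps the counter and the CANCELLED flag only.  */
  unsigned int state = generation & (-BAR_INCR | BAR_CANCELLED);
  assert (state == (5 * BAR_INCR | BAR_CANCELLED));

  /* gomp_team_barrier_done publishes the next generation with flags clear.  */
  assert (((state & -BAR_INCR) + BAR_INCR) == 6 * BAR_INCR);

  /* The last arriver turns its WAS_LAST marker into the next generation,
     as in state += BAR_INCR - BAR_WAS_LAST.  */
  unsigned int last = 5 * BAR_INCR | BAR_WAS_LAST;
  assert (last + BAR_INCR - BAR_WAS_LAST == 6 * BAR_INCR);
  return 0;
}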
+--- libgomp/config/posix/bar.c	(revision 210461)
++++ libgomp/config/posix/bar.c	(revision 210462)
+@@ -42,6 +42,7 @@ gomp_barrier_init (gomp_barrier_t *bar,
+   bar->total = count;
+   bar->arrived = 0;
+   bar->generation = 0;
++  bar->cancellable = false;
+ }
+ 
+ void
+@@ -72,7 +73,7 @@ gomp_barrier_wait_end (gomp_barrier_t *b
+ {
+   unsigned int n;
+ 
+-  if (state & 1)
++  if (state & BAR_WAS_LAST)
+     {
+       n = --bar->arrived;
+       if (n > 0)
+@@ -113,12 +114,14 @@ gomp_team_barrier_wait_end (gomp_barrier
+ {
+   unsigned int n;
+ 
+-  if (state & 1)
++  state &= ~BAR_CANCELLED;
++  if (state & BAR_WAS_LAST)
+     {
+       n = --bar->arrived;
+       struct gomp_thread *thr = gomp_thread ();
+       struct gomp_team *team = thr->ts.team;
+ 
++      team->work_share_cancelled = 0;
+       if (team->task_count)
+ 	{
+ 	  gomp_barrier_handle_tasks (state);
+@@ -128,7 +131,7 @@ gomp_team_barrier_wait_end (gomp_barrier
+ 	  return;
+ 	}
+ 
+-      bar->generation = state + 3;
++      bar->generation = state + BAR_INCR - BAR_WAS_LAST;
+       if (n > 0)
+ 	{
+ 	  do
+@@ -141,13 +144,18 @@ gomp_team_barrier_wait_end (gomp_barrier
+   else
+     {
+       gomp_mutex_unlock (&bar->mutex1);
++      int gen;
+       do
+ 	{
+ 	  gomp_sem_wait (&bar->sem1);
+-	  if (bar->generation & 1)
+-	    gomp_barrier_handle_tasks (state);
++	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
++	  if (gen & BAR_TASK_PENDING)
++	    {
++	      gomp_barrier_handle_tasks (state);
++	      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
++	    }
+ 	}
+-      while (bar->generation != state + 4);
++      while (gen != state + BAR_INCR);
+ 
+ #ifdef HAVE_SYNC_BUILTINS
+       n = __sync_add_and_fetch (&bar->arrived, -1);
+@@ -162,6 +170,81 @@ gomp_team_barrier_wait_end (gomp_barrier
+     }
+ }
+ 
++bool
++gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
++				   gomp_barrier_state_t state)
++{
++  unsigned int n;
++
++  if (state & BAR_WAS_LAST)
++    {
++      bar->cancellable = false;
++      n = --bar->arrived;
++      struct gomp_thread *thr = gomp_thread ();
++      struct gomp_team *team = thr->ts.team;
++
++      team->work_share_cancelled = 0;
++      if (team->task_count)
++	{
++	  gomp_barrier_handle_tasks (state);
++	  if (n > 0)
++	    gomp_sem_wait (&bar->sem2);
++	  gomp_mutex_unlock (&bar->mutex1);
++	  return false;
++	}
++
++      bar->generation = state + BAR_INCR - BAR_WAS_LAST;
++      if (n > 0)
++	{
++	  do
++	    gomp_sem_post (&bar->sem1);
++	  while (--n != 0);
++	  gomp_sem_wait (&bar->sem2);
++	}
++      gomp_mutex_unlock (&bar->mutex1);
++    }
++  else
++    {
++      if (state & BAR_CANCELLED)
++	{
++	  gomp_mutex_unlock (&bar->mutex1);
++	  return true;
++	}
++      bar->cancellable = true;
++      gomp_mutex_unlock (&bar->mutex1);
++      int gen;
++      do
++	{
++	  gomp_sem_wait (&bar->sem1);
++	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
++	  if (gen & BAR_CANCELLED)
++	    break;
++	  if (gen & BAR_TASK_PENDING)
++	    {
++	      gomp_barrier_handle_tasks (state);
++	      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
++	      if (gen & BAR_CANCELLED)
++		break;
++	    }
++	}
++      while (gen != state + BAR_INCR);
++
++#ifdef HAVE_SYNC_BUILTINS
++      n = __sync_add_and_fetch (&bar->arrived, -1);
++#else
++      gomp_mutex_lock (&bar->mutex2);
++      n = --bar->arrived;
++      gomp_mutex_unlock (&bar->mutex2);
++#endif
++
++      if (n == 0)
++	gomp_sem_post (&bar->sem2);
++      if (gen & BAR_CANCELLED)
++	return true;
++    }
++  return false;
++}
++
+ void
+ gomp_team_barrier_wait (gomp_barrier_t *barrier)
+ {
+@@ -176,3 +259,40 @@ gomp_team_barrier_wake (gomp_barrier_t *
+   while (count-- > 0)
+     gomp_sem_post (&bar->sem1);
+ }
++
++bool
++gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
++{
++  gomp_barrier_state_t state = gomp_barrier_wait_cancel_start (bar);
++  return gomp_team_barrier_wait_cancel_end (bar, state);
++}
++
++void
++gomp_team_barrier_cancel (struct gomp_team *team)
++{
++  if (team->barrier.generation & BAR_CANCELLED)
++    return;
++  gomp_mutex_lock (&team->barrier.mutex1);
++  gomp_mutex_lock (&team->task_lock);
++  if (team->barrier.generation & BAR_CANCELLED)
++    {
++      gomp_mutex_unlock (&team->task_lock);
++      gomp_mutex_unlock (&team->barrier.mutex1);
++      return;
++    }
++  team->barrier.generation |= BAR_CANCELLED;
++  gomp_mutex_unlock (&team->task_lock);
++  if (team->barrier.cancellable)
++    {
++      int n = team->barrier.arrived;
++      if (n > 0)
++	{
++	  do
++	    gomp_sem_post (&team->barrier.sem1);
++	  while (--n != 0);
++	  gomp_sem_wait (&team->barrier.sem2);
++	}
++      team->barrier.cancellable = false;
++    }
++  gomp_mutex_unlock (&team->barrier.mutex1);
++}
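+
+(Editorial sketch, not part of the patch: gomp_team_barrier_cancel above uses a
+double-checked pattern — test BAR_CANCELLED cheaply, take the barrier and task
+locks, re-check, and only then publish the flag and wake any cancellable
+waiters.  The following standalone toy, using plain pthreads and invented
+toy_* names rather than libgomp's types, shows just that shape; the real code
+additionally posts sem1 for every arrived waiter.)
+
+#include <pthread.h>
+#include <stdio.h>
+
+#define TOY_CANCELLED 4
+
+struct toy_barrier
+{
+  pthread_mutex_t lock;
+  unsigned int generation;
+};
+
+static void
+toy_cancel (struct toy_barrier *bar)
+{
+  if (bar->generation & TOY_CANCELLED)	/* fast path: already cancelled */
+    return;
+  pthread_mutex_lock (&bar->lock);
+  if (!(bar->generation & TOY_CANCELLED))
+    bar->generation |= TOY_CANCELLED;	/* publish exactly once, under the lock */
+  pthread_mutex_unlock (&bar->lock);
+}
+
+int
+main (void)
+{
+  struct toy_barrier bar = { PTHREAD_MUTEX_INITIALIZER, 0 };
+  toy_cancel (&bar);
+  toy_cancel (&bar);			/* second call is a no-op */
+  printf ("generation = %#x\n", bar.generation);
+  return 0;
+}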
+--- libgomp/config/posix/affinity.c	(revision 210461)
++++ libgomp/config/posix/affinity.c	(revision 210462)
+@@ -32,7 +32,84 @@ gomp_init_affinity (void)
+ }
+ 
+ void
+-gomp_init_thread_affinity (pthread_attr_t *attr)
++gomp_init_thread_affinity (pthread_attr_t *attr, unsigned int place)
+ {
+   (void) attr;
++  (void) place;
++}
++
++void **
++gomp_affinity_alloc (unsigned long count, bool quiet)
++{
++  (void) count;
++  if (!quiet)
++    gomp_error ("Affinity not supported on this configuration");
++  return NULL;
++}
++
++void
++gomp_affinity_init_place (void *p)
++{
++  (void) p;
++}
++
++bool
++gomp_affinity_add_cpus (void *p, unsigned long num,
++			unsigned long len, long stride, bool quiet)
++{
++  (void) p;
++  (void) num;
++  (void) len;
++  (void) stride;
++  (void) quiet;
++  return false;
++}
++
++bool
++gomp_affinity_remove_cpu (void *p, unsigned long num)
++{
++  (void) p;
++  (void) num;
++  return false;
++}
++
++bool
++gomp_affinity_copy_place (void *p, void *q, long stride)
++{
++  (void) p;
++  (void) q;
++  (void) stride;
++  return false;
++}
++
++bool
++gomp_affinity_same_place (void *p, void *q)
++{
++  (void) p;
++  (void) q;
++  return false;
++}
++
++bool
++gomp_affinity_finalize_place_list (bool quiet)
++{
++  (void) quiet;
++  return false;
++}
++
++bool
++gomp_affinity_init_level (int level, unsigned long count, bool quiet)
++{
++  (void) level;
++  (void) count;
++  (void) quiet;
++  if (!quiet)
++    gomp_error ("Affinity not supported on this configuration");
++  return NULL;
++}
++
++void
++gomp_affinity_print_place (void *p)
++{
++  (void) p;
+ }
+--- libgomp/config/posix/bar.h	(revision 210461)
++++ libgomp/config/posix/bar.h	(revision 210462)
+@@ -43,9 +43,20 @@ typedef struct
+   unsigned total;
+   unsigned arrived;
+   unsigned generation;
++  bool cancellable;
+ } gomp_barrier_t;
++
+ typedef unsigned int gomp_barrier_state_t;
+ 
++/* The generation field contains a counter in the high bits, with a few
++   low bits dedicated to flags.  Note that TASK_PENDING and WAS_LAST can
++   share space because WAS_LAST is never stored back to generation.  */
++#define BAR_TASK_PENDING	1
++#define BAR_WAS_LAST		1
++#define BAR_WAITING_FOR_TASK	2
++#define BAR_CANCELLED		4
++#define BAR_INCR		8
++
+ extern void gomp_barrier_init (gomp_barrier_t *, unsigned);
+ extern void gomp_barrier_reinit (gomp_barrier_t *, unsigned);
+ extern void gomp_barrier_destroy (gomp_barrier_t *);
+@@ -55,22 +66,47 @@ extern void gomp_barrier_wait_end (gomp_
+ extern void gomp_team_barrier_wait (gomp_barrier_t *);
+ extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
+ 					gomp_barrier_state_t);
++extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
++extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
++					       gomp_barrier_state_t);
+ extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
++struct gomp_team;
++extern void gomp_team_barrier_cancel (struct gomp_team *);
+ 
+ static inline gomp_barrier_state_t
+ gomp_barrier_wait_start (gomp_barrier_t *bar)
+ {
+   unsigned int ret;
+   gomp_mutex_lock (&bar->mutex1);
+-  ret = bar->generation & ~3;
+-  ret += ++bar->arrived == bar->total;
++  ret = bar->generation & (-BAR_INCR | BAR_CANCELLED);
++  if (++bar->arrived == bar->total)
++    ret |= BAR_WAS_LAST;
++  return ret;
++}
++
++static inline gomp_barrier_state_t
++gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
++{
++  unsigned int ret;
++  gomp_mutex_lock (&bar->mutex1);
++  ret = bar->generation & (-BAR_INCR | BAR_CANCELLED);
++  if (ret & BAR_CANCELLED)
++    return ret;
++  if (++bar->arrived == bar->total)
++    ret |= BAR_WAS_LAST;
+   return ret;
+ }
+ 
++static inline void
++gomp_team_barrier_wait_final (gomp_barrier_t *bar)
++{
++  gomp_team_barrier_wait (bar);
++}
++
+ static inline bool
+ gomp_barrier_last_thread (gomp_barrier_state_t state)
+ {
+-  return state & 1;
++  return state & BAR_WAS_LAST;
+ }
+ 
+ static inline void
+@@ -85,31 +121,37 @@ gomp_barrier_wait_last (gomp_barrier_t *
+ static inline void
+ gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
+ {
+-  bar->generation |= 1;
++  bar->generation |= BAR_TASK_PENDING;
+ }
+ 
+ static inline void
+ gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
+ {
+-  bar->generation &= ~1;
++  bar->generation &= ~BAR_TASK_PENDING;
+ }
+ 
+ static inline void
+ gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
+ {
+-  bar->generation |= 2;
++  bar->generation |= BAR_WAITING_FOR_TASK;
+ }
+ 
+ static inline bool
+ gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
+ {
+-  return (bar->generation & 2) != 0;
++  return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
++}
++
++static inline bool
++gomp_team_barrier_cancelled (gomp_barrier_t *bar)
++{
++  return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
+ }
+ 
+ static inline void
+ gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
+ {
+-  bar->generation = (state & ~3) + 4;
++  bar->generation = (state & -BAR_INCR) + BAR_INCR;
+ }
+ 
+ #endif /* GOMP_BARRIER_H */
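+
+(Editorial sketch, not part of the patch: the BAR_* constants above pack a
+phase counter, stepped by BAR_INCR, and three flag bits into the single
+generation word.  This small self-contained program, independent of libgomp,
+shows the masking that gomp_team_barrier_done performs — -BAR_INCR equals
+~(BAR_INCR - 1) for the power-of-two BAR_INCR used here, so the expression
+clears the flag bits while advancing the counter.)
+
+#include <assert.h>
+#include <stdio.h>
+
+#define BAR_TASK_PENDING	1
+#define BAR_WAS_LAST		1
+#define BAR_WAITING_FOR_TASK	2
+#define BAR_CANCELLED		4
+#define BAR_INCR		8
+
+int
+main (void)
+{
+  unsigned int gen = 0;
+
+  gen |= BAR_TASK_PENDING | BAR_CANCELLED;	/* flags set during one phase */
+  unsigned int next = (gen & -BAR_INCR) + BAR_INCR;	/* gomp_team_barrier_done */
+
+  assert (next == BAR_INCR);			/* counter bumped ...  */
+  assert ((next & (BAR_INCR - 1)) == 0);	/* ... and no flag bits survive */
+  printf ("gen %#x -> %#x\n", gen, next);
+  return 0;
+}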
+--- libgomp/barrier.c	(revision 210461)
++++ libgomp/barrier.c	(revision 210462)
+@@ -39,3 +39,15 @@ GOMP_barrier (void)
+ 
+   gomp_team_barrier_wait (&team->barrier);
+ }
++
++bool
++GOMP_barrier_cancel (void)
++{
++  struct gomp_thread *thr = gomp_thread ();
++  struct gomp_team *team = thr->ts.team;
++
++  /* The compiler transforms to barrier_cancel when it sees that the
++     barrier is within a construct that can cancel.  Thus we should
++     never have an orphaned cancellable barrier.  */
++  return gomp_team_barrier_wait_cancel (&team->barrier);
++}
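+
+(Editorial illustration, not part of the patch: GOMP_barrier_cancel is the
+entry point a cancellation-aware compiler emits for an explicit barrier inside
+a cancellable region.  Assuming a compiler with OpenMP 4.0 cancellation
+support and OMP_CANCELLATION=true at run time, the user-level construct it
+serves looks like the following; when the cancel takes effect, the waiting
+threads leave the barrier early and the printf is skipped.)
+
+#include <omp.h>
+#include <stdio.h>
+
+int
+main (void)
+{
+  #pragma omp parallel
+  {
+    if (omp_get_thread_num () == 0)
+      {
+	#pragma omp cancel parallel
+      }
+    /* Cancellable barrier: rather than waiting forever for the cancelling
+       thread, the remaining threads observe the cancellation here.  */
+    #pragma omp barrier
+    printf ("thread %d finished normally\n", omp_get_thread_num ());
+  }
+  return 0;
+}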
+--- libgomp/target.c	(revision 0)
++++ libgomp/target.c	(revision 210462)
+@@ -0,0 +1,96 @@
++/* Copyright (C) 2013 Free Software Foundation, Inc.
++   Contributed by Jakub Jelinek <jakub@redhat.com>.
++
++   This file is part of the GNU OpenMP Library (libgomp).
++
++   Libgomp is free software; you can redistribute it and/or modify it
++   under the terms of the GNU General Public License as published by
++   the Free Software Foundation; either version 3, or (at your option)
++   any later version.
++
++   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
++   more details.
++
++   Under Section 7 of GPL version 3, you are granted additional
++   permissions described in the GCC Runtime Library Exception, version
++   3.1, as published by the Free Software Foundation.
++
++   You should have received a copy of the GNU General Public License and
++   a copy of the GCC Runtime Library Exception along with this program;
++   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
++   <http://www.gnu.org/licenses/>.  */
++
++/* This file handles the maintenance of threads in response to team
++   creation and termination.  */
++
++#include "libgomp.h"
++#include <limits.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <string.h>
++
++attribute_hidden int
++gomp_get_num_devices (void)
++{
++  return 0;
++}
++
++/* Called when encountering a target directive.  If DEVICE
++   is -1, it means use device-var ICV.  If it is -2 (or any other value
++   larger than last available hw device, use host fallback.
++   FN is address of host code, OPENMP_TARGET contains value of the
++   __OPENMP_TARGET__ symbol in the shared library or binary that invokes
++   GOMP_target.  HOSTADDRS, SIZES and KINDS are arrays
++   with MAPNUM entries, with addresses of the host objects,
++   sizes of the host objects (resp. for pointer kind pointer bias
++   and assumed sizeof (void *) size) and kinds.  */
++
++void
++GOMP_target (int device, void (*fn) (void *), const void *openmp_target,
++	     size_t mapnum, void **hostaddrs, size_t *sizes,
++	     unsigned char *kinds)
++{
++  /* Host fallback.  */
++  struct gomp_thread old_thr, *thr = gomp_thread ();
++  old_thr = *thr;
++  memset (thr, '\0', sizeof (*thr));
++  if (gomp_places_list)
++    {
++      thr->place = old_thr.place;
++      thr->ts.place_partition_len = gomp_places_list_len;
++    }
++  fn (hostaddrs);
++  gomp_free_thread (thr);
++  *thr = old_thr;
++}
++
++void
++GOMP_target_data (int device, const void *openmp_target, size_t mapnum,
++		  void **hostaddrs, size_t *sizes, unsigned char *kinds)
++{
++}
++
++void
++GOMP_target_end_data (void)
++{
++}
++
++void
++GOMP_target_update (int device, const void *openmp_target, size_t mapnum,
++		    void **hostaddrs, size_t *sizes, unsigned char *kinds)
++{
++}
++
++void
++GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
++{
++  if (thread_limit)
++    {
++      struct gomp_task_icv *icv = gomp_icv (true);
++      icv->thread_limit_var
++	= thread_limit > INT_MAX ? UINT_MAX : thread_limit;
++    }
++  (void) num_teams;
++}
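+
+(Editorial illustration, not part of the patch: GOMP_target here implements
+only the host fallback, since gomp_get_num_devices returns 0.  Assuming a
+compiler that actually lowers the OpenMP 4.0 target construct — this 4.8
+backport is library-only — a region such as the one below would simply run on
+the host through that entry point.)
+
+#include <stdio.h>
+
+int
+main (void)
+{
+  int sum = 0;
+  #pragma omp target map(tofrom: sum)
+  {
+    int i;
+    for (i = 0; i < 10; i++)
+      sum += i;
+  }
+  printf ("sum = %d\n", sum);	/* 45, computed by the host fallback */
+  return 0;
+}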
+--- libgomp/parallel.c	(revision 210461)
++++ libgomp/parallel.c	(revision 210462)
+@@ -37,18 +37,19 @@
+ unsigned
+ gomp_resolve_num_threads (unsigned specified, unsigned count)
+ {
+-  struct gomp_thread *thread = gomp_thread();
++  struct gomp_thread *thr = gomp_thread ();
+   struct gomp_task_icv *icv;
+   unsigned threads_requested, max_num_threads, num_threads;
+-  unsigned long remaining;
++  unsigned long busy;
++  struct gomp_thread_pool *pool;
+ 
+   icv = gomp_icv (false);
+ 
+   if (specified == 1)
+     return 1;
+-  else if (thread->ts.active_level >= 1 && !icv->nest_var)
++  else if (thr->ts.active_level >= 1 && !icv->nest_var)
+     return 1;
+-  else if (thread->ts.active_level >= gomp_max_active_levels_var)
++  else if (thr->ts.active_level >= gomp_max_active_levels_var)
+     return 1;
+ 
+   /* If NUM_THREADS not specified, use nthreads_var.  */
+@@ -72,30 +73,46 @@ gomp_resolve_num_threads (unsigned speci
+ 	max_num_threads = count;
+     }
+ 
+-  /* ULONG_MAX stands for infinity.  */
+-  if (__builtin_expect (gomp_thread_limit_var == ULONG_MAX, 1)
++  /* UINT_MAX stands for infinity.  */
++  if (__builtin_expect (icv->thread_limit_var == UINT_MAX, 1)
+       || max_num_threads == 1)
+     return max_num_threads;
+ 
++  /* The threads_busy counter lives in thread_pool, if there
++     isn't a thread_pool yet, there must be just one thread
++     in the contention group.  If thr->team is NULL, this isn't
++     nested parallel, so there is just one thread in the
++     contention group as well, no need to handle it atomically.  */
++  pool = thr->thread_pool;
++  if (thr->ts.team == NULL)
++    {
++      num_threads = max_num_threads;
++      if (num_threads > icv->thread_limit_var)
++	num_threads = icv->thread_limit_var;
++      if (pool)
++	pool->threads_busy = num_threads;
++      return num_threads;
++    }
++
+ #ifdef HAVE_SYNC_BUILTINS
+   do
+     {
+-      remaining = gomp_remaining_threads_count;
++      busy = pool->threads_busy;
+       num_threads = max_num_threads;
+-      if (num_threads > remaining)
+-	num_threads = remaining + 1;
++      if (icv->thread_limit_var - busy + 1 < num_threads)
++	num_threads = icv->thread_limit_var - busy + 1;
+     }
+-  while (__sync_val_compare_and_swap (&gomp_remaining_threads_count,
+-				      remaining, remaining - num_threads + 1)
+-	 != remaining);
++  while (__sync_val_compare_and_swap (&pool->threads_busy,
++				      busy, busy + num_threads - 1)
++	 != busy);
+ #else
+-  gomp_mutex_lock (&gomp_remaining_threads_lock);
++  gomp_mutex_lock (&gomp_managed_threads_lock);
+   num_threads = max_num_threads;
+-  remaining = gomp_remaining_threads_count;
+-  if (num_threads > remaining)
+-    num_threads = remaining + 1;
+-  gomp_remaining_threads_count -= num_threads - 1;
+-  gomp_mutex_unlock (&gomp_remaining_threads_lock);
++  busy = pool->threads_busy;
++  if (icv->thread_limit_var - busy + 1 < num_threads)
++    num_threads = icv->thread_limit_var - busy + 1;
++  pool->threads_busy += num_threads - 1;
++  gomp_mutex_unlock (&gomp_managed_threads_lock);
+ #endif
+ 
+   return num_threads;
+@@ -105,13 +122,14 @@ void
+ GOMP_parallel_start (void (*fn) (void *), void *data, unsigned num_threads)
+ {
+   num_threads = gomp_resolve_num_threads (num_threads, 0);
+-  gomp_team_start (fn, data, num_threads, gomp_new_team (num_threads));
++  gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads));
+ }
+ 
+ void
+ GOMP_parallel_end (void)
+ {
+-  if (__builtin_expect (gomp_thread_limit_var != ULONG_MAX, 0))
++  struct gomp_task_icv *icv = gomp_icv (false);
++  if (__builtin_expect (icv->thread_limit_var != UINT_MAX, 0))
+     {
+       struct gomp_thread *thr = gomp_thread ();
+       struct gomp_team *team = thr->ts.team;
+@@ -119,20 +137,98 @@ GOMP_parallel_end (void)
+       gomp_team_end ();
+       if (nthreads > 1)
+ 	{
++	  /* If not nested, there is just one thread in the
++	     contention group left, no need for atomicity.  */
++	  if (thr->ts.team == NULL)
++	    thr->thread_pool->threads_busy = 1;
++	  else
++	    {
+ #ifdef HAVE_SYNC_BUILTINS
+-	  __sync_fetch_and_add (&gomp_remaining_threads_count,
+-				nthreads - 1);
++	      __sync_fetch_and_add (&thr->thread_pool->threads_busy,
++				    1UL - nthreads);
+ #else
+-	  gomp_mutex_lock (&gomp_remaining_threads_lock);
+-	  gomp_remaining_threads_count += nthreads - 1;
+-	  gomp_mutex_unlock (&gomp_remaining_threads_lock);
++	      gomp_mutex_lock (&gomp_managed_threads_lock);
++	      thr->thread_pool->threads_busy -= nthreads - 1;
++	      gomp_mutex_unlock (&gomp_managed_threads_lock);
+ #endif
++	    }
+ 	}
+     }
+   else
+     gomp_team_end ();
+ }
++ialias (GOMP_parallel_end)
++
++void
++GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, unsigned int flags)
++{
++  num_threads = gomp_resolve_num_threads (num_threads, 0);
++  gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads));
++  fn (data);
++  ialias_call (GOMP_parallel_end) ();
++}
++
++bool
++GOMP_cancellation_point (int which)
++{
++  if (!gomp_cancel_var)
++    return false;
+ 
++  struct gomp_thread *thr = gomp_thread ();
++  struct gomp_team *team = thr->ts.team;
++  if (which & (GOMP_CANCEL_LOOP | GOMP_CANCEL_SECTIONS))
++    {
++      if (team == NULL)
++	return false;
++      return team->work_share_cancelled != 0;
++    }
++  else if (which & GOMP_CANCEL_TASKGROUP)
++    {
++      if (thr->task->taskgroup && thr->task->taskgroup->cancelled)
++	return true;
++      /* FALLTHRU into the GOMP_CANCEL_PARALLEL case,
++	 as #pragma omp cancel parallel also cancels all explicit
++	 tasks.  */
++    }
++  if (team)
++    return gomp_team_barrier_cancelled (&team->barrier);
++  return false;
++}
++ialias (GOMP_cancellation_point)
++
++bool
++GOMP_cancel (int which, bool do_cancel)
++{
++  if (!gomp_cancel_var)
++    return false;
++
++  if (!do_cancel)
++    return ialias_call (GOMP_cancellation_point) (which);
++
++  struct gomp_thread *thr = gomp_thread ();
++  struct gomp_team *team = thr->ts.team;
++  if (which & (GOMP_CANCEL_LOOP | GOMP_CANCEL_SECTIONS))
++    {
++      /* In orphaned worksharing region, all we want to cancel
++	 is current thread.  */
++      if (team != NULL)
++	team->work_share_cancelled = 1;
++      return true;
++    }
++  else if (which & GOMP_CANCEL_TASKGROUP)
++    {
++      if (thr->task->taskgroup && !thr->task->taskgroup->cancelled)
++	{
++	  gomp_mutex_lock (&team->task_lock);
++	  thr->task->taskgroup->cancelled = true;
++	  gomp_mutex_unlock (&team->task_lock);
++	}
++      return true;
++    }
++  team->team_cancelled = 1;
++  gomp_team_barrier_cancel (team);
++  return true;
++}
+ 
+ /* The public OpenMP API for thread and team related inquiries.  */
+ 
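+
+(Editorial sketch, not part of the patch: the reworked
+gomp_resolve_num_threads above charges team members against the per-pool
+threads_busy counter and clamps a new team to thread_limit - busy + 1, the +1
+being the encountering thread, which is already counted in busy.  A
+standalone arithmetic sketch of just that clamp, with hypothetical names:)
+
+#include <assert.h>
+
+static unsigned
+clamp_team_size (unsigned requested, unsigned thread_limit, unsigned busy)
+{
+  /* Mirrors: if (icv->thread_limit_var - busy + 1 < num_threads) ...  */
+  if (thread_limit - busy + 1 < requested)
+    return thread_limit - busy + 1;
+  return requested;
+}
+
+int
+main (void)
+{
+  assert (clamp_team_size (8, 16, 1) == 8);	/* plenty of headroom */
+  assert (clamp_team_size (8, 8, 4) == 5);	/* only 4 more threads fit */
+  assert (clamp_team_size (8, 8, 8) == 1);	/* limit reached: team of 1 */
+  return 0;
+}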
diff --git a/SPECS/gcc.spec b/SPECS/gcc.spec
index 7eb2349..2c3f959 100644
--- a/SPECS/gcc.spec
+++ b/SPECS/gcc.spec
@@ -75,7 +75,7 @@
 Summary: Various compilers (C, C++, Objective-C, Java, ...)
 Name: gcc
 Version: %{gcc_version}
-Release: %{gcc_release}%{?dist}
+Release: %{gcc_release}.2%{?dist}
 # libgcc, libgfortran, libmudflap, libgomp, libstdc++ and crtstuff have
 # GCC Runtime Exception.
 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
@@ -211,6 +211,8 @@ Patch21: gcc48-pr57896.patch
 Patch22: gcc48-pr60272.patch
 Patch23: gcc48-pr60233.patch
 Patch24: gcc48-pr60274.patch
+Patch25: gcc48-rh1121077.patch
+Patch26: gcc48-pr61801.patch
 
 Patch1000: fastjar-0.97-segfault.patch
 Patch1001: fastjar-0.97-len1.patch
@@ -780,6 +782,8 @@ rm -f libgo/go/crypto/elliptic/p224{,_test}.go
 %patch22 -p0 -b .pr60272~
 %patch23 -p0 -b .pr60233~
 %patch24 -p0 -b .pr60274~
+%patch25 -p0 -b .rh1121077~
+%patch26 -p0 -b .pr61801~
 
 %if 0%{?_enable_debug_packages}
 cat > split-debuginfo.sh <<\EOF
@@ -3052,6 +3056,15 @@ fi
 %{_prefix}/libexec/gcc/%{gcc_target_platform}/%{gcc_version}/plugin
 
 %changelog
+* Wed Aug  6 2014 Jakub Jelinek <jakub@redhat.com> 4.8.2-16.2
+- backport two further OpenMP 4.0 libgomp tasking fixes (#1121077)
+- fix scheduler wrong-code with DEBUG_INSNs containing volatile ASM_OPERANDS
+  (#1127120, PR rtl-optimization/61801)
+
+* Fri Jul 18 2014 Jakub Jelinek <jakub@redhat.com> 4.8.2-16.1
+- backport OpenMP 4.0 support to libgomp (library only; #1121077,
+  PR libgomp/58691)
+
 * Mon Mar  3 2014 Jakub Jelinek <jakub@redhat.com> 4.8.2-16
 - fix up compare_exchange_* in libatomic too (PR c++/60272)