2014-08-04  Jakub Jelinek  <jakub@redhat.com>

	* task.c (GOMP_taskgroup_end): If taskgroup->num_children
	is not zero, but taskgroup->children is NULL and there are
	any task->children, schedule those instead of waiting.
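
For context, a minimal OpenMP C sketch (hypothetical user code, not part of
this patch): GOMP_taskgroup_end implements the implicit wait at the closing
brace of a #pragma omp taskgroup region, and with the change above it can
schedule other queued child tasks there instead of sleeping.

	#include <omp.h>

	void
	work (void)
	{
	  #pragma omp taskgroup
	  {
	    #pragma omp task
	    {
	      #pragma omp task	/* descendant task, also awaited by the taskgroup */
	      {
		/* ... */
	      }
	    }
	  }	/* implicit wait: GOMP_taskgroup_end */
	}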

2014-08-01  Jakub Jelinek  <jakub@redhat.com>

	* libgomp.h (struct gomp_task_depend_entry): Add redundant_out field.
	(struct gomp_taskwait): New type.
	(struct gomp_task): Add taskwait and parent_depends_on, remove
	in_taskwait and taskwait_sem fields.
	(gomp_finish_task): Don't destroy taskwait_sem.
	* task.c (gomp_init_task): Don't init in_taskwait, instead init
	taskwait and parent_depends_on.
	(GOMP_task): For if (0) tasks with depend clause that depend on
	earlier tasks, don't defer them; instead call
	gomp_task_maybe_wait_for_dependencies to wait for the dependencies.
	Initialize redundant_out field, for redundant out entries just
	move them to the end of the linked list instead of removing them
	completely, and set redundant_out flag instead of redundant.
	(gomp_task_run_pre): Update last_parent_depends_on if scheduling
	that task.
	(gomp_task_run_post_handle_dependers): If parent is in
	gomp_task_maybe_wait_for_dependencies and newly runnable task
	is not parent_depends_on, queue it in parent->children linked
	list after all runnable tasks with parent_depends_on set.
	Adjust for addition of taskwait indirection.
	(gomp_task_run_post_remove_parent): If parent is in
	gomp_task_maybe_wait_for_dependencies and task to be removed
	is parent_depends_on, decrement n_depend and if needed awake
	parent.  Adjust for addition of taskwait indirection.
	(GOMP_taskwait): Adjust for addition of taskwait indirection.
	(gomp_task_maybe_wait_for_dependencies): New function.
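
A minimal sketch (hypothetical user code, not part of this patch) of the
case described above: an undeferred if (0) task whose depend clause refers
to an earlier deferred task.  With this change the runtime waits for just
that dependence via gomp_task_maybe_wait_for_dependencies instead of
deferring the second task.

	#include <omp.h>

	extern void consume (int);
	int x;

	void
	produce_consume (void)
	{
	  #pragma omp parallel
	  #pragma omp single
	  {
	    #pragma omp task depend(out: x)
	    x = 42;

	    /* Undeferred task: with the change above this waits for the
	       out dependence on x instead of being deferred.  */
	    #pragma omp task if (0) depend(in: x)
	    consume (x);
	  }
	}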

2013-10-14  Jakub Jelinek  <jakub@redhat.com>

	* env.c (parse_bind_var): Initialize value to avoid
	(false positive) warning.

2013-10-12  Jakub Jelinek  <jakub@redhat.com>

	PR libgomp/58691
	* config/linux/proc.c (gomp_cpuset_popcount): Add unused attribute
	to check variable.
	(gomp_init_num_threads): Move i variable declaration into
	#ifdef CPU_ALLOC_SIZE block.
	* config/linux/affinity.c (gomp_affinity_init_level): Test
	gomp_places_list_len == 0 rather than gomp_places_list == 0
	when checking for topology reading error.
	* team.c (gomp_team_start): Don't handle bind == omp_proc_bind_false.
	* env.c (parse_affinity): Add ignore argument; if true, don't populate
	gomp_places_list, only parse the env var and always return false.
	(parse_places_var): Likewise.  Don't check gomp_global_icv.bind_var.
	(initialize_env): Always parse OMP_PLACES and GOMP_CPU_AFFINITY env
	vars, default to OMP_PROC_BIND=true if OMP_PROC_BIND wasn't specified
	and either of these variables was parsed correctly into a places
	list.
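
As an illustration (hypothetical usage, not part of this patch): with the
change above, running a program with e.g. OMP_PLACES="{0},{1},{2},{3}" set
and OMP_PROC_BIND left unset behaves as if OMP_PROC_BIND=true had also been
given, which can be observed through the OpenMP 4.0 API:

	#include <stdio.h>
	#include <omp.h>

	int
	main (void)
	{
	  /* Expected to report a value other than omp_proc_bind_false when a
	     valid places list was parsed from the environment.  */
	  printf ("omp_get_proc_bind () = %d\n", (int) omp_get_proc_bind ());
	  return 0;
	}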

2013-10-11  Thomas Schwinge  <thomas@codesourcery.com>

	* testsuite/libgomp.c/lib-1.c (main): Add missing error check.
	* testsuite/libgomp.fortran/lib1.f90: Likewise.
	* testsuite/libgomp.fortran/lib2.f: Likewise.
	* testsuite/libgomp.fortran/lib3.f: Likewise.

2013-10-11  Jakub Jelinek  <jakub@redhat.com>
	    Tobias Burnus  <burnus@net-b.de>
	    Richard Henderson  <rth@redhat.com>

	* target.c: New file.
	* Makefile.am (libgomp_la_SOURCES): Add target.c.
	* Makefile.in: Regenerated.
	* libgomp_g.h (GOMP_task): Add depend argument.
	(GOMP_barrier_cancel, GOMP_loop_end_cancel,
	GOMP_sections_end_cancel, GOMP_target, GOMP_target_data,
	GOMP_target_end_data, GOMP_target_update, GOMP_teams,
	GOMP_parallel_loop_static, GOMP_parallel_loop_dynamic,
	GOMP_parallel_loop_guided, GOMP_parallel_loop_runtime,
	GOMP_parallel, GOMP_cancel, GOMP_cancellation_point,
	GOMP_taskgroup_start, GOMP_taskgroup_end,
	GOMP_parallel_sections): New prototypes.
	* fortran.c (omp_is_initial_device): Add ialias_redirect.
	(omp_is_initial_device_): New function.
	(ULP, STR1, STR2, ialias_redirect): Removed.
	(omp_get_cancellation_, omp_get_proc_bind_, omp_set_default_device_,
	omp_set_default_device_8_, omp_get_default_device_,
	omp_get_num_devices_, omp_get_num_teams_, omp_get_team_num_): New
	functions.
	* libgomp.map (GOMP_barrier_cancel, GOMP_loop_end_cancel,
	GOMP_sections_end_cancel, GOMP_target, GOMP_target_data,
	GOMP_target_end_data, GOMP_target_update, GOMP_teams): Export
	@@GOMP_4.0.
	(omp_is_initial_device, omp_is_initial_device_, omp_get_cancellation,
	omp_get_cancellation_, omp_get_proc_bind, omp_get_proc_bind_,
	omp_set_default_device, omp_set_default_device_,
	omp_set_default_device_8_, omp_get_default_device,
	omp_get_default_device_, omp_get_num_devices, omp_get_num_devices_,
	omp_get_num_teams, omp_get_num_teams_, omp_get_team_num,
	omp_get_team_num_): Export @@OMP_4.0.
	* team.c (struct gomp_thread_start_data): Add place field.
	(gomp_thread_start): Clear thr->thread_pool and
	thr->task before returning.  Use gomp_team_barrier_wait_final
	instead of gomp_team_barrier_wait.  Initialize thr->place.
	(gomp_new_team): Initialize work_shares_to_free, work_share_cancelled,
	team_cancelled and task_queued_count fields.
	(gomp_free_pool_helper): Clear thr->thread_pool and thr->task
	before calling pthread_exit.
	(gomp_free_thread): No longer static.  Use
	gomp_managed_threads_lock instead of gomp_remaining_threads_lock.
	(gomp_team_start): Add flags argument.  Set
	thr->thread_pool->threads_busy to nthreads immediately after creating
	new pool.  Use gomp_managed_threads_lock instead of
	gomp_remaining_threads_lock.  Handle OpenMP 4.0 affinity.
	(gomp_team_end): Use gomp_managed_threads_lock instead of
	gomp_remaining_threads_lock.  Use gomp_team_barrier_wait_final instead
	of gomp_team_barrier_wait.  If team->team_cancelled, call
	gomp_fini_work_share on ws chain starting at team->work_shares_to_free
	rather than thr->ts.work_share.
	(initialize_team): Don't call gomp_sem_init here.
	* sections.c (GOMP_parallel_sections_start): Adjust gomp_team_start
	caller.
	(GOMP_parallel_sections, GOMP_sections_end_cancel): New functions.
	* env.c (gomp_global_icv): Add default_device_var, target_data and
	bind_var initializers.
	(gomp_cpu_affinity, gomp_cpu_affinity_len): Remove.
	(gomp_bind_var_list, gomp_bind_var_list_len, gomp_places_list,
	gomp_places_list_len): New variables.
	(parse_bind_var, parse_one_place, parse_places_var): New functions.
	(parse_affinity): Rewritten to construct OMP_PLACES list with unit
	sized places.
	(gomp_cancel_var): New global variable.
	(parse_int): New function.
	(handle_omp_display_env): New function.
	(initialize_env): Use it.  Initialize default_device_var.
	Parse OMP_CANCELLATION env var.  Use parse_bind_var to parse
	OMP_PROC_BIND instead of parse_boolean.  Use parse_places_var for
	OMP_PLACES parsing.  Don't call parse_affinity if OMP_PLACES has
	been successfully parsed (and call gomp_init_affinity in that case).
	(omp_get_cancellation, omp_get_proc_bind, omp_set_default_device,
	omp_get_default_device, omp_get_num_devices, omp_get_num_teams,
	omp_get_team_num, omp_is_initial_device): New functions.
	* libgomp.h: Include stdlib.h.
	(ialias_ulp, ialias_str1, ialias_str2, ialias_redirect, ialias_call):
	Define.
	(struct target_mem_desc): Forward declare.
	(struct gomp_task_icv): Add default_device_var, target_data, bind_var
	and thread_limit_var fields.
	(gomp_get_num_devices): New prototype.
	(gomp_cancel_var): New extern decl.
	(struct gomp_team): Add work_shares_to_free, work_share_cancelled,
	team_cancelled and task_queued_count fields.  Add comments about
	task_{,queued_,running_}count.
	(gomp_cancel_kind): New enum.
	(gomp_work_share_end_cancel): New prototype.
	(struct gomp_task): Add next_taskgroup, prev_taskgroup, taskgroup,
	copy_ctors_done, dependers, depend_hash, depend_count, num_dependees
	and depend fields.
	(struct gomp_taskgroup): New type.
	(struct gomp_task_depend_entry,
	struct gomp_dependers_vec): New types.
	(gomp_finish_task): Free depend_hash if non-NULL.
	(struct gomp_team_state): Add place_partition_off
	and place_partition_len fields.
	(gomp_bind_var_list, gomp_bind_var_list_len, gomp_places_list,
	gomp_places_list_len): New extern decls.
	(struct gomp_thread): Add place field.
	(gomp_cpu_affinity, gomp_cpu_affinity_len): Remove.
	(gomp_init_thread_affinity): Add place argument.
	(gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus,
	gomp_affinity_remove_cpu, gomp_affinity_copy_place,
	gomp_affinity_same_place, gomp_affinity_finalize_place_list,
	gomp_affinity_init_level, gomp_affinity_print_place): New
	prototypes.
	(gomp_team_start): Add flags argument.
	(gomp_thread_limit_var, gomp_remaining_threads_count,
	gomp_remaining_threads_lock): Remove.
	(gomp_managed_threads_lock): New variable.
	(struct gomp_thread_pool): Add threads_busy field.
	(gomp_free_thread): New prototype.
	* task.c: Include hashtab.h.
	(hash_entry_type): New typedef.
	(htab_alloc, htab_free, htab_hash, htab_eq): New inlines.
	(gomp_init_task): Clear dependers, depend_hash, depend_count,
	copy_ctors_done and taskgroup fields.
	(GOMP_task): Add depend argument, handle depend clauses.  If
	gomp_team_barrier_cancelled or if its taskgroup has been
	cancelled, don't queue or start new tasks.  Set copy_ctors_done
	field if needed.  Initialize taskgroup field.  If copy_ctors_done
	and already cancelled, don't discard the task.  If taskgroup is
	non-NULL, enqueue the task into taskgroup queue.  Increment
	num_children field in taskgroup.  Increment task_queued_count.
	(gomp_task_run_pre, gomp_task_run_post_remove_parent,
	gomp_task_run_post_remove_taskgroup): New inline functions.
	(gomp_task_run_post_handle_depend_hash,
	gomp_task_run_post_handle_dependers,
	gomp_task_run_post_handle_depend): New functions.
	(GOMP_taskwait): Use them.  If more than one new task
	has been queued, wake other threads if needed.
	(gomp_barrier_handle_tasks): Likewise.  If
	gomp_team_barrier_cancelled, don't start any new tasks, just free
	all tasks.
	(GOMP_taskgroup_start, GOMP_taskgroup_end): New functions.
	* loop.c (gomp_parallel_loop_start): Add flags argument, pass it
	through to gomp_team_start.
	(GOMP_parallel_loop_static_start, GOMP_parallel_loop_dynamic_start,
	GOMP_parallel_loop_guided_start, GOMP_parallel_loop_runtime_start):
	Adjust gomp_parallel_loop_start callers.
	(GOMP_parallel_loop_static, GOMP_parallel_loop_dynamic,
	GOMP_parallel_loop_guided, GOMP_parallel_loop_runtime,
	GOMP_loop_end_cancel): New functions.
	(GOMP_parallel_end): Add ialias_redirect.
	* hashtab.h: New file.
	* work.c (gomp_work_share_end, gomp_work_share_end_nowait): Set
	team->work_shares_to_free to thr->ts.work_share before calling
	free_work_share.
	(gomp_work_share_end_cancel): New function.
	* config/linux/proc.c: Include errno.h.
	(gomp_get_cpuset_size, gomp_cpuset_size, gomp_cpusetp): New variables.
	(gomp_cpuset_popcount): Add cpusetsize argument, use it instead of
	sizeof (cpu_set_t) to determine number of iterations.  Fix up check
	extern decl.  Use CPU_COUNT_S if available, or CPU_COUNT if
	gomp_cpuset_size is sizeof (cpu_set_t).
	(gomp_init_num_threads): Initialize gomp_cpuset_size,
	gomp_get_cpuset_size and gomp_cpusetp here, use gomp_cpusetp instead
	of &cpuset and pass gomp_cpuset_size instead of sizeof (cpu_set_t)
	to pthread_getaffinity_np.  Free and clear gomp_cpusetp if it didn't
	contain any logical CPUs.
	(get_num_procs): Don't call pthread_getaffinity_np if gomp_cpusetp
	is NULL.  Use gomp_cpusetp instead of &cpuset and pass
	gomp_get_cpuset_size instead of sizeof (cpu_set_t) to
	pthread_getaffinity_np.  Check gomp_places_list instead of
	gomp_cpu_affinity.  Adjust gomp_cpuset_popcount caller.
	* config/linux/bar.c (gomp_barrier_wait_end,
	gomp_barrier_wait_last): Use BAR_* defines.
	(gomp_team_barrier_wait_end): Likewise.  Clear BAR_CANCELLED
	from state where needed.  Set work_share_cancelled to 0 on last
	thread.
	(gomp_team_barrier_wait_final, gomp_team_barrier_wait_cancel_end,
	gomp_team_barrier_wait_cancel, gomp_team_barrier_cancel): New
	functions.
	* config/linux/proc.h (gomp_cpuset_popcount): Add attribute_hidden.
	Add cpusetsize argument.
	(gomp_cpuset_size, gomp_cpusetp): Declare.
	* config/linux/affinity.c: Include errno.h, stdio.h and string.h.
	(affinity_counter): Remove.
	(CPU_ISSET_S, CPU_ZERO_S, CPU_SET_S, CPU_CLR_S): Define
	if CPU_ALLOC_SIZE isn't defined.
	(gomp_init_affinity): Rewritten, if gomp_places_list is NULL, try
	to silently create OMP_PLACES=threads, if it is non-NULL afterwards,
	bind current thread to the first place.
	(gomp_init_thread_affinity): Rewritten.  Add place argument, just
	call pthread_setaffinity_np with gomp_places_list[place].
	(gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus,
	gomp_affinity_remove_cpu, gomp_affinity_copy_place,
	gomp_affinity_same_place, gomp_affinity_finalize_place_list,
	gomp_affinity_init_level, gomp_affinity_print_place): New functions.
	* config/linux/bar.h (BAR_TASK_PENDING, BAR_WAS_LAST,
	BAR_WAITING_FOR_TASK, BAR_INCR, BAR_CANCELLED): Define.
	(gomp_barrier_t): Add awaited_final field.
	(gomp_barrier_init): Initialize awaited_final field.
	(gomp_team_barrier_wait_final, gomp_team_barrier_wait_cancel,
	gomp_team_barrier_wait_cancel_end, gomp_team_barrier_cancel): New
	prototypes.
	(gomp_barrier_wait_start): Preserve BAR_CANCELLED bit.  Use BAR_*
	defines.
	(gomp_barrier_wait_cancel_start, gomp_team_barrier_wait_final_start,
	gomp_team_barrier_cancelled): New inline functions.
	(gomp_barrier_last_thread,
	gomp_team_barrier_set_task_pending,
	gomp_team_barrier_clear_task_pending,
	gomp_team_barrier_set_waiting_for_tasks,
	gomp_team_barrier_waiting_for_tasks,
	gomp_team_barrier_done): Use BAR_* defines.
	* config/posix/bar.c (gomp_barrier_init): Clear cancellable field.
	(gomp_barrier_wait_end): Use BAR_* defines.
	(gomp_team_barrier_wait_end): Clear BAR_CANCELLED from state.
	Set work_share_cancelled to 0 on last thread, use __atomic_load_n.
	Use BAR_* defines.
	(gomp_team_barrier_wait_cancel_end, gomp_team_barrier_wait_cancel,
	gomp_team_barrier_cancel): New functions.
	* config/posix/affinity.c (gomp_init_thread_affinity): Add place
	argument.
	(gomp_affinity_alloc, gomp_affinity_init_place, gomp_affinity_add_cpus,
	gomp_affinity_remove_cpu, gomp_affinity_copy_place,
	gomp_affinity_same_place, gomp_affinity_finalize_place_list,
	gomp_affinity_init_level, gomp_affinity_print_place): New stubs.
	* config/posix/bar.h (BAR_TASK_PENDING, BAR_WAS_LAST,
	BAR_WAITING_FOR_TASK, BAR_INCR, BAR_CANCELLED): Define.
	(gomp_barrier_t): Add cancellable field.
	(gomp_team_barrier_wait_cancel, gomp_team_barrier_wait_cancel_end,
	gomp_team_barrier_cancel): New prototypes.
	(gomp_barrier_wait_start): Preserve BAR_CANCELLED bit.
	(gomp_barrier_wait_cancel_start, gomp_team_barrier_wait_final,
	gomp_team_barrier_cancelled): New inline functions.
	(gomp_barrier_wait_start, gomp_barrier_last_thread,
	gomp_team_barrier_set_task_pending,
	gomp_team_barrier_clear_task_pending,
	gomp_team_barrier_set_waiting_for_tasks,
	gomp_team_barrier_waiting_for_tasks,
	gomp_team_barrier_done): Use BAR_* defines.
	* barrier.c (GOMP_barrier_cancel): New function.
	* parallel.c (GOMP_parallel, GOMP_cancel, GOMP_cancellation_point):
	New functions.
	(gomp_resolve_num_threads): Adjust for thread_limit now being in
	icv->thread_limit_var.  Use UINT_MAX instead of ULONG_MAX as
	infinity.  If not nested, just return minimum of max_num_threads
	and icv->thread_limit_var and if thr->thread_pool, set threads_busy
	to the returned value.  Otherwise, don't atomically update
	gomp_remaining_threads_count, but instead update
	thr->thread_pool->threads_busy.
	(GOMP_parallel_end): Adjust for thread_limit now being in
	icv->thread_limit_var.  Use UINT_MAX instead of ULONG_MAX as
	infinity.  Adjust threads_busy in the pool rather than
	gomp_remaining_threads_count.  Remember team->nthreads and call
	gomp_team_end before adjusting threads_busy; if not nested
	afterwards, just set it to 1 non-atomically.  Add ialias.
	(GOMP_parallel_start): Adjust gomp_team_start caller.
	* testsuite/libgomp.c/atomic-14.c: Add parens to make it valid.
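
As a user-level illustration of part of the above (hypothetical example
code, not part of this patch), the new cancellation support: the cancel and
cancellation point directives are lowered to the new GOMP_cancel and
GOMP_cancellation_point entry points, and they only take effect when
OMP_CANCELLATION=true is set (see omp_get_cancellation and gomp_cancel_var
above).

	#include <stdio.h>
	#include <omp.h>

	int
	main (void)
	{
	  if (!omp_get_cancellation ())
	    fprintf (stderr, "run with OMP_CANCELLATION=true\n");

	  #pragma omp parallel
	  {
	    if (omp_get_thread_num () == 0)
	      {
		#pragma omp cancel parallel
	      }
	    #pragma omp cancellation point parallel
	    /* Threads that observe the cancellation skip the rest of the
	       region.  */
	    printf ("thread %d not cancelled\n", omp_get_thread_num ());
	  }
	  return 0;
	}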

--- libgomp/Makefile.am	(revision 210461)
+++ libgomp/Makefile.am	(revision 210462)
@@ -60,7 +60,7 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_L
 libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
 	iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
 	task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
-	time.c fortran.c affinity.c
+	time.c fortran.c affinity.c target.c
 
 nodist_noinst_HEADERS = libgomp_f.h
 nodist_libsubinclude_HEADERS = omp.h
--- libgomp/Makefile.in	(revision 210461)
+++ libgomp/Makefile.in	(revision 210462)
@@ -96,7 +96,7 @@ am_libgomp_la_OBJECTS = alloc.lo barrier
 	error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \
 	parallel.lo sections.lo single.lo task.lo team.lo work.lo \
 	lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \
-	fortran.lo affinity.lo
+	fortran.lo affinity.lo target.lo
 libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
 DEFAULT_INCLUDES = -I.@am__isrc@
 depcomp = $(SHELL) $(top_srcdir)/../depcomp
@@ -317,7 +317,7 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_L
 libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
 	iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
 	task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
-	time.c fortran.c affinity.c
+	time.c fortran.c affinity.c target.c
 
 nodist_noinst_HEADERS = libgomp_f.h
 nodist_libsubinclude_HEADERS = omp.h
@@ -474,6 +474,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/time.Plo@am__quote@
--- libgomp/libgomp_g.h	(revision 210461)
+++ libgomp/libgomp_g.h	(revision 210462)
@@ -33,6 +33,7 @@
 /* barrier.c */
 
 extern void GOMP_barrier (void);
+extern bool GOMP_barrier_cancel (void);
 
 /* critical.c */
 
@@ -76,9 +77,22 @@ extern void GOMP_parallel_loop_guided_st
 					     unsigned, long, long, long, long);
 extern void GOMP_parallel_loop_runtime_start (void (*)(void *), void *,
 					      unsigned, long, long, long);
+extern void GOMP_parallel_loop_static (void (*)(void *), void *,
+				       unsigned, long, long, long, long,
+				       unsigned);
+extern void GOMP_parallel_loop_dynamic (void (*)(void *), void *,
+					unsigned, long, long, long, long,
+					unsigned);
+extern void GOMP_parallel_loop_guided (void (*)(void *), void *,
+				       unsigned, long, long, long, long,
+				       unsigned);
+extern void GOMP_parallel_loop_runtime (void (*)(void *), void *,
+					unsigned, long, long, long,
+					unsigned);
 
 extern void GOMP_loop_end (void);
 extern void GOMP_loop_end_nowait (void);
+extern bool GOMP_loop_end_cancel (void);
 
 /* loop_ull.c */
 
@@ -157,13 +171,18 @@ extern void GOMP_ordered_end (void);
 
 extern void GOMP_parallel_start (void (*) (void *), void *, unsigned);
 extern void GOMP_parallel_end (void);
+extern void GOMP_parallel (void (*) (void *), void *, unsigned, unsigned);
+extern bool GOMP_cancel (int, bool);
+extern bool GOMP_cancellation_point (int);
 
 /* task.c */
 
 extern void GOMP_task (void (*) (void *), void *, void (*) (void *, void *),
-		       long, long, bool, unsigned);
+		       long, long, bool, unsigned, void **);
 extern void GOMP_taskwait (void);
 extern void GOMP_taskyield (void);
+extern void GOMP_taskgroup_start (void);
+extern void GOMP_taskgroup_end (void);
 
 /* sections.c */
 
@@ -171,8 +190,11 @@ extern unsigned GOMP_sections_start (uns
 extern unsigned GOMP_sections_next (void);
 extern void GOMP_parallel_sections_start (void (*) (void *), void *,
 					  unsigned, unsigned);
+extern void GOMP_parallel_sections (void (*) (void *), void *,
+				    unsigned, unsigned, unsigned);
 extern void GOMP_sections_end (void);
 extern void GOMP_sections_end_nowait (void);
+extern bool GOMP_sections_end_cancel (void);
 
 /* single.c */
 
@@ -180,4 +202,15 @@ extern bool GOMP_single_start (void);
 extern void *GOMP_single_copy_start (void);
 extern void GOMP_single_copy_end (void *);
 
+/* target.c */
+
+extern void GOMP_target (int, void (*) (void *), const void *,
+			 size_t, void **, size_t *, unsigned char *);
+extern void GOMP_target_data (int, const void *,
+			      size_t, void **, size_t *, unsigned char *);
+extern void GOMP_target_end_data (void);
+extern void GOMP_target_update (int, const void *,
+				size_t, void **, size_t *, unsigned char *);
+extern void GOMP_teams (unsigned int, unsigned int);
+
 #endif /* LIBGOMP_G_H */
--- libgomp/fortran.c	(revision 210461)
+++ libgomp/fortran.c	(revision 210462)
@@ -31,11 +31,6 @@
 
 #ifdef HAVE_ATTRIBUTE_ALIAS
 /* Use internal aliases if possible.  */
-# define ULP		STR1(__USER_LABEL_PREFIX__)
-# define STR1(x)	STR2(x)
-# define STR2(x)	#x
-# define ialias_redirect(fn) \
-  extern __typeof (fn) fn __asm__ (ULP "gomp_ialias_" #fn) attribute_hidden;
 # ifndef LIBGOMP_GNU_SYMBOL_VERSIONING
 ialias_redirect (omp_init_lock)
 ialias_redirect (omp_init_nest_lock)
@@ -70,6 +65,14 @@ ialias_redirect (omp_get_ancestor_thread
 ialias_redirect (omp_get_team_size)
 ialias_redirect (omp_get_active_level)
 ialias_redirect (omp_in_final)
+ialias_redirect (omp_get_cancellation)
+ialias_redirect (omp_get_proc_bind)
+ialias_redirect (omp_set_default_device)
+ialias_redirect (omp_get_default_device)
+ialias_redirect (omp_get_num_devices)
+ialias_redirect (omp_get_num_teams)
+ialias_redirect (omp_get_team_num)
+ialias_redirect (omp_is_initial_device)
 #endif
 
 #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING
@@ -435,3 +438,57 @@ omp_in_final_ (void)
 {
   return omp_in_final ();
 }
+
+int32_t
+omp_get_cancellation_ (void)
+{
+  return omp_get_cancellation ();
+}
+
+int32_t
+omp_get_proc_bind_ (void)
+{
+  return omp_get_proc_bind ();
+}
+
+void
+omp_set_default_device_ (const int32_t *device_num)
+{
+  return omp_set_default_device (*device_num);
+}
+
+void
+omp_set_default_device_8_ (const int64_t *device_num)
+{
+  return omp_set_default_device (TO_INT (*device_num));
+}
+
+int32_t
+omp_get_default_device_ (void)
+{
+  return omp_get_default_device ();
+}
+
+int32_t
+omp_get_num_devices_ (void)
+{
+  return omp_get_num_devices ();
+}
+
+int32_t
+omp_get_num_teams_ (void)
+{
+  return omp_get_num_teams ();
+}
+
+int32_t
+omp_get_team_num_ (void)
+{
+  return omp_get_team_num ();
+}
+
+int32_t
+omp_is_initial_device_ (void)
+{
+  return omp_is_initial_device ();
+}
--- libgomp/libgomp.map	(revision 210461)
+++ libgomp/libgomp.map	(revision 210462)
@@ -113,6 +113,27 @@ OMP_3.1 {
 	omp_in_final_;
 } OMP_3.0;
 
+OMP_4.0 {
+  global:
+	omp_get_cancellation;
+	omp_get_cancellation_;
+	omp_get_proc_bind;
+	omp_get_proc_bind_;
+	omp_set_default_device;
+	omp_set_default_device_;
+	omp_set_default_device_8_;
+	omp_get_default_device;
+	omp_get_default_device_;
+	omp_get_num_devices;
+	omp_get_num_devices_;
+	omp_get_num_teams;
+	omp_get_num_teams_;
+	omp_get_team_num;
+	omp_get_team_num_;
+	omp_is_initial_device;
+	omp_is_initial_device_;
+} OMP_3.1;
+
 GOMP_1.0 {
   global:
 	GOMP_atomic_end;
@@ -184,3 +205,25 @@ GOMP_3.0 {
   global:
 	GOMP_taskyield;
 } GOMP_2.0;
+
+GOMP_4.0 {
+  global:
+	GOMP_barrier_cancel;
+	GOMP_cancel;
+	GOMP_cancellation_point;
+	GOMP_loop_end_cancel;
+	GOMP_parallel_loop_dynamic;
+	GOMP_parallel_loop_guided;
+	GOMP_parallel_loop_runtime;
+	GOMP_parallel_loop_static;
+	GOMP_parallel_sections;
+	GOMP_parallel;
+	GOMP_sections_end_cancel;
+	GOMP_taskgroup_start;
+	GOMP_taskgroup_end;
+	GOMP_target;
+	GOMP_target_data;
+	GOMP_target_end_data;
+	GOMP_target_update;
+	GOMP_teams;
+} GOMP_3.0;
--- libgomp/team.c	(revision 210461)
+++ libgomp/team.c	(revision 210462)
@@ -53,6 +53,7 @@ struct gomp_thread_start_data
   struct gomp_team_state ts;
   struct gomp_task *task;
   struct gomp_thread_pool *thread_pool;
+  unsigned int place;
   bool nested;
 };
 
@@ -84,6 +85,7 @@ gomp_thread_start (void *xdata)
   thr->thread_pool = data->thread_pool;
   thr->ts = data->ts;
   thr->task = data->task;
+  thr->place = data->place;
 
   thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
 
@@ -98,7 +100,7 @@ gomp_thread_start (void *xdata)
       gomp_barrier_wait (&team->barrier);
 
       local_fn (local_data);
-      gomp_team_barrier_wait (&team->barrier);
+      gomp_team_barrier_wait_final (&team->barrier);
       gomp_finish_task (task);
       gomp_barrier_wait_last (&team->barrier);
     }
@@ -113,7 +115,7 @@ gomp_thread_start (void *xdata)
 	  struct gomp_task *task = thr->task;
 
 	  local_fn (local_data);
-	  gomp_team_barrier_wait (&team->barrier);
+	  gomp_team_barrier_wait_final (&team->barrier);
 	  gomp_finish_task (task);
 
 	  gomp_barrier_wait (&pool->threads_dock);
@@ -126,6 +128,8 @@ gomp_thread_start (void *xdata)
     }
 
   gomp_sem_destroy (&thr->release);
+  thr->thread_pool = NULL;
+  thr->task = NULL;
   return NULL;
 }
 
@@ -149,6 +153,7 @@ gomp_new_team (unsigned nthreads)
 #else
   gomp_mutex_init (&team->work_share_list_free_lock);
 #endif
+  team->work_shares_to_free = &team->work_shares[0];
   gomp_init_work_share (&team->work_shares[0], false, nthreads);
   team->work_shares[0].next_alloc = NULL;
   team->work_share_list_free = NULL;
@@ -167,7 +172,10 @@ gomp_new_team (unsigned nthreads)
   gomp_mutex_init (&team->task_lock);
   team->task_queue = NULL;
   team->task_count = 0;
+  team->task_queued_count = 0;
   team->task_running_count = 0;
+  team->work_share_cancelled = 0;
+  team->team_cancelled = 0;
 
   return team;
 }
@@ -199,16 +207,19 @@ static struct gomp_thread_pool *gomp_new
 static void
 gomp_free_pool_helper (void *thread_pool)
 {
+  struct gomp_thread *thr = gomp_thread ();
   struct gomp_thread_pool *pool
     = (struct gomp_thread_pool *) thread_pool;
   gomp_barrier_wait_last (&pool->threads_dock);
-  gomp_sem_destroy (&gomp_thread ()->release);
+  gomp_sem_destroy (&thr->release);
+  thr->thread_pool = NULL;
+  thr->task = NULL;
   pthread_exit (NULL);
 }
 
 /* Free a thread pool and release its threads. */
 
-static void
+void
 gomp_free_thread (void *arg __attribute__((unused)))
 {
   struct gomp_thread *thr = gomp_thread ();
@@ -236,9 +247,9 @@ gomp_free_thread (void *arg __attribute_
 	  __sync_fetch_and_add (&gomp_managed_threads,
 				1L - pool->threads_used);
 #else
-	  gomp_mutex_lock (&gomp_remaining_threads_lock);
+	  gomp_mutex_lock (&gomp_managed_threads_lock);
 	  gomp_managed_threads -= pool->threads_used - 1L;
-	  gomp_mutex_unlock (&gomp_remaining_threads_lock);
+	  gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
 	}
       free (pool->threads);
@@ -259,7 +270,7 @@ gomp_free_thread (void *arg __attribute_
 
 void
 gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
-		 struct gomp_team *team)
+		 unsigned flags, struct gomp_team *team)
 {
   struct gomp_thread_start_data *start_data;
   struct gomp_thread *thr, *nthr;
@@ -270,17 +281,24 @@ gomp_team_start (void (*fn) (void *), vo
   unsigned i, n, old_threads_used = 0;
   pthread_attr_t thread_attr, *attr;
   unsigned long nthreads_var;
+  char bind, bind_var;
+  unsigned int s = 0, rest = 0, p = 0, k = 0;
+  unsigned int affinity_count = 0;
+  struct gomp_thread **affinity_thr = NULL;
 
   thr = gomp_thread ();
   nested = thr->ts.team != NULL;
   if (__builtin_expect (thr->thread_pool == NULL, 0))
     {
       thr->thread_pool = gomp_new_thread_pool ();
+      thr->thread_pool->threads_busy = nthreads;
       pthread_setspecific (gomp_thread_destructor, thr);
     }
   pool = thr->thread_pool;
   task = thr->task;
   icv = task ? &task->icv : &gomp_global_icv;
+  if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
+    gomp_init_affinity ();
 
   /* Always save the previous state, even if this isn't a nested team.
      In particular, we should save any work share state from an outer
@@ -303,14 +321,90 @@ gomp_team_start (void (*fn) (void *), vo
   if (__builtin_expect (gomp_nthreads_var_list != NULL, 0)
       && thr->ts.level < gomp_nthreads_var_list_len)
     nthreads_var = gomp_nthreads_var_list[thr->ts.level];
+  bind_var = icv->bind_var;
+  if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
+    bind_var = flags & 7;
+  bind = bind_var;
+  if (__builtin_expect (gomp_bind_var_list != NULL, 0)
+      && thr->ts.level < gomp_bind_var_list_len)
+    bind_var = gomp_bind_var_list[thr->ts.level];
   gomp_init_task (thr->task, task, icv);
   team->implicit_task[0].icv.nthreads_var = nthreads_var;
+  team->implicit_task[0].icv.bind_var = bind_var;
 
   if (nthreads == 1)
     return;
 
   i = 1;
 
+  if (__builtin_expect (gomp_places_list != NULL, 0))
+    {
+      /* Depending on chosen proc_bind model, set subpartition
+	 for the master thread and initialize helper variables
+	 P and optionally S, K and/or REST used by later place
+	 computation for each additional thread.  */
+      p = thr->place - 1;
+      switch (bind)
+	{
+	case omp_proc_bind_true:
+	case omp_proc_bind_close:
+	  if (nthreads > thr->ts.place_partition_len)
+	    {
+	      /* T > P.  S threads will be placed in each place,
+		 and the final REM threads placed one by one
+		 into the already occupied places.  */
+	      s = nthreads / thr->ts.place_partition_len;
+	      rest = nthreads % thr->ts.place_partition_len;
+	    }
+	  else
+	    s = 1;
+	  k = 1;
+	  break;
+	case omp_proc_bind_master:
+	  /* Each thread will be bound to master's place.  */
+	  break;
+	case omp_proc_bind_spread:
+	  if (nthreads <= thr->ts.place_partition_len)
+	    {
+	      /* T <= P.  Each subpartition will have in between s
+		 and s+1 places (subpartitions starting at or
+		 after rest will have s places, earlier s+1 places),
+		 each thread will be bound to the first place in
+		 its subpartition (except for the master thread
+		 that can be bound to another place in its
+		 subpartition).  */
+	      s = thr->ts.place_partition_len / nthreads;
+	      rest = thr->ts.place_partition_len % nthreads;
+	      rest = (s + 1) * rest + thr->ts.place_partition_off;
+	      if (p < rest)
+		{
+		  p -= (p - thr->ts.place_partition_off) % (s + 1);
+		  thr->ts.place_partition_len = s + 1;
+		}
+	      else
+		{
+		  p -= (p - rest) % s;
+		  thr->ts.place_partition_len = s;
+		}
+	      thr->ts.place_partition_off = p;
+	    }
+	  else
+	    {
+	      /* T > P.  Each subpartition will have just a single
+		 place and we'll place between s and s+1
+		 threads into each subpartition.  */
+	      s = nthreads / thr->ts.place_partition_len;
+	      rest = nthreads % thr->ts.place_partition_len;
+	      thr->ts.place_partition_off = p;
+	      thr->ts.place_partition_len = 1;
+	      k = 1;
+	    }
+	  break;
+	}
+    }
+  else
+    bind = omp_proc_bind_false;
+
   /* We only allow the reuse of idle threads for non-nested PARALLEL
      regions.  This appears to be implied by the semantics of
      threadprivate variables, but perhaps that's reading too much into
@@ -341,47 +435,244 @@ gomp_team_start (void (*fn) (void *), vo
 	 team will exit.  */
       pool->threads_used = nthreads;
 
+      /* If necessary, expand the size of the gomp_threads array.  It is
+	 expected that changes in the number of threads are rare, thus we
+	 make no effort to expand gomp_threads_size geometrically.  */
+      if (nthreads >= pool->threads_size)
+	{
+	  pool->threads_size = nthreads + 1;
+	  pool->threads
+	    = gomp_realloc (pool->threads,
+			    pool->threads_size
+			    * sizeof (struct gomp_thread_data *));
+	}
+
       /* Release existing idle threads.  */
       for (; i < n; ++i)
 	{
-	  nthr = pool->threads[i];
+	  unsigned int place_partition_off = thr->ts.place_partition_off;
+	  unsigned int place_partition_len = thr->ts.place_partition_len;
+	  unsigned int place = 0;
+	  if (__builtin_expect (gomp_places_list != NULL, 0))
+	    {
+	      switch (bind)
+		{
+		case omp_proc_bind_true:
+		case omp_proc_bind_close:
+		  if (k == s)
+		    {
+		      ++p;
+		      if (p == (team->prev_ts.place_partition_off
+				+ team->prev_ts.place_partition_len))
+			p = team->prev_ts.place_partition_off;
+		      k = 1;
+		      if (i == nthreads - rest)
+			s = 1;
+		    }
+		  else
+		    ++k;
+		  break;
+		case omp_proc_bind_master:
+		  break;
+		case omp_proc_bind_spread:
+		  if (k == 0)
+		    {
+		      /* T <= P.  */
+		      if (p < rest)
+			p += s + 1;
+		      else
+			p += s;
+		      if (p == (team->prev_ts.place_partition_off
+				+ team->prev_ts.place_partition_len))
+			p = team->prev_ts.place_partition_off;
+		      place_partition_off = p;
+		      if (p < rest)
+			place_partition_len = s + 1;
+		      else
+			place_partition_len = s;
+		    }
+		  else
+		    {
+		      /* T > P.  */
+		      if (k == s)
+			{
+			  ++p;
+			  if (p == (team->prev_ts.place_partition_off
+				    + team->prev_ts.place_partition_len))
+			    p = team->prev_ts.place_partition_off;
+			  k = 1;
+			  if (i == nthreads - rest)
+			    s = 1;
+			}
+		      else
+			++k;
+		      place_partition_off = p;
+		      place_partition_len = 1;
+		    }
+		  break;
+		}
+	      if (affinity_thr != NULL
+		  || (bind != omp_proc_bind_true
+		      && pool->threads[i]->place != p + 1)
+		  || pool->threads[i]->place <= place_partition_off
+		  || pool->threads[i]->place > (place_partition_off
+						+ place_partition_len))
+		{
+		  unsigned int l;
+		  if (affinity_thr == NULL)
+		    {
+		      unsigned int j;
+
+		      if (team->prev_ts.place_partition_len > 64)
+			affinity_thr
+			  = gomp_malloc (team->prev_ts.place_partition_len
+					 * sizeof (struct gomp_thread *));
+		      else
+			affinity_thr
+			  = gomp_alloca (team->prev_ts.place_partition_len
+					 * sizeof (struct gomp_thread *));
+		      memset (affinity_thr, '\0',
+			      team->prev_ts.place_partition_len
+			      * sizeof (struct gomp_thread *));
+		      for (j = i; j < old_threads_used; j++)
+			{
+			  if (pool->threads[j]->place
+			      > team->prev_ts.place_partition_off
+			      && (pool->threads[j]->place
+				  <= (team->prev_ts.place_partition_off
+				      + team->prev_ts.place_partition_len)))
+			    {
+			      l = pool->threads[j]->place - 1
+				  - team->prev_ts.place_partition_off;
+			      pool->threads[j]->data = affinity_thr[l];
+			      affinity_thr[l] = pool->threads[j];
+			    }
+			  pool->threads[j] = NULL;
+			}
+		      if (nthreads > old_threads_used)
+			memset (&pool->threads[old_threads_used],
+				'\0', ((nthreads - old_threads_used)
+				       * sizeof (struct gomp_thread *)));
+		      n = nthreads;
+		      affinity_count = old_threads_used - i;
+		    }
+		  if (affinity_count == 0)
+		    break;
+		  l = p;
+		  if (affinity_thr[l - team->prev_ts.place_partition_off]
+		      == NULL)
+		    {
+		      if (bind != omp_proc_bind_true)
+			continue;
+		      for (l = place_partition_off;
+			   l < place_partition_off + place_partition_len;
+			   l++)
+			if (affinity_thr[l - team->prev_ts.place_partition_off]
+			    != NULL)
+			  break;
+		      if (l == place_partition_off + place_partition_len)
+			continue;
+		    }
+		  nthr = affinity_thr[l - team->prev_ts.place_partition_off];
+		  affinity_thr[l - team->prev_ts.place_partition_off]
+		    = (struct gomp_thread *) nthr->data;
+		  affinity_count--;
+		  pool->threads[i] = nthr;
+		}
+	      else
+		nthr = pool->threads[i];
+	      place = p + 1;
+	    }
+	  else
+	    nthr = pool->threads[i];
 	  nthr->ts.team = team;
 	  nthr->ts.work_share = &team->work_shares[0];
 	  nthr->ts.last_work_share = NULL;
 	  nthr->ts.team_id = i;
 	  nthr->ts.level = team->prev_ts.level + 1;
 	  nthr->ts.active_level = thr->ts.active_level;
+	  nthr->ts.place_partition_off = place_partition_off;
+	  nthr->ts.place_partition_len = place_partition_len;
 #ifdef HAVE_SYNC_BUILTINS
 	  nthr->ts.single_count = 0;
 #endif
 	  nthr->ts.static_trip = 0;
 	  nthr->task = &team->implicit_task[i];
+	  nthr->place = place;
 	  gomp_init_task (nthr->task, task, icv);
 	  team->implicit_task[i].icv.nthreads_var = nthreads_var;
+	  team->implicit_task[i].icv.bind_var = bind_var;
 	  nthr->fn = fn;
 	  nthr->data = data;
 	  team->ordered_release[i] = &nthr->release;
 	}
 
+      if (__builtin_expect (affinity_thr != NULL, 0))
+	{
+	  /* If AFFINITY_THR is non-NULL just because we had to
+	     permute some threads in the pool, but we've managed
+	     to find exactly as many old threads as we'd find
+	     without affinity, we don't need to handle this
+	     specially anymore.  */
+	  if (nthreads <= old_threads_used
+	      ? (affinity_count == old_threads_used - nthreads)
+	      : (i == old_threads_used))
+	    {
+	      if (team->prev_ts.place_partition_len > 64)
+		free (affinity_thr);
+	      affinity_thr = NULL;
+	      affinity_count = 0;
+	    }
+	  else
+	    {
+	      i = 1;
+	      /* We are going to compute the places/subpartitions
+		 again from the beginning.  So, we need to reinitialize
+		 vars modified by the switch (bind) above inside
+		 of the loop, to the state they had after the initial
+		 switch (bind).  */
+	      switch (bind)
+		{
+		case omp_proc_bind_true:
+		case omp_proc_bind_close:
+		  if (nthreads > thr->ts.place_partition_len)
+		    /* T > P.  S has been changed, so needs
+		       to be recomputed.  */
+		    s = nthreads / thr->ts.place_partition_len;
+		  k = 1;
+		  p = thr->place - 1;
+		  break;
+		case omp_proc_bind_master:
+		  /* No vars have been changed.  */
+		  break;
+		case omp_proc_bind_spread:
+		  p = thr->ts.place_partition_off;
+		  if (k != 0)
+		    {
+		      /* T > P.  */
+		      s = nthreads / team->prev_ts.place_partition_len;
+		      k = 1;
+		    }
+		  break;
+		}
+
+	      /* Increase the barrier threshold to make sure all new
+		 threads and all the threads we're going to let die
+		 arrive before the team is released.  */
+	      if (affinity_count)
+		gomp_barrier_reinit (&pool->threads_dock,
+				     nthreads + affinity_count);
+	    }
+	}
+
       if (i == nthreads)
 	goto do_release;
 
-      /* If necessary, expand the size of the gomp_threads array.  It is
-	 expected that changes in the number of threads are rare, thus we
-	 make no effort to expand gomp_threads_size geometrically.  */
-      if (nthreads >= pool->threads_size)
-	{
-	  pool->threads_size = nthreads + 1;
-	  pool->threads
-	    = gomp_realloc (pool->threads,
-			    pool->threads_size
-			    * sizeof (struct gomp_thread_data *));
-	}
     }
 
-  if (__builtin_expect (nthreads > old_threads_used, 0))
+  if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
     {
-      long diff = (long) nthreads - (long) old_threads_used;
+      long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;
 
       if (old_threads_used == 0)
 	--diff;
@@ -389,14 +680,14 @@ gomp_team_start (void (*fn) (void *), vo
 #ifdef HAVE_SYNC_BUILTINS
       __sync_fetch_and_add (&gomp_managed_threads, diff);
 #else
-      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_mutex_lock (&gomp_managed_threads_lock);
       gomp_managed_threads += diff;
-      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+      gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
     }
 
   attr = &gomp_thread_attr;
-  if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
+  if (__builtin_expect (gomp_places_list != NULL, 0))
     {
       size_t stacksize;
       pthread_attr_init (&thread_attr);
@@ -410,11 +701,78 @@ gomp_team_start (void (*fn) (void *), vo
 			    * (nthreads-i));
 
   /* Launch new threads.  */
-  for (; i < nthreads; ++i, ++start_data)
+  for (; i < nthreads; ++i)
     {
       pthread_t pt;
       int err;
 
+      start_data->ts.place_partition_off = thr->ts.place_partition_off;
+      start_data->ts.place_partition_len = thr->ts.place_partition_len;
+      start_data->place = 0;
+      if (__builtin_expect (gomp_places_list != NULL, 0))
+	{
+	  switch (bind)
+	    {
+	    case omp_proc_bind_true:
+	    case omp_proc_bind_close:
+	      if (k == s)
+		{
+		  ++p;
+		  if (p == (team->prev_ts.place_partition_off
+			    + team->prev_ts.place_partition_len))
+		    p = team->prev_ts.place_partition_off;
+		  k = 1;
+		  if (i == nthreads - rest)
+		    s = 1;
+		}
+	      else
+		++k;
+	      break;
+	    case omp_proc_bind_master:
+	      break;
+	    case omp_proc_bind_spread:
+	      if (k == 0)
+		{
+		  /* T <= P.  */
+		  if (p < rest)
+		    p += s + 1;
+		  else
+		    p += s;
+		  if (p == (team->prev_ts.place_partition_off
+			    + team->prev_ts.place_partition_len))
+		    p = team->prev_ts.place_partition_off;
+		  start_data->ts.place_partition_off = p;
+		  if (p < rest)
+		    start_data->ts.place_partition_len = s + 1;
+		  else
+		    start_data->ts.place_partition_len = s;
+		}
+	      else
+		{
+		  /* T > P.  */
+		  if (k == s)
+		    {
+		      ++p;
+		      if (p == (team->prev_ts.place_partition_off
+				+ team->prev_ts.place_partition_len))
+			p = team->prev_ts.place_partition_off;
+		      k = 1;
+		      if (i == nthreads - rest)
+			s = 1;
+		    }
+		  else
+		    ++k;
+		  start_data->ts.place_partition_off = p;
+		  start_data->ts.place_partition_len = 1;
+		}
+	      break;
+	    }
+	  start_data->place = p + 1;
+	  if (affinity_thr != NULL && pool->threads[i] != NULL)
+	    continue;
+	  gomp_init_thread_affinity (attr, p);
+	}
+
       start_data->fn = fn;
       start_data->fn_data = data;
       start_data->ts.team = team;
@@ -430,18 +788,16 @@ gomp_team_start (void (*fn) (void *), vo
       start_data->task = &team->implicit_task[i];
       gomp_init_task (start_data->task, task, icv);
       team->implicit_task[i].icv.nthreads_var = nthreads_var;
+      team->implicit_task[i].icv.bind_var = bind_var;
       start_data->thread_pool = pool;
       start_data->nested = nested;
 
-      if (gomp_cpu_affinity != NULL)
-	gomp_init_thread_affinity (attr);
-
-      err = pthread_create (&pt, attr, gomp_thread_start, start_data);
+      err = pthread_create (&pt, attr, gomp_thread_start, start_data++);
       if (err != 0)
 	gomp_fatal ("Thread creation failed: %s", strerror (err));
     }
 
-  if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
+  if (__builtin_expect (gomp_places_list != NULL, 0))
     pthread_attr_destroy (&thread_attr);
 
  do_release:
@@ -450,21 +806,32 @@ gomp_team_start (void (*fn) (void *), vo
   /* Decrease the barrier threshold to match the number of threads
      that should arrive back at the end of this team.  The extra
      threads should be exiting.  Note that we arrange for this test
-     to never be true for nested teams.  */
-  if (__builtin_expect (nthreads < old_threads_used, 0))
+     to never be true for nested teams.  If AFFINITY_COUNT is non-zero,
+     the barrier as well as gomp_managed_threads was temporarily
+     set to NTHREADS + AFFINITY_COUNT.  For NTHREADS < OLD_THREADS_COUNT,
+     AFFINITY_COUNT if non-zero will be always at least
+     OLD_THREADS_COUNT - NTHREADS.  */
+  if (__builtin_expect (nthreads < old_threads_used, 0)
+      || __builtin_expect (affinity_count, 0))
     {
       long diff = (long) nthreads - (long) old_threads_used;
 
+      if (affinity_count)
+	diff = -affinity_count;
+
       gomp_barrier_reinit (&pool->threads_dock, nthreads);
 
 #ifdef HAVE_SYNC_BUILTINS
       __sync_fetch_and_add (&gomp_managed_threads, diff);
 #else
-      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_mutex_lock (&gomp_managed_threads_lock);
       gomp_managed_threads += diff;
-      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+      gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
     }
+  if (__builtin_expect (affinity_thr != NULL, 0)
+      && team->prev_ts.place_partition_len > 64)
+    free (affinity_thr);
 }
 
 
@@ -477,9 +844,26 @@ gomp_team_end (void)
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
 
-  /* This barrier handles all pending explicit threads.  */
-  gomp_team_barrier_wait (&team->barrier);
-  gomp_fini_work_share (thr->ts.work_share);
+  /* This barrier handles all pending explicit threads.
+     As #pragma omp cancel parallel might get awaited count in
+     team->barrier in a inconsistent state, we need to use a different
+     counter here.  */
+  gomp_team_barrier_wait_final (&team->barrier);
+  if (__builtin_expect (team->team_cancelled, 0))
+    {
+      struct gomp_work_share *ws = team->work_shares_to_free;
+      do
+	{
+	  struct gomp_work_share *next_ws = gomp_ptrlock_get (&ws->next_ws);
+	  if (next_ws == NULL)
+	    gomp_ptrlock_set (&ws->next_ws, ws);
+	  gomp_fini_work_share (ws);
+	  ws = next_ws;
+	}
+      while (ws != NULL);
+    }
+  else
+    gomp_fini_work_share (thr->ts.work_share);
 
   gomp_end_task ();
   thr->ts = team->prev_ts;
@@ -489,9 +873,9 @@ gomp_team_end (void)
 #ifdef HAVE_SYNC_BUILTINS
       __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
 #else
-      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_mutex_lock (&gomp_managed_threads_lock);
       gomp_managed_threads -= team->nthreads - 1L;
-      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+      gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
       /* This barrier has gomp_barrier_wait_last counterparts
 	 and ensures the team can be safely destroyed.  */
@@ -532,8 +916,6 @@ gomp_team_end (void)
 static void __attribute__((constructor))
 initialize_team (void)
 {
-  struct gomp_thread *thr;
-
 #ifndef HAVE_TLS
   static struct gomp_thread initial_thread_tls_data;
 
@@ -543,13 +925,6 @@ initialize_team (void)
 
   if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0)
     gomp_fatal ("could not create thread pool destructor.");
-
-#ifdef HAVE_TLS
-  thr = &gomp_tls_data;
-#else
-  thr = &initial_thread_tls_data;
-#endif
-  gomp_sem_init (&thr->release, 0);
 }
 
 static void __attribute__((destructor))
--- libgomp/sections.c	(revision 210461)
+++ libgomp/sections.c	(revision 210462)
@@ -139,11 +139,27 @@ GOMP_parallel_sections_start (void (*fn)
   num_threads = gomp_resolve_num_threads (num_threads, count);
   team = gomp_new_team (num_threads);
   gomp_sections_init (&team->work_shares[0], count);
-  gomp_team_start (fn, data, num_threads, team);
+  gomp_team_start (fn, data, num_threads, 0, team);
+}
+
+ialias_redirect (GOMP_parallel_end)
+
+void
+GOMP_parallel_sections (void (*fn) (void *), void *data,
+			unsigned num_threads, unsigned count, unsigned flags)
+{
+  struct gomp_team *team;
+
+  num_threads = gomp_resolve_num_threads (num_threads, count);
+  team = gomp_new_team (num_threads);
+  gomp_sections_init (&team->work_shares[0], count);
+  gomp_team_start (fn, data, num_threads, flags, team);
+  fn (data);
+  GOMP_parallel_end ();
 }
 
 /* The GOMP_section_end* routines are called after the thread is told
-   that all sections are complete.  This first version synchronizes
+   that all sections are complete.  The first two versions synchronize
    all threads; the nowait version does not.  */
 
 void
@@ -152,6 +168,12 @@ GOMP_sections_end (void)
   gomp_work_share_end ();
 }
 
+bool
+GOMP_sections_end_cancel (void)
+{
+  return gomp_work_share_end_cancel ();
+}
+
 void
 GOMP_sections_end_nowait (void)
 {
--- libgomp/env.c	(revision 210461)
+++ libgomp/env.c	(revision 210462)
@@ -29,6 +29,10 @@
 #include "libgomp_f.h"
 #include <ctype.h>
 #include <stdlib.h>
+#include <stdio.h>
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>	/* For PRIu64.  */
+#endif
 #ifdef STRING_WITH_STRINGS
 # include <string.h>
 # include <strings.h>
@@ -50,23 +54,28 @@
 
 struct gomp_task_icv gomp_global_icv = {
   .nthreads_var = 1,
+  .thread_limit_var = UINT_MAX,
   .run_sched_var = GFS_DYNAMIC,
   .run_sched_modifier = 1,
+  .default_device_var = 0,
   .dyn_var = false,
-  .nest_var = false
+  .nest_var = false,
+  .bind_var = omp_proc_bind_false,
+  .target_data = NULL
 };
 
-unsigned short *gomp_cpu_affinity;
-size_t gomp_cpu_affinity_len;
 unsigned long gomp_max_active_levels_var = INT_MAX;
-unsigned long gomp_thread_limit_var = ULONG_MAX;
-unsigned long gomp_remaining_threads_count;
+bool gomp_cancel_var = false;
 #ifndef HAVE_SYNC_BUILTINS
-gomp_mutex_t gomp_remaining_threads_lock;
+gomp_mutex_t gomp_managed_threads_lock;
 #endif
 unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
 unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
 unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
+char *gomp_bind_var_list;
+unsigned long gomp_bind_var_list_len;
+void **gomp_places_list;
+unsigned long gomp_places_list_len;
 
 /* Parse the OMP_SCHEDULE environment variable.  */
 
@@ -184,6 +193,24 @@ parse_unsigned_long (const char *name, u
   return false;
 }
 
+/* Parse a positive int environment variable.  Return true if one was
+   present and it was successfully parsed.  */
+
+static bool
+parse_int (const char *name, int *pvalue, bool allow_zero)
+{
+  unsigned long value;
+  if (!parse_unsigned_long (name, &value, allow_zero))
+    return false;
+  if (value > INT_MAX)
+    {
+      gomp_error ("Invalid value for environment variable %s", name);
+      return false;
+    }
+  *pvalue = (int) value;
+  return true;
+}
+
 /* Parse an unsigned long list environment variable.  Return true if one was
    present and it was successfully parsed.  */
 
@@ -273,6 +300,416 @@ parse_unsigned_long_list (const char *na
   return false;
 }
 
+/* Parse environment variable set to a boolean or list of omp_proc_bind_t
+   enum values.  Return true if one was present and it was successfully
+   parsed.  */
+
+static bool
+parse_bind_var (const char *name, char *p1stvalue,
+		char **pvalues, unsigned long *pnvalues)
+{
+  char *env;
+  char value = omp_proc_bind_false, *values = NULL;
+  int i;
+  static struct proc_bind_kinds
+  {
+    const char name[7];
+    const char len;
+    omp_proc_bind_t kind;
+  } kinds[] =
+  {
+    { "false", 5, omp_proc_bind_false },
+    { "true", 4, omp_proc_bind_true },
+    { "master", 6, omp_proc_bind_master },
+    { "close", 5, omp_proc_bind_close },
+    { "spread", 6, omp_proc_bind_spread }
+  };
+
+  env = getenv (name);
+  if (env == NULL)
+    return false;
+
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env == '\0')
+    goto invalid;
+
+  for (i = 0; i < 5; i++)
+    if (strncasecmp (env, kinds[i].name, kinds[i].len) == 0)
+      {
+	value = kinds[i].kind;
+	env += kinds[i].len;
+	break;
+      }
+  if (i == 5)
+    goto invalid;
+
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env != '\0')
+    {
+      if (*env == ',')
+	{
+	  unsigned long nvalues = 0, nalloced = 0;
+
+	  if (value == omp_proc_bind_false
+	      || value == omp_proc_bind_true)
+	    goto invalid;
+
+	  do
+	    {
+	      env++;
+	      if (nvalues == nalloced)
+		{
+		  char *n;
+		  nalloced = nalloced ? nalloced * 2 : 16;
+		  n = realloc (values, nalloced);
+		  if (n == NULL)
+		    {
+		      free (values);
+		      gomp_error ("Out of memory while trying to parse"
+				  " environment variable %s", name);
+		      return false;
+		    }
+		  values = n;
+		  if (nvalues == 0)
+		    values[nvalues++] = value;
+		}
+
+	      while (isspace ((unsigned char) *env))
+		++env;
+	      if (*env == '\0')
+		goto invalid;
+
+	      for (i = 2; i < 5; i++)
+		if (strncasecmp (env, kinds[i].name, kinds[i].len) == 0)
+		  {
+		    value = kinds[i].kind;
+		    env += kinds[i].len;
+		    break;
+		  }
+	      if (i == 5)
+		goto invalid;
+
+	      values[nvalues++] = value;
+	      while (isspace ((unsigned char) *env))
+		++env;
+	      if (*env == '\0')
+		break;
+	      if (*env != ',')
+		goto invalid;
+	    }
+	  while (1);
+	  *p1stvalue = values[0];
+	  *pvalues = values;
+	  *pnvalues = nvalues;
+	  return true;
+	}
+      goto invalid;
+    }
+
+  *p1stvalue = value;
+  return true;
+
+ invalid:
+  free (values);
+  gomp_error ("Invalid value for environment variable %s", name);
+  return false;
+}
+
+static bool
+parse_one_place (char **envp, bool *negatep, unsigned long *lenp,
+		 long *stridep)
+{
+  char *env = *envp, *start;
+  void *p = gomp_places_list ? gomp_places_list[gomp_places_list_len] : NULL;
+  unsigned long len = 1;
+  long stride = 1;
+  int pass;
+  bool any_negate = false;
+  *negatep = false;
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env == '!')
+    {
+      *negatep = true;
+      ++env;
+      while (isspace ((unsigned char) *env))
+	++env;
+    }
+  if (*env != '{')
+    return false;
+  ++env;
+  while (isspace ((unsigned char) *env))
+    ++env;
+  start = env;
+  for (pass = 0; pass < (any_negate ? 2 : 1); pass++)
+    {
+      env = start;
+      do
+	{
+	  unsigned long this_num, this_len = 1;
+	  long this_stride = 1;
+	  bool this_negate = (*env == '!');
+	  if (this_negate)
+	    {
+	      if (gomp_places_list)
+		any_negate = true;
+	      ++env;
+	      while (isspace ((unsigned char) *env))
+		++env;
+	    }
+
+	  errno = 0;
+	  this_num = strtoul (env, &env, 10);
+	  if (errno)
+	    return false;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+	  if (*env == ':')
+	    {
+	      ++env;
+	      while (isspace ((unsigned char) *env))
+		++env;
+	      errno = 0;
+	      this_len = strtoul (env, &env, 10);
+	      if (errno || this_len == 0)
+		return false;
+	      while (isspace ((unsigned char) *env))
+		++env;
+	      if (*env == ':')
+		{
+		  ++env;
+		  while (isspace ((unsigned char) *env))
+		    ++env;
+		  errno = 0;
+		  this_stride = strtol (env, &env, 10);
+		  if (errno)
+		    return false;
+		  while (isspace ((unsigned char) *env))
+		    ++env;
+		}
+	    }
+	  if (this_negate && this_len != 1)
+	    return false;
+	  if (gomp_places_list && pass == this_negate)
+	    {
+	      if (this_negate)
+		{
+		  if (!gomp_affinity_remove_cpu (p, this_num))
+		    return false;
+		}
+	      else if (!gomp_affinity_add_cpus (p, this_num, this_len,
+						this_stride, false))
+		return false;
+	    }
+	  if (*env == '}')
+	    break;
+	  if (*env != ',')
+	    return false;
+	  ++env;
+	}
+      while (1);
+    }
+
+  ++env;
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env == ':')
+    {
+      ++env;
+      while (isspace ((unsigned char) *env))
+	++env;
+      errno = 0;
+      len = strtoul (env, &env, 10);
+      if (errno || len == 0 || len >= 65536)
+	return false;
+      while (isspace ((unsigned char) *env))
+	++env;
+      if (*env == ':')
+	{
+	  ++env;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+	  errno = 0;
+	  stride = strtol (env, &env, 10);
+	  if (errno)
+	    return false;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+	}
+    }
+  if (*negatep && len != 1)
+    return false;
+  *envp = env;
+  *lenp = len;
+  *stridep = stride;
+  return true;
+}
+
+static bool
+parse_places_var (const char *name, bool ignore)
+{
+  char *env = getenv (name), *end;
+  bool any_negate = false;
+  int level = 0;
+  unsigned long count = 0;
+  if (env == NULL)
+    return false;
+
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env == '\0')
+    goto invalid;
+
+  if (strncasecmp (env, "threads", 7) == 0)
+    {
+      env += 7;
+      level = 1;
+    }
+  else if (strncasecmp (env, "cores", 5) == 0)
+    {
+      env += 5;
+      level = 2;
+    }
+  else if (strncasecmp (env, "sockets", 7) == 0)
+    {
+      env += 7;
+      level = 3;
+    }
+  if (level)
+    {
+      count = ULONG_MAX;
+      while (isspace ((unsigned char) *env))
+	++env;
+      if (*env != '\0')
+	{
+	  if (*env++ != '(')
+	    goto invalid;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+
+	  errno = 0;
+	  count = strtoul (env, &end, 10);
+	  if (errno)
+	    goto invalid;
+	  env = end;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+	  if (*env != ')')
+	    goto invalid;
+	  ++env;
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+	  if (*env != '\0')
+	    goto invalid;
+	}
+
+      if (ignore)
+	return false;
+
+      return gomp_affinity_init_level (level, count, false);
+    }
+
+  count = 0;
+  end = env;
+  do
+    {
+      bool negate;
+      unsigned long len;
+      long stride;
+      if (!parse_one_place (&end, &negate, &len, &stride))
+	goto invalid;
+      if (negate)
+	{
+	  if (!any_negate)
+	    count++;
+	  any_negate = true;
+	}
+      else
+	count += len;
+      if (count > 65536)
+	goto invalid;
+      if (*end == '\0')
+	break;
+      if (*end != ',')
+	goto invalid;
+      end++;
+    }
+  while (1);
+
+  if (ignore)
+    return false;
+
+  gomp_places_list_len = 0;
+  gomp_places_list = gomp_affinity_alloc (count, false);
+  if (gomp_places_list == NULL)
+    return false;
+
+  do
+    {
+      bool negate;
+      unsigned long len;
+      long stride;
+      gomp_affinity_init_place (gomp_places_list[gomp_places_list_len]);
+      if (!parse_one_place (&env, &negate, &len, &stride))
+	goto invalid;
+      if (negate)
+	{
+	  void *p;
+	  for (count = 0; count < gomp_places_list_len; count++)
+	    if (gomp_affinity_same_place
+			(gomp_places_list[count],
+			 gomp_places_list[gomp_places_list_len]))
+	      break;
+	  if (count == gomp_places_list_len)
+	    {
+	      gomp_error ("Trying to remove a non-existing place from list "
+			  "of places");
+	      goto invalid;
+	    }
+	  p = gomp_places_list[count];
+	  memmove (&gomp_places_list[count],
+		   &gomp_places_list[count + 1],
+		   (gomp_places_list_len - count - 1) * sizeof (void *));
+	  --gomp_places_list_len;
+	  gomp_places_list[gomp_places_list_len] = p;
+	}
+      else if (len == 1)
+	++gomp_places_list_len;
+      else
+	{
+	  for (count = 0; count < len - 1; count++)
+	    if (!gomp_affinity_copy_place
+			(gomp_places_list[gomp_places_list_len + count + 1],
+			 gomp_places_list[gomp_places_list_len + count],
+			 stride))
+	      goto invalid;
+	  gomp_places_list_len += len;
+	}
+      if (*env == '\0')
+	break;
+      env++;
+    }
+  while (1);
+
+  if (gomp_places_list_len == 0)
+    {
+      gomp_error ("All places have been removed");
+      goto invalid;
+    }
+  if (!gomp_affinity_finalize_place_list (false))
+    goto invalid;
+  return true;
+
+ invalid:
+  free (gomp_places_list);
+  gomp_places_list = NULL;
+  gomp_places_list_len = 0;
+  gomp_error ("Invalid value for environment variable %s", name);
+  return false;
+}
+
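
As an illustration (not part of the patch): parse_places_var above accepts either an abstract name (threads, cores or sockets, optionally followed by a count in parentheses, e.g. OMP_PLACES=cores(4)) or an explicit place list such as OMP_PLACES="{0:4},{4:4:2}", where each <base>:<len>:<stride> subplace ends up in gomp_affinity_add_cpus.  A minimal sketch of that expansion; expand_subplace is a made-up helper, not a libgomp function:

  /* Illustration only: how a "<base>:<len>:<stride>" subplace in OMP_PLACES
     (e.g. OMP_PLACES="{0:4},{4:4:2}") expands to individual CPU numbers.  */
  #include <stdio.h>

  static void
  expand_subplace (unsigned long base, unsigned long len, long stride)
  {
    unsigned long i;
    for (i = 0; i < len; i++)
      printf ("%ld%c", (long) base + (long) i * stride,
              i + 1 == len ? '\n' : ',');
  }

  int
  main (void)
  {
    expand_subplace (0, 4, 1);   /* place {0:4}   -> 0,1,2,3 */
    expand_subplace (4, 4, 2);   /* place {4:4:2} -> 4,6,8,10 */
    return 0;
  }
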
 /* Parse the OMP_STACKSIZE environment variable.  Return true if one was
    present and it was successfully parsed.  */
 
@@ -478,86 +915,95 @@ parse_wait_policy (void)
    present and it was successfully parsed.  */
 
 static bool
-parse_affinity (void)
+parse_affinity (bool ignore)
 {
-  char *env, *end;
+  char *env, *end, *start;
+  int pass;
   unsigned long cpu_beg, cpu_end, cpu_stride;
-  unsigned short *cpus = NULL;
-  size_t allocated = 0, used = 0, needed;
+  size_t count = 0, needed;
 
   env = getenv ("GOMP_CPU_AFFINITY");
   if (env == NULL)
     return false;
 
-  do
+  start = env;
+  for (pass = 0; pass < 2; pass++)
     {
-      while (*env == ' ' || *env == '\t')
-	env++;
-
-      cpu_beg = strtoul (env, &end, 0);
-      cpu_end = cpu_beg;
-      cpu_stride = 1;
-      if (env == end || cpu_beg >= 65536)
-	goto invalid;
+      env = start;
+      if (pass == 1)
+	{
+	  if (ignore)
+	    return false;
 
-      env = end;
-      if (*env == '-')
+	  gomp_places_list_len = 0;
+	  gomp_places_list = gomp_affinity_alloc (count, true);
+	  if (gomp_places_list == NULL)
+	    return false;
+	}
+      do
 	{
-	  cpu_end = strtoul (++env, &end, 0);
-	  if (env == end || cpu_end >= 65536 || cpu_end < cpu_beg)
+	  while (isspace ((unsigned char) *env))
+	    ++env;
+
+	  errno = 0;
+	  cpu_beg = strtoul (env, &end, 0);
+	  if (errno || cpu_beg >= 65536)
 	    goto invalid;
+	  cpu_end = cpu_beg;
+	  cpu_stride = 1;
 
 	  env = end;
-	  if (*env == ':')
+	  if (*env == '-')
 	    {
-	      cpu_stride = strtoul (++env, &end, 0);
-	      if (env == end || cpu_stride == 0 || cpu_stride >= 65536)
+	      errno = 0;
+	      cpu_end = strtoul (++env, &end, 0);
+	      if (errno || cpu_end >= 65536 || cpu_end < cpu_beg)
 		goto invalid;
 
 	      env = end;
-	    }
-	}
+	      if (*env == ':')
+		{
+		  errno = 0;
+		  cpu_stride = strtoul (++env, &end, 0);
+		  if (errno || cpu_stride == 0 || cpu_stride >= 65536)
+		    goto invalid;
 
-      needed = (cpu_end - cpu_beg) / cpu_stride + 1;
-      if (used + needed >= allocated)
-	{
-	  unsigned short *new_cpus;
+		  env = end;
+		}
+	    }
 
-	  if (allocated < 64)
-	    allocated = 64;
-	  if (allocated > needed)
-	    allocated <<= 1;
+	  needed = (cpu_end - cpu_beg) / cpu_stride + 1;
+	  if (pass == 0)
+	    count += needed;
 	  else
-	    allocated += 2 * needed;
-	  new_cpus = realloc (cpus, allocated * sizeof (unsigned short));
-	  if (new_cpus == NULL)
 	    {
-	      free (cpus);
-	      gomp_error ("not enough memory to store GOMP_CPU_AFFINITY list");
-	      return false;
+	      while (needed--)
+		{
+		  void *p = gomp_places_list[gomp_places_list_len];
+		  gomp_affinity_init_place (p);
+		  if (gomp_affinity_add_cpus (p, cpu_beg, 1, 0, true))
+		    ++gomp_places_list_len;
+		  cpu_beg += cpu_stride;
+		}
 	    }
 
-	  cpus = new_cpus;
-	}
+	  while (isspace ((unsigned char) *env))
+	    ++env;
 
-      while (needed--)
-	{
-	  cpus[used++] = cpu_beg;
-	  cpu_beg += cpu_stride;
+	  if (*env == ',')
+	    env++;
+	  else if (*env == '\0')
+	    break;
 	}
-
-      while (*env == ' ' || *env == '\t')
-	env++;
-
-      if (*env == ',')
-	env++;
-      else if (*env == '\0')
-	break;
+      while (1);
     }
-  while (1);
 
-  gomp_cpu_affinity = cpus;
-  gomp_cpu_affinity_len = used;
+  if (gomp_places_list_len == 0)
+    {
+      free (gomp_places_list);
+      gomp_places_list = NULL;
+      return false;
+    }
   return true;
 
  invalid:
@@ -565,12 +1011,160 @@ parse_affinity (void)
   return false;
 }
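
As an aside (not part of the patch): the rewritten parse_affinity now walks GOMP_CPU_AFFINITY twice, first only counting how many CPUs the "beg-end:stride" ranges expand to, then allocating gomp_places_list and filling one single-CPU place per entry.  A self-contained sketch of the same count-then-fill pattern; names are illustrative and error handling is omitted:

  /* Illustration only: two passes over a GOMP_CPU_AFFINITY-style list,
     pass 0 counts entries, pass 1 fills a freshly allocated array.  */
  #include <stdio.h>
  #include <stdlib.h>

  int
  main (void)
  {
    const char *start = "0,3,1-2,4-15:2";
    unsigned long *cpus = NULL;
    size_t count = 0, used = 0, i;
    int pass;

    for (pass = 0; pass < 2; pass++)
      {
        const char *env = start;
        if (pass == 1)
          cpus = malloc (count * sizeof (unsigned long));
        do
          {
            char *end;
            unsigned long beg = strtoul (env, &end, 10);
            unsigned long last = beg, stride = 1;
            env = end;
            if (*env == '-')
              {
                last = strtoul (env + 1, &end, 10);
                env = end;
              }
            if (*env == ':')
              {
                stride = strtoul (env + 1, &end, 10);
                env = end;
              }
            for (; beg <= last; beg += stride)
              {
                if (pass == 0)
                  count++;
                else
                  cpus[used++] = beg;
              }
            if (*env != ',')
              break;
            ++env;
          }
        while (1);
      }

    for (i = 0; i < used; i++)
      printf ("%lu%c", cpus[i], i + 1 == used ? '\n' : ' ');
    free (cpus);
    return 0;
  }
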
 
+
+static void
+handle_omp_display_env (unsigned long stacksize, int wait_policy)
+{
+  const char *env;
+  bool display = false;
+  bool verbose = false;
+  int i;
+
+  env = getenv ("OMP_DISPLAY_ENV");
+  if (env == NULL)
+    return;
+
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (strncasecmp (env, "true", 4) == 0)
+    {
+      display = true;
+      env += 4;
+    }
+  else if (strncasecmp (env, "false", 5) == 0)
+    {
+      display = false;
+      env += 5;
+    }
+  else if (strncasecmp (env, "verbose", 7) == 0)
+    {
+      display = true;
+      verbose = true;
+      env += 7;
+    }
+  else
+    env = "X";
+  while (isspace ((unsigned char) *env))
+    ++env;
+  if (*env != '\0')
+    gomp_error ("Invalid value for environment variable OMP_DISPLAY_ENV");
+
+  if (!display)
+    return;
+
+  fputs ("\nOPENMP DISPLAY ENVIRONMENT BEGIN\n", stderr);
+
+  fputs ("  _OPENMP = '201307'\n", stderr);
+  fprintf (stderr, "  OMP_DYNAMIC = '%s'\n",
+	   gomp_global_icv.dyn_var ? "TRUE" : "FALSE");
+  fprintf (stderr, "  OMP_NESTED = '%s'\n",
+	   gomp_global_icv.nest_var ? "TRUE" : "FALSE");
+
+  fprintf (stderr, "  OMP_NUM_THREADS = '%lu", gomp_global_icv.nthreads_var);
+  for (i = 1; i < gomp_nthreads_var_list_len; i++)
+    fprintf (stderr, ",%lu", gomp_nthreads_var_list[i]);
+  fputs ("'\n", stderr);
+
+  fprintf (stderr, "  OMP_SCHEDULE = '");
+  switch (gomp_global_icv.run_sched_var)
+    {
+    case GFS_RUNTIME:
+      fputs ("RUNTIME", stderr);
+      break;
+    case GFS_STATIC:
+      fputs ("STATIC", stderr);
+      break;
+    case GFS_DYNAMIC:
+      fputs ("DYNAMIC", stderr);
+      break;
+    case GFS_GUIDED:
+      fputs ("GUIDED", stderr);
+      break;
+    case GFS_AUTO:
+      fputs ("AUTO", stderr);
+      break;
+    }
+  fputs ("'\n", stderr);
+
+  fputs ("  OMP_PROC_BIND = '", stderr);
+  switch (gomp_global_icv.bind_var)
+    {
+    case omp_proc_bind_false:
+      fputs ("FALSE", stderr);
+      break;
+    case omp_proc_bind_true:
+      fputs ("TRUE", stderr);
+      break;
+    case omp_proc_bind_master:
+      fputs ("MASTER", stderr);
+      break;
+    case omp_proc_bind_close:
+      fputs ("CLOSE", stderr);
+      break;
+    case omp_proc_bind_spread:
+      fputs ("SPREAD", stderr);
+      break;
+    }
+  for (i = 1; i < gomp_bind_var_list_len; i++)
+    switch (gomp_bind_var_list[i])
+      {
+      case omp_proc_bind_master:
+	fputs (",MASTER", stderr);
+	break;
+      case omp_proc_bind_close:
+	fputs (",CLOSE", stderr);
+	break;
+      case omp_proc_bind_spread:
+	fputs (",SPREAD", stderr);
+	break;
+      }
+  fputs ("'\n", stderr);
+  fputs ("  OMP_PLACES = '", stderr);
+  for (i = 0; i < gomp_places_list_len; i++)
+    {
+      fputs ("{", stderr);
+      gomp_affinity_print_place (gomp_places_list[i]);
+      fputs (i + 1 == gomp_places_list_len ? "}" : "},", stderr);
+    }
+  fputs ("'\n", stderr);
+
+  fprintf (stderr, "  OMP_STACKSIZE = '%lu'\n", stacksize);
+
+  /* GOMP's default value is actually neither active nor passive.  */
+  fprintf (stderr, "  OMP_WAIT_POLICY = '%s'\n",
+	   wait_policy > 0 ? "ACTIVE" : "PASSIVE");
+  fprintf (stderr, "  OMP_THREAD_LIMIT = '%u'\n",
+	   gomp_global_icv.thread_limit_var);
+  fprintf (stderr, "  OMP_MAX_ACTIVE_LEVELS = '%lu'\n",
+	   gomp_max_active_levels_var);
+
+  fprintf (stderr, "  OMP_CANCELLATION = '%s'\n",
+	   gomp_cancel_var ? "TRUE" : "FALSE");
+  fprintf (stderr, "  OMP_DEFAULT_DEVICE = '%d'\n",
+	   gomp_global_icv.default_device_var);
+
+  if (verbose)
+    {
+      fputs ("  GOMP_CPU_AFFINITY = ''\n", stderr);
+      fprintf (stderr, "  GOMP_STACKSIZE = '%lu'\n", stacksize);
+#ifdef HAVE_INTTYPES_H
+      fprintf (stderr, "  GOMP_SPINCOUNT = '%"PRIu64"'\n",
+	       (uint64_t) gomp_spin_count_var);
+#else
+      fprintf (stderr, "  GOMP_SPINCOUNT = '%lu'\n",
+	       (unsigned long) gomp_spin_count_var);
+#endif
+    }
+
+  fputs ("OPENMP DISPLAY ENVIRONMENT END\n", stderr);
+}
+
+
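
For reference (not part of the patch): with a libgomp built from this change, setting OMP_DISPLAY_ENV=true (or verbose) makes any OpenMP program print the ICV block framed by the "OPENMP DISPLAY ENVIRONMENT BEGIN/END" markers above to stderr at startup; the exact values depend on the environment.  A trivial test program:

  #include <omp.h>
  #include <stdio.h>

  int
  main (void)
  {
    /* Run as e.g.  OMP_DISPLAY_ENV=verbose OMP_NUM_THREADS=4 ./a.out  */
    #pragma omp parallel
    {
      #pragma omp master
      printf ("running with %d threads\n", omp_get_num_threads ());
    }
    return 0;
  }
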
 static void __attribute__((constructor))
 initialize_env (void)
 {
-  unsigned long stacksize;
+  unsigned long thread_limit_var, stacksize;
   int wait_policy;
-  bool bind_var = false;
 
   /* Do a compile time check that mkomp_h.pl did good job.  */
   omp_check_defines ();
@@ -578,14 +1172,17 @@ initialize_env (void)
   parse_schedule ();
   parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var);
   parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var);
-  parse_boolean ("OMP_PROC_BIND", &bind_var);
+  parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var);
+  parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true);
   parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var,
 		       true);
-  parse_unsigned_long ("OMP_THREAD_LIMIT", &gomp_thread_limit_var, false);
-  if (gomp_thread_limit_var != ULONG_MAX)
-    gomp_remaining_threads_count = gomp_thread_limit_var - 1;
+  if (parse_unsigned_long ("OMP_THREAD_LIMIT", &thread_limit_var, false))
+    {
+      gomp_global_icv.thread_limit_var
+	= thread_limit_var > INT_MAX ? UINT_MAX : thread_limit_var;
+    }
 #ifndef HAVE_SYNC_BUILTINS
-  gomp_mutex_init (&gomp_remaining_threads_lock);
+  gomp_mutex_init (&gomp_managed_threads_lock);
 #endif
   gomp_init_num_threads ();
   gomp_available_cpus = gomp_global_icv.nthreads_var;
@@ -594,7 +1191,34 @@ initialize_env (void)
 				 &gomp_nthreads_var_list,
 				 &gomp_nthreads_var_list_len))
     gomp_global_icv.nthreads_var = gomp_available_cpus;
-  if (parse_affinity () || bind_var)
+  bool ignore = false;
+  if (parse_bind_var ("OMP_PROC_BIND",
+		      &gomp_global_icv.bind_var,
+		      &gomp_bind_var_list,
+		      &gomp_bind_var_list_len)
+      && gomp_global_icv.bind_var == omp_proc_bind_false)
+    ignore = true;
+  /* Make sure OMP_PLACES and GOMP_CPU_AFFINITY env vars are always
+     parsed if present in the environment.  If OMP_PROC_BIND was set
+     explicitly to false, don't populate places list though.  If places
+     list was successfully set from OMP_PLACES, only parse but don't process
+     GOMP_CPU_AFFINITY.  If OMP_PROC_BIND was not set in the environment,
+     default to OMP_PROC_BIND=true if OMP_PLACES or GOMP_CPU_AFFINITY
+     was successfully parsed into a places list, otherwise to
+     OMP_PROC_BIND=false.  */
+  if (parse_places_var ("OMP_PLACES", ignore))
+    {
+      if (gomp_global_icv.bind_var == omp_proc_bind_false)
+	gomp_global_icv.bind_var = true;
+      ignore = true;
+    }
+  if (parse_affinity (ignore))
+    {
+      if (gomp_global_icv.bind_var == omp_proc_bind_false)
+	gomp_global_icv.bind_var = true;
+      ignore = true;
+    }
+  if (gomp_global_icv.bind_var != omp_proc_bind_false)
     gomp_init_affinity ();
   wait_policy = parse_wait_policy ();
   if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
@@ -645,6 +1269,8 @@ initialize_env (void)
       if (err != 0)
 	gomp_error ("Stack size change failed: %s", strerror (err));
     }
+
+  handle_omp_display_env (stacksize, wait_policy);
 }
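
A small check of the defaulting rule described in the comment above (example only, not part of the patch); it assumes the installed <omp.h> carries the omp_get_proc_bind prototype whose libgomp-internal declaration is added further down.  With e.g. OMP_PLACES="{0},{1}" and OMP_PROC_BIND unset, bind should report true rather than false; with OMP_PROC_BIND=false the places list is parsed but ignored:

  #include <omp.h>
  #include <stdio.h>

  int
  main (void)
  {
    omp_proc_bind_t bind = omp_get_proc_bind ();
    printf ("omp_get_proc_bind () = %d "
            "(0=false, 1=true, 2=master, 3=close, 4=spread)\n", (int) bind);
    return 0;
  }
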
 
 
@@ -728,7 +1354,8 @@ omp_get_max_threads (void)
 int
 omp_get_thread_limit (void)
 {
-  return gomp_thread_limit_var > INT_MAX ? INT_MAX : gomp_thread_limit_var;
+  struct gomp_task_icv *icv = gomp_icv (false);
+  return icv->thread_limit_var > INT_MAX ? INT_MAX : icv->thread_limit_var;
 }
 
 void
@@ -744,6 +1371,60 @@ omp_get_max_active_levels (void)
   return gomp_max_active_levels_var;
 }
 
+int
+omp_get_cancellation (void)
+{
+  return gomp_cancel_var;
+}
+
+omp_proc_bind_t
+omp_get_proc_bind (void)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  return icv->bind_var;
+}
+
+void
+omp_set_default_device (int device_num)
+{
+  struct gomp_task_icv *icv = gomp_icv (true);
+  icv->default_device_var = device_num >= 0 ? device_num : 0;
+}
+
+int
+omp_get_default_device (void)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  return icv->default_device_var;
+}
+
+int
+omp_get_num_devices (void)
+{
+  return gomp_get_num_devices ();
+}
+
+int
+omp_get_num_teams (void)
+{
+  /* Hardcoded to 1 on host, MIC, HSAIL?  Maybe variable on PTX.  */
+  return 1;
+}
+
+int
+omp_get_team_num (void)
+{
+  /* Hardcoded to 0 on host, MIC, HSAIL?  Maybe variable on PTX.  */
+  return 0;
+}
+
+int
+omp_is_initial_device (void)
+{
+  /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX.  */
+  return 1;
+}
+
 ialias (omp_set_dynamic)
 ialias (omp_set_nested)
 ialias (omp_set_num_threads)
@@ -755,3 +1436,11 @@ ialias (omp_get_max_threads)
 ialias (omp_get_thread_limit)
 ialias (omp_set_max_active_levels)
 ialias (omp_get_max_active_levels)
+ialias (omp_get_cancellation)
+ialias (omp_get_proc_bind)
+ialias (omp_set_default_device)
+ialias (omp_get_default_device)
+ialias (omp_get_num_devices)
+ialias (omp_get_num_teams)
+ialias (omp_get_team_num)
+ialias (omp_is_initial_device)
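
The following throwaway program (not part of the patch) exercises the new OpenMP 4.0 query entry points added above; on the host they currently report one team, team number 0 and the initial device, and omp_get_num_devices forwards to the new target.c.  It assumes <omp.h> from the same build declares these prototypes:

  #include <omp.h>
  #include <stdio.h>

  int
  main (void)
  {
    printf ("cancellation:    %d\n", omp_get_cancellation ());
    printf ("default device:  %d\n", omp_get_default_device ());
    printf ("num devices:     %d\n", omp_get_num_devices ());
    printf ("num teams:       %d\n", omp_get_num_teams ());
    printf ("team num:        %d\n", omp_get_team_num ());
    printf ("initial device:  %d\n", omp_is_initial_device ());
    return 0;
  }
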
--- libgomp/libgomp.h	(revision 210461)
+++ libgomp/libgomp.h	(revision 213654)
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2013 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2014 Free Software Foundation, Inc.
    Contributed by Richard Henderson <rth@redhat.com>.
 
    This file is part of the GNU OpenMP Library (libgomp).
@@ -39,6 +39,7 @@
 
 #include <pthread.h>
 #include <stdbool.h>
+#include <stdlib.h>
 
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility push(hidden)
@@ -201,6 +202,10 @@ struct gomp_team_state
   /* Active nesting level.  Only active parallel regions are counted.  */
   unsigned active_level;
 
+  /* Place-partition-var, offset and length into gomp_places_list array.  */
+  unsigned place_partition_off;
+  unsigned place_partition_len;
+
 #ifdef HAVE_SYNC_BUILTINS
   /* Number of single stmts encountered.  */
   unsigned long single_count;
@@ -214,30 +219,40 @@ struct gomp_team_state
   unsigned long static_trip;
 };
 
-/* These are the OpenMP 3.0 Internal Control Variables described in
+struct target_mem_desc;
+
+/* These are the OpenMP 4.0 Internal Control Variables described in
    section 2.3.1.  Those described as having one copy per task are
    stored within the structure; those described as having one copy
    for the whole program are (naturally) global variables.  */
-
+   
 struct gomp_task_icv
 {
   unsigned long nthreads_var;
   enum gomp_schedule_type run_sched_var;
   int run_sched_modifier;
+  int default_device_var;
+  unsigned int thread_limit_var;
   bool dyn_var;
   bool nest_var;
+  char bind_var;
+  /* Internal ICV.  */
+  struct target_mem_desc *target_data;
 };
 
 extern struct gomp_task_icv gomp_global_icv;
-extern unsigned long gomp_thread_limit_var;
-extern unsigned long gomp_remaining_threads_count;
 #ifndef HAVE_SYNC_BUILTINS
-extern gomp_mutex_t gomp_remaining_threads_lock;
+extern gomp_mutex_t gomp_managed_threads_lock;
 #endif
 extern unsigned long gomp_max_active_levels_var;
+extern bool gomp_cancel_var;
 extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
 extern unsigned long gomp_available_cpus, gomp_managed_threads;
 extern unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
+extern char *gomp_bind_var_list;
+extern unsigned long gomp_bind_var_list_len;
+extern void **gomp_places_list;
+extern unsigned long gomp_places_list_len;
 
 enum gomp_task_kind
 {
@@ -247,6 +262,39 @@ enum gomp_task_kind
   GOMP_TASK_TIED
 };
 
+struct gomp_task;
+struct gomp_taskgroup;
+struct htab;
+
+struct gomp_task_depend_entry
+{
+  void *addr;
+  struct gomp_task_depend_entry *next;
+  struct gomp_task_depend_entry *prev;
+  struct gomp_task *task;
+  bool is_in;
+  bool redundant;
+  bool redundant_out;
+};
+
+struct gomp_dependers_vec
+{
+  size_t n_elem;
+  size_t allocated;
+  struct gomp_task *elem[];
+};
+
+/* Used when in GOMP_taskwait or in gomp_task_maybe_wait_for_dependencies.  */
+
+struct gomp_taskwait
+{
+  bool in_taskwait;
+  bool in_depend_wait;
+  size_t n_depend;
+  struct gomp_task *last_parent_depends_on;
+  gomp_sem_t taskwait_sem;
+};
+
 /* This structure describes a "task" to be run by a thread.  */
 
 struct gomp_task
@@ -257,14 +305,33 @@ struct gomp_task
   struct gomp_task *prev_child;
   struct gomp_task *next_queue;
   struct gomp_task *prev_queue;
+  struct gomp_task *next_taskgroup;
+  struct gomp_task *prev_taskgroup;
+  struct gomp_taskgroup *taskgroup;
+  struct gomp_dependers_vec *dependers;
+  struct htab *depend_hash;
+  struct gomp_taskwait *taskwait;
+  size_t depend_count;
+  size_t num_dependees;
   struct gomp_task_icv icv;
   void (*fn) (void *);
   void *fn_data;
   enum gomp_task_kind kind;
-  bool in_taskwait;
   bool in_tied_task;
   bool final_task;
-  gomp_sem_t taskwait_sem;
+  bool copy_ctors_done;
+  bool parent_depends_on;
+  struct gomp_task_depend_entry depend[];
+};
+
+struct gomp_taskgroup
+{
+  struct gomp_taskgroup *prev;
+  struct gomp_task *children;
+  bool in_taskgroup_wait;
+  bool cancelled;
+  gomp_sem_t taskgroup_sem;
+  size_t num_children;
 };
 
 /* This structure describes a "team" of threads.  These are the threads
@@ -293,6 +360,12 @@ struct gomp_team
      of the threads in the team.  */
   gomp_sem_t **ordered_release;
 
+  /* List of work shares on which gomp_fini_work_share hasn't been
+     called yet.  If the team hasn't been cancelled, this should be
+     equal to each thr->ts.work_share, but otherwise it can be a possibly
+     long list of workshares.  */
+  struct gomp_work_share *work_shares_to_free;
+
   /* List of gomp_work_share structs chained through next_free fields.
      This is populated and taken off only by the first thread in the
      team encountering a new work sharing construct, in a critical
@@ -324,8 +397,20 @@ struct gomp_team
 
   gomp_mutex_t task_lock;
   struct gomp_task *task_queue;
-  int task_count;
-  int task_running_count;
+  /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team.  */
+  unsigned int task_count;
+  /* Number of GOMP_TASK_WAITING tasks currently waiting to be scheduled.  */
+  unsigned int task_queued_count;
+  /* Number of GOMP_TASK_{WAITING,TIED} tasks currently running
+     directly in gomp_barrier_handle_tasks; tasks spawned
+     from e.g. GOMP_taskwait or GOMP_taskgroup_end don't count, even when
+     that is called from a task run from gomp_barrier_handle_tasks.
+     task_running_count should be always <= team->nthreads,
+     and if current task isn't in_tied_task, then it will be
+     even < team->nthreads.  */
+  unsigned int task_running_count;
+  int work_share_cancelled;
+  int team_cancelled;
 
   /* This array contains structures for implicit tasks.  */
   struct gomp_task implicit_task[];
@@ -350,7 +435,11 @@ struct gomp_thread
   /* This semaphore is used for ordered loops.  */
   gomp_sem_t release;
 
-  /* user pthread thread pool */
+  /* Place this thread is bound to plus one, or zero if not bound
+     to any place.  */
+  unsigned int place;
+
+  /* User pthread thread pool */
   struct gomp_thread_pool *thread_pool;
 };
 
@@ -363,11 +452,23 @@ struct gomp_thread_pool
   unsigned threads_size;
   unsigned threads_used;
   struct gomp_team *last_team;
+  /* Number of threads running in this contention group.  */
+  unsigned long threads_busy;
 
   /* This barrier holds and releases threads waiting in threads.  */
   gomp_barrier_t threads_dock;
 };
 
+enum gomp_cancel_kind
+{
+  GOMP_CANCEL_PARALLEL = 1,
+  GOMP_CANCEL_LOOP = 2,
+  GOMP_CANCEL_FOR = GOMP_CANCEL_LOOP,
+  GOMP_CANCEL_DO = GOMP_CANCEL_LOOP,
+  GOMP_CANCEL_SECTIONS = 4,
+  GOMP_CANCEL_TASKGROUP = 8
+};
+
 /* ... and here is that TLS data.  */
 
 #ifdef HAVE_TLS
@@ -402,17 +503,22 @@ static inline struct gomp_task_icv *gomp
 /* The attributes to be used during thread creation.  */
 extern pthread_attr_t gomp_thread_attr;
 
-/* Other variables.  */
-
-extern unsigned short *gomp_cpu_affinity;
-extern size_t gomp_cpu_affinity_len;
-
 /* Function prototypes.  */
 
 /* affinity.c */
 
 extern void gomp_init_affinity (void);
-extern void gomp_init_thread_affinity (pthread_attr_t *);
+extern void gomp_init_thread_affinity (pthread_attr_t *, unsigned int);
+extern void **gomp_affinity_alloc (unsigned long, bool);
+extern void gomp_affinity_init_place (void *);
+extern bool gomp_affinity_add_cpus (void *, unsigned long, unsigned long,
+				    long, bool);
+extern bool gomp_affinity_remove_cpu (void *, unsigned long);
+extern bool gomp_affinity_copy_place (void *, void *, long);
+extern bool gomp_affinity_same_place (void *, void *);
+extern bool gomp_affinity_finalize_place_list (bool);
+extern bool gomp_affinity_init_level (int, unsigned long, bool);
+extern void gomp_affinity_print_place (void *);
 
 /* alloc.c */
 
@@ -486,15 +592,21 @@ extern void gomp_barrier_handle_tasks (g
 static void inline
 gomp_finish_task (struct gomp_task *task)
 {
-  gomp_sem_destroy (&task->taskwait_sem);
+  if (__builtin_expect (task->depend_hash != NULL, 0))
+    free (task->depend_hash);
 }
 
 /* team.c */
 
 extern struct gomp_team *gomp_new_team (unsigned);
 extern void gomp_team_start (void (*) (void *), void *, unsigned,
-			     struct gomp_team *);
+			     unsigned, struct gomp_team *);
 extern void gomp_team_end (void);
+extern void gomp_free_thread (void *);
+
+/* target.c */
+
+extern int gomp_get_num_devices (void);
 
 /* work.c */
 
@@ -502,6 +614,7 @@ extern void gomp_init_work_share (struct
 extern void gomp_fini_work_share (struct gomp_work_share *);
 extern bool gomp_work_share_start (bool);
 extern void gomp_work_share_end (void);
+extern bool gomp_work_share_end_cancel (void);
 extern void gomp_work_share_end_nowait (void);
 
 static inline void
@@ -524,6 +637,26 @@ gomp_work_share_init_done (void)
 #define _LIBGOMP_OMP_LOCK_DEFINED 1
 #include "omp.h.in"
 
+typedef enum omp_proc_bind_t
+{
+  omp_proc_bind_false = 0,
+  omp_proc_bind_true = 1,
+  omp_proc_bind_master = 2,
+  omp_proc_bind_close = 3,
+  omp_proc_bind_spread = 4
+} omp_proc_bind_t;
+
+extern int omp_get_cancellation (void) __GOMP_NOTHROW;
+extern omp_proc_bind_t omp_get_proc_bind (void) __GOMP_NOTHROW;
+
+extern void omp_set_default_device (int) __GOMP_NOTHROW;
+extern int omp_get_default_device (void) __GOMP_NOTHROW;
+extern int omp_get_num_devices (void) __GOMP_NOTHROW;
+extern int omp_get_num_teams (void) __GOMP_NOTHROW;
+extern int omp_get_team_num (void) __GOMP_NOTHROW;
+
+extern int omp_is_initial_device (void) __GOMP_NOTHROW;
+
 #if !defined (HAVE_ATTRIBUTE_VISIBILITY) \
     || !defined (HAVE_ATTRIBUTE_ALIAS) \
     || !defined (HAVE_AS_SYMVER_DIRECTIVE) \
@@ -580,11 +713,19 @@ extern int gomp_test_nest_lock_25 (omp_n
 #endif
 
 #ifdef HAVE_ATTRIBUTE_ALIAS
+# define ialias_ulp	ialias_str1(__USER_LABEL_PREFIX__)
+# define ialias_str1(x)	ialias_str2(x)
+# define ialias_str2(x)	#x
 # define ialias(fn) \
   extern __typeof (fn) gomp_ialias_##fn \
     __attribute__ ((alias (#fn))) attribute_hidden;
+# define ialias_redirect(fn) \
+  extern __typeof (fn) fn __asm__ (ialias_ulp "gomp_ialias_" #fn) attribute_hidden;
+# define ialias_call(fn) gomp_ialias_ ## fn
 #else
 # define ialias(fn)
+# define ialias_redirect(fn)
+# define ialias_call(fn) fn
 #endif
 
 #endif /* LIBGOMP_H */
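
Note that struct gomp_task now ends in a flexible array member, depend[], and GOMP_task (below) allocates the task, its depend entries and an aligned copy of the argument block with a single gomp_malloc call.  A minimal sketch of that layout, using made-up "my_*" stand-ins rather than the libgomp types (not part of the patch):

  /* Illustration only: one allocation holding
     [struct][depend entries][padded argument copy].  */
  #include <stdint.h>
  #include <stdlib.h>
  #include <string.h>

  struct my_dep { void *addr; int is_in; };

  struct my_task
  {
    void *fn_data;              /* points into the same allocation */
    size_t depend_count;
    struct my_dep depend[];     /* flexible array member */
  };

  static struct my_task *
  alloc_task (size_t ndepend, const void *arg, size_t arg_size,
              size_t arg_align)
  {
    size_t depend_size = ndepend * sizeof (struct my_dep);
    struct my_task *task
      = malloc (sizeof (*task) + depend_size + arg_size + arg_align - 1);
    if (task == NULL)
      return NULL;
    /* Round up past the depend entries to the required argument alignment,
       mirroring the pointer arithmetic in GOMP_task.  */
    char *arg_copy = (char *) (((uintptr_t) (task + 1) + depend_size
                                + arg_align - 1)
                               & ~(uintptr_t) (arg_align - 1));
    memcpy (arg_copy, arg, arg_size);
    task->fn_data = arg_copy;
    task->depend_count = ndepend;
    return task;
  }

  int
  main (void)
  {
    int payload = 42;
    struct my_task *t = alloc_task (2, &payload, sizeof payload,
                                    __alignof__ (int));
    free (t);
    return 0;
  }
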
--- libgomp/task.c	(revision 210461)
+++ libgomp/task.c	(revision 213654)
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2013 Free Software Foundation, Inc.
+/* Copyright (C) 2007-2014 Free Software Foundation, Inc.
    Contributed by Richard Henderson <rth@redhat.com>.
 
    This file is part of the GNU OpenMP Library (libgomp).
@@ -29,6 +29,33 @@
 #include <stdlib.h>
 #include <string.h>
 
+typedef struct gomp_task_depend_entry *hash_entry_type;
+
+static inline void *
+htab_alloc (size_t size)
+{
+  return gomp_malloc (size);
+}
+
+static inline void
+htab_free (void *ptr)
+{
+  free (ptr);
+}
+
+#include "hashtab.h"
+
+static inline hashval_t
+htab_hash (hash_entry_type element)
+{
+  return hash_pointer (element->addr);
+}
+
+static inline bool
+htab_eq (hash_entry_type x, hash_entry_type y)
+{
+  return x->addr == y->addr;
+}
 
 /* Create a new task data structure.  */
 
@@ -39,11 +66,16 @@ gomp_init_task (struct gomp_task *task,
   task->parent = parent_task;
   task->icv = *prev_icv;
   task->kind = GOMP_TASK_IMPLICIT;
-  task->in_taskwait = false;
+  task->taskwait = NULL;
   task->in_tied_task = false;
   task->final_task = false;
+  task->copy_ctors_done = false;
+  task->parent_depends_on = false;
   task->children = NULL;
-  gomp_sem_init (&task->taskwait_sem, 0);
+  task->taskgroup = NULL;
+  task->dependers = NULL;
+  task->depend_hash = NULL;
+  task->depend_count = 0;
 }
 
 /* Clean up a task, after completing it.  */
@@ -72,13 +104,16 @@ gomp_clear_parent (struct gomp_task *chi
     while (task != children);
 }
 
+static void gomp_task_maybe_wait_for_dependencies (void **depend);
+
 /* Called when encountering an explicit task directive.  If IF_CLAUSE is
    false, then we must not delay in executing the task.  If UNTIED is true,
    then the task may be executed by any member of the team.  */
 
 void
 GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
-	   long arg_size, long arg_align, bool if_clause, unsigned flags)
+	   long arg_size, long arg_align, bool if_clause, unsigned flags,
+	   void **depend)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -94,17 +129,35 @@ GOMP_task (void (*fn) (void *), void *da
     flags &= ~1;
 #endif
 
+  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
+  if (team
+      && (gomp_team_barrier_cancelled (&team->barrier)
+	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
+    return;
+
   if (!if_clause || team == NULL
       || (thr->task && thr->task->final_task)
       || team->task_count > 64 * team->nthreads)
     {
       struct gomp_task task;
 
+      /* If there are depend clauses and earlier deferred sibling tasks
+	 with depend clauses, check if there isn't a dependency.  If there
+	 is, we need to wait for them.  There is no need to handle
+	 depend clauses for non-deferred tasks other than this, because
+	 the parent task is suspended until the child task finishes and thus
+	 it can't start further child tasks.  */
+      if ((flags & 8) && thr->task && thr->task->depend_hash)
+	gomp_task_maybe_wait_for_dependencies (depend);
+
       gomp_init_task (&task, thr->task, gomp_icv (false));
       task.kind = GOMP_TASK_IFFALSE;
       task.final_task = (thr->task && thr->task->final_task) || (flags & 2);
       if (thr->task)
-	task.in_tied_task = thr->task->in_tied_task;
+	{
+	  task.in_tied_task = thr->task->in_tied_task;
+	  task.taskgroup = thr->task->taskgroup;
+	}
       thr->task = &task;
       if (__builtin_expect (cpyfn != NULL, 0))
 	{
@@ -137,27 +190,174 @@ GOMP_task (void (*fn) (void *), void *da
     {
       struct gomp_task *task;
       struct gomp_task *parent = thr->task;
+      struct gomp_taskgroup *taskgroup = parent->taskgroup;
       char *arg;
       bool do_wake;
+      size_t depend_size = 0;
 
-      task = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1);
-      arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1)
+      if (flags & 8)
+	depend_size = ((uintptr_t) depend[0]
+		       * sizeof (struct gomp_task_depend_entry));
+      task = gomp_malloc (sizeof (*task) + depend_size
+			  + arg_size + arg_align - 1);
+      arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
 		      & ~(uintptr_t) (arg_align - 1));
       gomp_init_task (task, parent, gomp_icv (false));
       task->kind = GOMP_TASK_IFFALSE;
       task->in_tied_task = parent->in_tied_task;
+      task->taskgroup = taskgroup;
       thr->task = task;
       if (cpyfn)
-	cpyfn (arg, data);
+	{
+	  cpyfn (arg, data);
+	  task->copy_ctors_done = true;
+	}
       else
 	memcpy (arg, data, arg_size);
       thr->task = parent;
       task->kind = GOMP_TASK_WAITING;
       task->fn = fn;
       task->fn_data = arg;
-      task->in_tied_task = true;
       task->final_task = (flags & 2) >> 1;
       gomp_mutex_lock (&team->task_lock);
+      /* If parallel or taskgroup has been cancelled, don't start new
+	 tasks.  */
+      if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
+			     || (taskgroup && taskgroup->cancelled))
+			    && !task->copy_ctors_done, 0))
+	{
+	  gomp_mutex_unlock (&team->task_lock);
+	  gomp_finish_task (task);
+	  free (task);
+	  return;
+	}
+      if (taskgroup)
+	taskgroup->num_children++;
+      if (depend_size)
+	{
+	  size_t ndepend = (uintptr_t) depend[0];
+	  size_t nout = (uintptr_t) depend[1];
+	  size_t i;
+	  hash_entry_type ent;
+
+	  task->depend_count = ndepend;
+	  task->num_dependees = 0;
+	  if (parent->depend_hash == NULL)
+	    parent->depend_hash
+	      = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
+	  for (i = 0; i < ndepend; i++)
+	    {
+	      task->depend[i].addr = depend[2 + i];
+	      task->depend[i].next = NULL;
+	      task->depend[i].prev = NULL;
+	      task->depend[i].task = task;
+	      task->depend[i].is_in = i >= nout;
+	      task->depend[i].redundant = false;
+	      task->depend[i].redundant_out = false;
+
+	      hash_entry_type *slot
+		= htab_find_slot (&parent->depend_hash, &task->depend[i],
+				  INSERT);
+	      hash_entry_type out = NULL, last = NULL;
+	      if (*slot)
+		{
+		  /* If multiple depends on the same task are the
+		     same, all but the first one are redundant.
+		     As inout/out come first, if any of them is
+		     inout/out, it will win, which is the right
+		     semantics.  */
+		  if ((*slot)->task == task)
+		    {
+		      task->depend[i].redundant = true;
+		      continue;
+		    }
+		  for (ent = *slot; ent; ent = ent->next)
+		    {
+		      if (ent->redundant_out)
+			break;
+
+		      last = ent;
+
+		      /* depend(in:...) doesn't depend on earlier
+			 depend(in:...).  */
+		      if (i >= nout && ent->is_in)
+			continue;
+
+		      if (!ent->is_in)
+			out = ent;
+
+		      struct gomp_task *tsk = ent->task;
+		      if (tsk->dependers == NULL)
+			{
+			  tsk->dependers
+			    = gomp_malloc (sizeof (struct gomp_dependers_vec)
+					   + 6 * sizeof (struct gomp_task *));
+			  tsk->dependers->n_elem = 1;
+			  tsk->dependers->allocated = 6;
+			  tsk->dependers->elem[0] = task;
+			  task->num_dependees++;
+			  continue;
+			}
+		      /* We already have some other dependency on tsk
+			 from earlier depend clause.  */
+		      else if (tsk->dependers->n_elem
+			       && (tsk->dependers->elem[tsk->dependers->n_elem
+							- 1]
+				   == task))
+			continue;
+		      else if (tsk->dependers->n_elem
+			       == tsk->dependers->allocated)
+			{
+			  tsk->dependers->allocated
+			    = tsk->dependers->allocated * 2 + 2;
+			  tsk->dependers
+			    = gomp_realloc (tsk->dependers,
+					    sizeof (struct gomp_dependers_vec)
+					    + (tsk->dependers->allocated
+					       * sizeof (struct gomp_task *)));
+			}
+		      tsk->dependers->elem[tsk->dependers->n_elem++] = task;
+		      task->num_dependees++;
+		    }
+		  task->depend[i].next = *slot;
+		  (*slot)->prev = &task->depend[i];
+		}
+	      *slot = &task->depend[i];
+
+	      /* There is no need to store more than one depend({,in}out:)
+		 task per address in the hash table chain for the purpose
+		 of creation of deferred tasks, because each out
+		 depends on all earlier outs, thus it is enough to record
+		 just the last depend({,in}out:).  For depend(in:), we need
+		 to keep all of the previous ones not terminated yet, because
+		 a later depend({,in}out:) might need to depend on all of
+		 them.  So, if the new task's clause is depend({,in}out:),
+		 we know there is at most one other depend({,in}out:) clause
+		 in the list (out).  For non-deferred tasks we want to see
+		 all outs, so they are moved to the end of the chain,
+		 after first redundant_out entry all following entries
+		 should be redundant_out.  */
+	      if (!task->depend[i].is_in && out)
+		{
+		  if (out != last)
+		    {
+		      out->next->prev = out->prev;
+		      out->prev->next = out->next;
+		      out->next = last->next;
+		      out->prev = last;
+		      last->next = out;
+		      if (out->next)
+			out->next->prev = out;
+		    }
+		  out->redundant_out = true;
+		}
+	    }
+	  if (task->num_dependees)
+	    {
+	      gomp_mutex_unlock (&team->task_lock);
+	      return;
+	    }
+	}
       if (parent->children)
 	{
 	  task->next_child = parent->children;
@@ -171,6 +371,22 @@ GOMP_task (void (*fn) (void *), void *da
 	  task->prev_child = task;
 	}
       parent->children = task;
+      if (taskgroup)
+	{
+	  if (taskgroup->children)
+	    {
+	      task->next_taskgroup = taskgroup->children;
+	      task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+	      task->next_taskgroup->prev_taskgroup = task;
+	      task->prev_taskgroup->next_taskgroup = task;
+	    }
+	  else
+	    {
+	      task->next_taskgroup = task;
+	      task->prev_taskgroup = task;
+	    }
+	  taskgroup->children = task;
+	}
       if (team->task_queue)
 	{
 	  task->next_queue = team->task_queue;
@@ -185,6 +401,7 @@ GOMP_task (void (*fn) (void *), void *da
 	  team->task_queue = task;
 	}
       ++team->task_count;
+      ++team->task_queued_count;
       gomp_team_barrier_set_task_pending (&team->barrier);
       do_wake = team->task_running_count + !parent->in_tied_task
 		< team->nthreads;
@@ -194,6 +411,265 @@ GOMP_task (void (*fn) (void *), void *da
     }
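
To make the new depend argument concrete (example only, not part of the patch): as the code above reads it, depend[0] is the total number of depend clauses, depend[1] the number of out/inout clauses, and depend[2 + i] the i-th address, with out/inout entries first.  The compiler builds that vector for task directives such as these:

  #include <stdio.h>

  int
  main (void)
  {
    int x = 0, y = 1;
    #pragma omp parallel
    #pragma omp single
    {
      #pragma omp task depend(out: x)
      x = y + 1;
      #pragma omp task depend(in: x) depend(out: y)
      y = x + 1;            /* runs only after the first task finishes */
      #pragma omp taskwait
      printf ("x=%d y=%d\n", x, y);   /* prints x=2 y=3 */
    }
    return 0;
  }
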
 }
 
+static inline bool
+gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
+		   struct gomp_taskgroup *taskgroup, struct gomp_team *team)
+{
+  if (parent)
+    {
+      if (parent->children == child_task)
+	parent->children = child_task->next_child;
+      if (__builtin_expect (child_task->parent_depends_on, 0)
+	  && parent->taskwait->last_parent_depends_on == child_task)
+	{
+	  if (child_task->prev_child->kind == GOMP_TASK_WAITING
+	      && child_task->prev_child->parent_depends_on)
+	    parent->taskwait->last_parent_depends_on = child_task->prev_child;
+	  else
+	    parent->taskwait->last_parent_depends_on = NULL;
+	}
+    }
+  if (taskgroup && taskgroup->children == child_task)
+    taskgroup->children = child_task->next_taskgroup;
+  child_task->prev_queue->next_queue = child_task->next_queue;
+  child_task->next_queue->prev_queue = child_task->prev_queue;
+  if (team->task_queue == child_task)
+    {
+      if (child_task->next_queue != child_task)
+	team->task_queue = child_task->next_queue;
+      else
+	team->task_queue = NULL;
+    }
+  child_task->kind = GOMP_TASK_TIED;
+  if (--team->task_queued_count == 0)
+    gomp_team_barrier_clear_task_pending (&team->barrier);
+  if ((gomp_team_barrier_cancelled (&team->barrier)
+       || (taskgroup && taskgroup->cancelled))
+      && !child_task->copy_ctors_done)
+    return true;
+  return false;
+}
+
+static void
+gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task)
+{
+  struct gomp_task *parent = child_task->parent;
+  size_t i;
+
+  for (i = 0; i < child_task->depend_count; i++)
+    if (!child_task->depend[i].redundant)
+      {
+	if (child_task->depend[i].next)
+	  child_task->depend[i].next->prev = child_task->depend[i].prev;
+	if (child_task->depend[i].prev)
+	  child_task->depend[i].prev->next = child_task->depend[i].next;
+	else
+	  {
+	    hash_entry_type *slot
+	      = htab_find_slot (&parent->depend_hash, &child_task->depend[i],
+				NO_INSERT);
+	    if (*slot != &child_task->depend[i])
+	      abort ();
+	    if (child_task->depend[i].next)
+	      *slot = child_task->depend[i].next;
+	    else
+	      htab_clear_slot (parent->depend_hash, slot);
+	  }
+      }
+}
+
+static size_t
+gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
+				     struct gomp_team *team)
+{
+  struct gomp_task *parent = child_task->parent;
+  size_t i, count = child_task->dependers->n_elem, ret = 0;
+  for (i = 0; i < count; i++)
+    {
+      struct gomp_task *task = child_task->dependers->elem[i];
+      if (--task->num_dependees != 0)
+	continue;
+
+      struct gomp_taskgroup *taskgroup = task->taskgroup;
+      if (parent)
+	{
+	  if (parent->children)
+	    {
+	      /* If parent is in gomp_task_maybe_wait_for_dependencies
+		 and it doesn't need to wait for this task, put it after
+		 all ready to run tasks it needs to wait for.  */
+	      if (parent->taskwait && parent->taskwait->last_parent_depends_on
+		  && !task->parent_depends_on)
+		{
+		  struct gomp_task *last_parent_depends_on
+		    = parent->taskwait->last_parent_depends_on;
+		  task->next_child = last_parent_depends_on->next_child;
+		  task->prev_child = last_parent_depends_on;
+		}
+	      else
+		{
+		  task->next_child = parent->children;
+		  task->prev_child = parent->children->prev_child;
+		  parent->children = task;
+		}
+	      task->next_child->prev_child = task;
+	      task->prev_child->next_child = task;
+	    }
+	  else
+	    {
+	      task->next_child = task;
+	      task->prev_child = task;
+	      parent->children = task;
+	    }
+	  if (parent->taskwait)
+	    {
+	      if (parent->taskwait->in_taskwait)
+		{
+		  parent->taskwait->in_taskwait = false;
+		  gomp_sem_post (&parent->taskwait->taskwait_sem);
+		}
+	      else if (parent->taskwait->in_depend_wait)
+		{
+		  parent->taskwait->in_depend_wait = false;
+		  gomp_sem_post (&parent->taskwait->taskwait_sem);
+		}
+	      if (parent->taskwait->last_parent_depends_on == NULL
+		  && task->parent_depends_on)
+		parent->taskwait->last_parent_depends_on = task;
+	    }
+	}
+      if (taskgroup)
+	{
+	  if (taskgroup->children)
+	    {
+	      task->next_taskgroup = taskgroup->children;
+	      task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+	      task->next_taskgroup->prev_taskgroup = task;
+	      task->prev_taskgroup->next_taskgroup = task;
+	    }
+	  else
+	    {
+	      task->next_taskgroup = task;
+	      task->prev_taskgroup = task;
+	    }
+	  taskgroup->children = task;
+	  if (taskgroup->in_taskgroup_wait)
+	    {
+	      taskgroup->in_taskgroup_wait = false;
+	      gomp_sem_post (&taskgroup->taskgroup_sem);
+	    }
+	}
+      if (team->task_queue)
+	{
+	  task->next_queue = team->task_queue;
+	  task->prev_queue = team->task_queue->prev_queue;
+	  task->next_queue->prev_queue = task;
+	  task->prev_queue->next_queue = task;
+	}
+      else
+	{
+	  task->next_queue = task;
+	  task->prev_queue = task;
+	  team->task_queue = task;
+	}
+      ++team->task_count;
+      ++team->task_queued_count;
+      ++ret;
+    }
+  free (child_task->dependers);
+  child_task->dependers = NULL;
+  if (ret > 1)
+    gomp_team_barrier_set_task_pending (&team->barrier);
+  return ret;
+}
+
+static inline size_t
+gomp_task_run_post_handle_depend (struct gomp_task *child_task,
+				  struct gomp_team *team)
+{
+  if (child_task->depend_count == 0)
+    return 0;
+
+  /* If parent is gone already, the hash table is freed and nothing
+     will use the hash table anymore, no need to remove anything from it.  */
+  if (child_task->parent != NULL)
+    gomp_task_run_post_handle_depend_hash (child_task);
+
+  if (child_task->dependers == NULL)
+    return 0;
+
+  return gomp_task_run_post_handle_dependers (child_task, team);
+}
+
+static inline void
+gomp_task_run_post_remove_parent (struct gomp_task *child_task)
+{
+  struct gomp_task *parent = child_task->parent;
+  if (parent == NULL)
+    return;
+  if (__builtin_expect (child_task->parent_depends_on, 0)
+      && --parent->taskwait->n_depend == 0
+      && parent->taskwait->in_depend_wait)
+    {
+      parent->taskwait->in_depend_wait = false;
+      gomp_sem_post (&parent->taskwait->taskwait_sem);
+    }
+  child_task->prev_child->next_child = child_task->next_child;
+  child_task->next_child->prev_child = child_task->prev_child;
+  if (parent->children != child_task)
+    return;
+  if (child_task->next_child != child_task)
+    parent->children = child_task->next_child;
+  else
+    {
+      /* We access task->children in GOMP_taskwait
+	 outside of the task lock mutex region, so
+	 need a release barrier here to ensure memory
+	 written by child_task->fn above is flushed
+	 before the NULL is written.  */
+      __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE);
+      if (parent->taskwait && parent->taskwait->in_taskwait)
+	{
+	  parent->taskwait->in_taskwait = false;
+	  gomp_sem_post (&parent->taskwait->taskwait_sem);
+	}
+    }
+}
+
+static inline void
+gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task)
+{
+  struct gomp_taskgroup *taskgroup = child_task->taskgroup;
+  if (taskgroup == NULL)
+    return;
+  child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup;
+  child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup;
+  if (taskgroup->num_children > 1)
+    --taskgroup->num_children;
+  else
+    {
+      /* We access taskgroup->num_children in GOMP_taskgroup_end
+	 outside of the task lock mutex region, so
+	 need a release barrier here to ensure memory
+	 written by child_task->fn above is flushed
+	 before the NULL is written.  */
+      __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
+    }
+  if (taskgroup->children != child_task)
+    return;
+  if (child_task->next_taskgroup != child_task)
+    taskgroup->children = child_task->next_taskgroup;
+  else
+    {
+      taskgroup->children = NULL;
+      if (taskgroup->in_taskgroup_wait)
+	{
+	  taskgroup->in_taskgroup_wait = false;
+	  gomp_sem_post (&taskgroup->taskgroup_sem);
+	}
+    }
+}
+
 void
 gomp_barrier_handle_tasks (gomp_barrier_state_t state)
 {
@@ -202,6 +678,7 @@ gomp_barrier_handle_tasks (gomp_barrier_
   struct gomp_task *task = thr->task;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   gomp_mutex_lock (&team->task_lock);
   if (gomp_barrier_last_thread (state))
@@ -218,26 +695,31 @@ gomp_barrier_handle_tasks (gomp_barrier_
 
   while (1)
     {
+      bool cancelled = false;
       if (team->task_queue != NULL)
 	{
-	  struct gomp_task *parent;
-
 	  child_task = team->task_queue;
-	  parent = child_task->parent;
-	  if (parent && parent->children == child_task)
-	    parent->children = child_task->next_child;
-	  child_task->prev_queue->next_queue = child_task->next_queue;
-	  child_task->next_queue->prev_queue = child_task->prev_queue;
-	  if (child_task->next_queue != child_task)
-	    team->task_queue = child_task->next_queue;
-	  else
-	    team->task_queue = NULL;
-	  child_task->kind = GOMP_TASK_TIED;
+	  cancelled = gomp_task_run_pre (child_task, child_task->parent,
+					 child_task->taskgroup, team);
+	  if (__builtin_expect (cancelled, 0))
+	    {
+	      if (to_free)
+		{
+		  gomp_finish_task (to_free);
+		  free (to_free);
+		  to_free = NULL;
+		}
+	      goto finish_cancelled;
+	    }
 	  team->task_running_count++;
-	  if (team->task_count == team->task_running_count)
-	    gomp_team_barrier_clear_task_pending (&team->barrier);
+	  child_task->in_tied_task = true;
 	}
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
 	{
 	  gomp_finish_task (to_free);
@@ -255,33 +737,22 @@ gomp_barrier_handle_tasks (gomp_barrier_
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
 	{
-	  struct gomp_task *parent = child_task->parent;
-	  if (parent)
-	    {
-	      child_task->prev_child->next_child = child_task->next_child;
-	      child_task->next_child->prev_child = child_task->prev_child;
-	      if (parent->children == child_task)
-		{
-		  if (child_task->next_child != child_task)
-		    parent->children = child_task->next_child;
-		  else
-		    {
-		      /* We access task->children in GOMP_taskwait
-			 outside of the task lock mutex region, so
-			 need a release barrier here to ensure memory
-			 written by child_task->fn above is flushed
-			 before the NULL is written.  */
-		      __atomic_store_n (&parent->children, NULL,
-					MEMMODEL_RELEASE);
-		      if (parent->in_taskwait)
-			gomp_sem_post (&parent->taskwait_sem);
-		    }
-		}
-	    }
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
+	  gomp_task_run_post_remove_parent (child_task);
 	  gomp_clear_parent (child_task->children);
+	  gomp_task_run_post_remove_taskgroup (child_task);
 	  to_free = child_task;
 	  child_task = NULL;
-	  team->task_running_count--;
+	  if (!cancelled)
+	    team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
 	  if (--team->task_count == 0
 	      && gomp_team_barrier_waiting_for_tasks (&team->barrier))
 	    {
@@ -304,9 +775,11 @@ GOMP_taskwait (void)
   struct gomp_task *task = thr->task;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  struct gomp_taskwait taskwait;
+  int do_wake = 0;
 
   /* The acquire barrier on load of task->children here synchronizes
-     with the write of a NULL in gomp_barrier_handle_tasks.  It is
+     with the write of a NULL in gomp_task_run_post_remove_parent.  It is
      not necessary that we synchronize with other non-NULL writes at
      this point, but we must ensure that all writes to memory by a
      child thread task work function are seen before we exit from
@@ -315,42 +788,60 @@ GOMP_taskwait (void)
       || __atomic_load_n (&task->children, MEMMODEL_ACQUIRE) == NULL)
     return;
 
+  memset (&taskwait, 0, sizeof (taskwait));
   gomp_mutex_lock (&team->task_lock);
   while (1)
     {
+      bool cancelled = false;
       if (task->children == NULL)
 	{
+	  bool destroy_taskwait = task->taskwait != NULL;
+	  task->taskwait = NULL;
 	  gomp_mutex_unlock (&team->task_lock);
 	  if (to_free)
 	    {
 	      gomp_finish_task (to_free);
 	      free (to_free);
 	    }
+	  if (destroy_taskwait)
+	    gomp_sem_destroy (&taskwait.taskwait_sem);
 	  return;
 	}
       if (task->children->kind == GOMP_TASK_WAITING)
 	{
 	  child_task = task->children;
-	  task->children = child_task->next_child;
-	  child_task->prev_queue->next_queue = child_task->next_queue;
-	  child_task->next_queue->prev_queue = child_task->prev_queue;
-	  if (team->task_queue == child_task)
+	  cancelled
+	    = gomp_task_run_pre (child_task, task, child_task->taskgroup,
+				 team);
+	  if (__builtin_expect (cancelled, 0))
 	    {
-	      if (child_task->next_queue != child_task)
-		team->task_queue = child_task->next_queue;
-	      else
-		team->task_queue = NULL;
+	      if (to_free)
+		{
+		  gomp_finish_task (to_free);
+		  free (to_free);
+		  to_free = NULL;
+		}
+	      goto finish_cancelled;
 	    }
-	  child_task->kind = GOMP_TASK_TIED;
-	  team->task_running_count++;
-	  if (team->task_count == team->task_running_count)
-	    gomp_team_barrier_clear_task_pending (&team->barrier);
 	}
       else
-	/* All tasks we are waiting for are already running
-	   in other threads.  Wait for them.  */
-	task->in_taskwait = true;
+	{
+	  /* All tasks we are waiting for are already running
+	     in other threads.  Wait for them.  */
+	  if (task->taskwait == NULL)
+	    {
+	      taskwait.in_depend_wait = false;
+	      gomp_sem_init (&taskwait.taskwait_sem, 0);
+	      task->taskwait = &taskwait;
+	    }
+	  taskwait.in_taskwait = true;
+	}
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
 	{
 	  gomp_finish_task (to_free);
@@ -364,14 +855,178 @@ GOMP_taskwait (void)
 	  thr->task = task;
 	}
       else
+	gomp_sem_wait (&taskwait.taskwait_sem);
+      gomp_mutex_lock (&team->task_lock);
+      if (child_task)
 	{
-	  gomp_sem_wait (&task->taskwait_sem);
-	  task->in_taskwait = false;
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
+	  child_task->prev_child->next_child = child_task->next_child;
+	  child_task->next_child->prev_child = child_task->prev_child;
+	  if (task->children == child_task)
+	    {
+	      if (child_task->next_child != child_task)
+		task->children = child_task->next_child;
+	      else
+		task->children = NULL;
+	    }
+	  gomp_clear_parent (child_task->children);
+	  gomp_task_run_post_remove_taskgroup (child_task);
+	  to_free = child_task;
+	  child_task = NULL;
+	  team->task_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count
+			- !task->in_tied_task;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
+	}
+    }
+}
+
+/* This is like GOMP_taskwait, but we only wait for tasks that the
+   upcoming task depends on.  */
+
+static void
+gomp_task_maybe_wait_for_dependencies (void **depend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_task *task = thr->task;
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_task_depend_entry elem, *ent = NULL;
+  struct gomp_taskwait taskwait;
+  struct gomp_task *last_parent_depends_on = NULL;
+  size_t ndepend = (uintptr_t) depend[0];
+  size_t nout = (uintptr_t) depend[1];
+  size_t i;
+  size_t num_awaited = 0;
+  struct gomp_task *child_task = NULL;
+  struct gomp_task *to_free = NULL;
+  int do_wake = 0;
+
+  gomp_mutex_lock (&team->task_lock);
+  for (i = 0; i < ndepend; i++)
+    {
+      elem.addr = depend[i + 2];
+      ent = htab_find (task->depend_hash, &elem);
+      for (; ent; ent = ent->next)
+	if (i >= nout && ent->is_in)
+	  continue;
+	else
+	  {
+	    struct gomp_task *tsk = ent->task;
+	    if (!tsk->parent_depends_on)
+	      {
+		tsk->parent_depends_on = true;
+		++num_awaited;
+		if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING)
+		  {
+		    /* If a task we need to wait for is not already
+		       running and is ready to be scheduled, move it
+		       to front, so that we run it as soon as possible.  */
+		    if (last_parent_depends_on)
+		      {
+			tsk->prev_child->next_child = tsk->next_child;
+			tsk->next_child->prev_child = tsk->prev_child;
+			tsk->prev_child = last_parent_depends_on;
+			tsk->next_child = last_parent_depends_on->next_child;
+			tsk->prev_child->next_child = tsk;
+			tsk->next_child->prev_child = tsk;
+		      }
+		    else if (tsk != task->children)
+		      {
+			tsk->prev_child->next_child = tsk->next_child;
+			tsk->next_child->prev_child = tsk->prev_child;
+			tsk->prev_child = task->children;
+			tsk->next_child = task->children->next_child;
+			task->children = tsk;
+			tsk->prev_child->next_child = tsk;
+			tsk->next_child->prev_child = tsk;
+		      }
+		    last_parent_depends_on = tsk;
+		  }
+	      }
+	  }
+    }
+  if (num_awaited == 0)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      return;
+    }
+
+  memset (&taskwait, 0, sizeof (taskwait));
+  taskwait.n_depend = num_awaited;
+  taskwait.last_parent_depends_on = last_parent_depends_on;
+  gomp_sem_init (&taskwait.taskwait_sem, 0);
+  task->taskwait = &taskwait;
+
+  while (1)
+    {
+      bool cancelled = false;
+      if (taskwait.n_depend == 0)
+	{
+	  task->taskwait = NULL;
+	  gomp_mutex_unlock (&team->task_lock);
+	  if (to_free)
+	    {
+	      gomp_finish_task (to_free);
+	      free (to_free);
+	    }
+	  gomp_sem_destroy (&taskwait.taskwait_sem);
 	  return;
 	}
+      if (task->children->kind == GOMP_TASK_WAITING)
+	{
+	  child_task = task->children;
+	  cancelled
+	    = gomp_task_run_pre (child_task, task, child_task->taskgroup,
+				 team);
+	  if (__builtin_expect (cancelled, 0))
+	    {
+	      if (to_free)
+		{
+		  gomp_finish_task (to_free);
+		  free (to_free);
+		  to_free = NULL;
+		}
+	      goto finish_cancelled;
+	    }
+	}
+      else
+	/* All tasks we are waiting for are already running
+	   in other threads.  Wait for them.  */
+	taskwait.in_depend_wait = true;
+      gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
+      if (to_free)
+	{
+	  gomp_finish_task (to_free);
+	  free (to_free);
+	  to_free = NULL;
+	}
+      if (child_task)
+	{
+	  thr->task = child_task;
+	  child_task->fn (child_task->fn_data);
+	  thr->task = task;
+	}
+      else
+	gomp_sem_wait (&taskwait.taskwait_sem);
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
 	{
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
+	  if (child_task->parent_depends_on)
+	    --taskwait.n_depend;
 	  child_task->prev_child->next_child = child_task->next_child;
 	  child_task->next_child->prev_child = child_task->prev_child;
 	  if (task->children == child_task)
@@ -382,10 +1037,17 @@ GOMP_taskwait (void)
 		task->children = NULL;
 	    }
 	  gomp_clear_parent (child_task->children);
+	  gomp_task_run_post_remove_taskgroup (child_task);
 	  to_free = child_task;
 	  child_task = NULL;
 	  team->task_count--;
-	  team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count
+			- !task->in_tied_task;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
 	}
     }
 }
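
Example only, not part of the patch: this is the case the new function handles.  An undeferred task (if(0)) with a depend clause is no longer queued; the encountering thread first waits for, or itself runs, the earlier sibling tasks it depends on, and then executes the task body in place:

  #include <stdio.h>

  int
  main (void)
  {
    int x = 0;
    #pragma omp parallel
    #pragma omp single
    {
      #pragma omp task depend(out: x)
      x = 42;
      #pragma omp task if(0) depend(in: x)
      printf ("x=%d\n", x);   /* prints 42 */
    }
    return 0;
  }
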
@@ -398,6 +1060,151 @@ GOMP_taskyield (void)
   /* Nothing at the moment.  */
 }
 
+void
+GOMP_taskgroup_start (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_task *task = thr->task;
+  struct gomp_taskgroup *taskgroup;
+
+  /* If team is NULL, all tasks are executed as
+     GOMP_TASK_IFFALSE tasks and thus all children tasks of
+     taskgroup and their descendant tasks will be finished
+     by the time GOMP_taskgroup_end is called.  */
+  if (team == NULL)
+    return;
+  taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup));
+  taskgroup->prev = task->taskgroup;
+  taskgroup->children = NULL;
+  taskgroup->in_taskgroup_wait = false;
+  taskgroup->cancelled = false;
+  taskgroup->num_children = 0;
+  gomp_sem_init (&taskgroup->taskgroup_sem, 0);
+  task->taskgroup = taskgroup;
+}
+
+void
+GOMP_taskgroup_end (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_task *task = thr->task;
+  struct gomp_taskgroup *taskgroup;
+  struct gomp_task *child_task = NULL;
+  struct gomp_task *to_free = NULL;
+  int do_wake = 0;
+
+  if (team == NULL)
+    return;
+  taskgroup = task->taskgroup;
+
+  /* The acquire barrier on load of taskgroup->num_children here
+     synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup.
+     It is not necessary that we synchronize with other non-0 writes at
+     this point, but we must ensure that all writes to memory by a
+     child thread task work function are seen before we exit from
+     GOMP_taskgroup_end.  */
+  if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
+    goto finish;
+
+  gomp_mutex_lock (&team->task_lock);
+  while (1)
+    {
+      bool cancelled = false;
+      if (taskgroup->children == NULL)
+	{
+	  if (taskgroup->num_children)
+	    {
+	      if (task->children == NULL)
+		goto do_wait;
+	      child_task = task->children;
+            }
+          else
+	    {
+	      gomp_mutex_unlock (&team->task_lock);
+	      if (to_free)
+		{
+		  gomp_finish_task (to_free);
+		  free (to_free);
+		}
+	      goto finish;
+	    }
+	}
+      else
+	child_task = taskgroup->children;
+      if (child_task->kind == GOMP_TASK_WAITING)
+	{
+	  cancelled
+	    = gomp_task_run_pre (child_task, child_task->parent, taskgroup,
+				 team);
+	  if (__builtin_expect (cancelled, 0))
+	    {
+	      if (to_free)
+		{
+		  gomp_finish_task (to_free);
+		  free (to_free);
+		  to_free = NULL;
+		}
+	      goto finish_cancelled;
+	    }
+	}
+      else
+	{
+	  child_task = NULL;
+	 do_wait:
+	  /* All tasks we are waiting for are already running
+	     in other threads.  Wait for them.  */
+	  taskgroup->in_taskgroup_wait = true;
+	}
+      gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
+      if (to_free)
+	{
+	  gomp_finish_task (to_free);
+	  free (to_free);
+	  to_free = NULL;
+	}
+      if (child_task)
+	{
+	  thr->task = child_task;
+	  child_task->fn (child_task->fn_data);
+	  thr->task = task;
+	}
+      else
+	gomp_sem_wait (&taskgroup->taskgroup_sem);
+      gomp_mutex_lock (&team->task_lock);
+      if (child_task)
+	{
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
+	  gomp_task_run_post_remove_parent (child_task);
+	  gomp_clear_parent (child_task->children);
+	  gomp_task_run_post_remove_taskgroup (child_task);
+	  to_free = child_task;
+	  child_task = NULL;
+	  team->task_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count
+			- !task->in_tied_task;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
+	}
+    }
+
+ finish:
+  task->taskgroup = taskgroup->prev;
+  gomp_sem_destroy (&taskgroup->taskgroup_sem);
+  free (taskgroup);
+}
+
 int
 omp_in_final (void)
 {
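
A minimal sketch, assuming the usual OpenMP lowering, of the user-level
construct that exercises the two new entry points above; the compiler
emits the GOMP_taskgroup_start/GOMP_taskgroup_end calls around the
region, and process () is a hypothetical work function:

/* Illustrative only: roughly what "#pragma omp taskgroup" lowers to.  */
void process (int i);

void
run_batch (int n)
{
  int i;
  #pragma omp taskgroup			/* -> GOMP_taskgroup_start () */
  {
    for (i = 0; i < n; i++)
      #pragma omp task firstprivate (i)
      process (i);
  }					/* -> GOMP_taskgroup_end (): waits for
					   all tasks generated in the region
					   and their descendants.  */
}
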
--- libgomp/testsuite/libgomp.fortran/lib3.f	(revision 210461)
+++ libgomp/testsuite/libgomp.fortran/lib3.f	(revision 210462)
@@ -66,6 +66,7 @@ C$OMP END PARALLEL
 C$OMP PARALLEL REDUCTION (.OR.:L) IF (.TRUE.)
       L = .NOT. OMP_IN_PARALLEL ()
 C$OMP END PARALLEL
+      IF (L) CALL ABORT
 
       E = OMP_GET_WTIME ()
       IF (D .GT. E) CALL ABORT
--- libgomp/testsuite/libgomp.fortran/lib1.f90	(revision 210461)
+++ libgomp/testsuite/libgomp.fortran/lib1.f90	(revision 210462)
@@ -66,6 +66,7 @@
 !$omp parallel reduction (.or.:l) if (.true.)
   l = .not. omp_in_parallel ()
 !$omp end parallel
+  if (l) call abort
 
   e = omp_get_wtime ()
   if (d .gt. e) call abort
--- libgomp/testsuite/libgomp.fortran/lib2.f	(revision 210461)
+++ libgomp/testsuite/libgomp.fortran/lib2.f	(revision 210462)
@@ -66,6 +66,7 @@ C$OMP END PARALLEL
 C$OMP PARALLEL REDUCTION (.OR.:L) IF (.TRUE.)
       L = .NOT. OMP_IN_PARALLEL ()
 C$OMP END PARALLEL
+      IF (L) CALL ABORT
 
       E = OMP_GET_WTIME ()
       IF (D .GT. E) CALL ABORT
--- libgomp/testsuite/libgomp.c/atomic-14.c	(revision 210461)
+++ libgomp/testsuite/libgomp.c/atomic-14.c	(revision 210462)
@@ -16,7 +16,7 @@ main ()
   #pragma omp atomic update
     x = x + 7;
   #pragma omp atomic
-    x = x + 7 + 6;
+    x = x + (7 + 6);
   #pragma omp atomic update
     x = x + 2 * 3;
   #pragma omp atomic
@@ -65,7 +65,7 @@ main ()
   if (v != -8)
     abort ();
   #pragma omp atomic
-    x = x * -4 / 2;
+    x = x * (-4 / 2);
   #pragma omp atomic read
     v = x;
   if (v != 16)
--- libgomp/testsuite/libgomp.c/lib-1.c	(revision 210461)
+++ libgomp/testsuite/libgomp.c/lib-1.c	(revision 210462)
@@ -85,6 +85,8 @@ main (void)
   l = ! omp_in_parallel ();
 #pragma omp parallel reduction (|:l) if (1)
   l = ! omp_in_parallel ();
+  if (l)
+    abort ();
 
   e = omp_get_wtime ();
   if (d > e)
--- libgomp/loop.c	(revision 210461)
+++ libgomp/loop.c	(revision 210462)
@@ -439,14 +439,14 @@ static void
 gomp_parallel_loop_start (void (*fn) (void *), void *data,
 			  unsigned num_threads, long start, long end,
 			  long incr, enum gomp_schedule_type sched,
-			  long chunk_size)
+			  long chunk_size, unsigned int flags)
 {
   struct gomp_team *team;
 
   num_threads = gomp_resolve_num_threads (num_threads, 0);
   team = gomp_new_team (num_threads);
   gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size);
-  gomp_team_start (fn, data, num_threads, team);
+  gomp_team_start (fn, data, num_threads, flags, team);
 }
 
 void
@@ -455,7 +455,7 @@ GOMP_parallel_loop_static_start (void (*
 				 long incr, long chunk_size)
 {
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    GFS_STATIC, chunk_size);
+			    GFS_STATIC, chunk_size, 0);
 }
 
 void
@@ -464,7 +464,7 @@ GOMP_parallel_loop_dynamic_start (void (
 				  long incr, long chunk_size)
 {
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    GFS_DYNAMIC, chunk_size);
+			    GFS_DYNAMIC, chunk_size, 0);
 }
 
 void
@@ -473,7 +473,7 @@ GOMP_parallel_loop_guided_start (void (*
 				 long incr, long chunk_size)
 {
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    GFS_GUIDED, chunk_size);
+			    GFS_GUIDED, chunk_size, 0);
 }
 
 void
@@ -483,11 +483,59 @@ GOMP_parallel_loop_runtime_start (void (
 {
   struct gomp_task_icv *icv = gomp_icv (false);
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    icv->run_sched_var, icv->run_sched_modifier);
+			    icv->run_sched_var, icv->run_sched_modifier, 0);
+}
+
+ialias_redirect (GOMP_parallel_end)
+
+void
+GOMP_parallel_loop_static (void (*fn) (void *), void *data,
+			   unsigned num_threads, long start, long end,
+			   long incr, long chunk_size, unsigned flags)
+{
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+			    GFS_STATIC, chunk_size, flags);
+  fn (data);
+  GOMP_parallel_end ();
+}
+
+void
+GOMP_parallel_loop_dynamic (void (*fn) (void *), void *data,
+			    unsigned num_threads, long start, long end,
+			    long incr, long chunk_size, unsigned flags)
+{
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+			    GFS_DYNAMIC, chunk_size, flags);
+  fn (data);
+  GOMP_parallel_end ();
+}
+
+void
+GOMP_parallel_loop_guided (void (*fn) (void *), void *data,
+			  unsigned num_threads, long start, long end,
+			  long incr, long chunk_size, unsigned flags)
+{
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+			    GFS_GUIDED, chunk_size, flags);
+  fn (data);
+  GOMP_parallel_end ();
+}
+
+void
+GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
+			    unsigned num_threads, long start, long end,
+			    long incr, unsigned flags)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+			    icv->run_sched_var, icv->run_sched_modifier,
+			    flags);
+  fn (data);
+  GOMP_parallel_end ();
 }
 
 /* The GOMP_loop_end* routines are called after the thread is told that
-   all loop iterations are complete.  This first version synchronizes
+   all loop iterations are complete.  The first two versions synchronize
    all threads; the nowait version does not.  */
 
 void
@@ -496,6 +544,12 @@ GOMP_loop_end (void)
   gomp_work_share_end ();
 }
 
+bool
+GOMP_loop_end_cancel (void)
+{
+  return gomp_work_share_end_cancel ();
+}
+
 void
 GOMP_loop_end_nowait (void)
 {
--- libgomp/hashtab.h	(revision 0)
+++ libgomp/hashtab.h	(revision 210462)
@@ -0,0 +1,443 @@
+/* An expandable hash tables datatype.
+   Copyright (C) 1999-2013
+   Free Software Foundation, Inc.
+   Contributed by Vladimir Makarov <vmakarov@cygnus.com>.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* The hash table code copied from include/hashtab.[hc] and adjusted,
+   so that the hash table entries are in the flexible array at the end
+   of the control structure, no callbacks are used and the elements in the
+   table are of the hash_entry_type type.
+   Before including this file, define the hash_entry_type type and the
+   htab_alloc and htab_free functions.  After including it, define the
+   htab_hash and htab_eq inline functions.  */
+
+/* This package implements basic hash table functionality.  It is possible
+   to search for an entry, create an entry and destroy an entry.
+
+   Elements in the table are generic pointers.
+
+   The size of the table is not fixed; if the occupancy of the table
+   grows too high the hash table will be expanded.
+
+   The abstract data implementation is based on generalized Algorithm D
+   from Knuth's book "The Art of Computer Programming".  The hash table
+   is expanded by creating a new hash table and transferring elements
+   from the old table to the new one.  */
+
+/* The type for a hash code.  */
+typedef unsigned int hashval_t;
+
+static inline hashval_t htab_hash (hash_entry_type);
+static inline bool htab_eq (hash_entry_type, hash_entry_type);
+
+/* This macro defines reserved value for empty table entry.  */
+
+#define HTAB_EMPTY_ENTRY    ((hash_entry_type) 0)
+
+/* This macro defines reserved value for table entry which contained
+   a deleted element. */
+
+#define HTAB_DELETED_ENTRY  ((hash_entry_type) 1)
+
+/* Hash tables are of the following type.  The structure
+   (implementation) of this type is not needed for using the hash
+   tables.  All work with a hash table should be done only through the
+   functions mentioned below.  The size of this structure is subject to
+   change.  */
+
+struct htab {
+  /* Current size (in entries) of the hash table.  */
+  size_t size;
+
+  /* Current number of elements including also deleted elements.  */
+  size_t n_elements;
+
+  /* Current number of deleted elements in the table.  */
+  size_t n_deleted;
+
+  /* Current size (in entries) of the hash table, as an index into the
+     table of primes.  */
+  unsigned int size_prime_index;
+
+  /* Table itself.  */
+  hash_entry_type entries[];
+};
+
+typedef struct htab *htab_t;
+
+/* An enum saying whether we insert into the hash table or not.  */
+enum insert_option {NO_INSERT, INSERT};
+
+/* Table of primes and multiplicative inverses.
+
+   Note that these are not minimally reduced inverses.  Unlike when generating
+   code to divide by a constant, we want to be able to use the same algorithm
+   all the time.  All of these inverses (are implied to) have bit 32 set.
+
+   For the record, the function that computed the table is in
+   libiberty/hashtab.c.  */
+
+struct prime_ent
+{
+  hashval_t prime;
+  hashval_t inv;
+  hashval_t inv_m2;	/* inverse of prime-2 */
+  hashval_t shift;
+};
+
+static struct prime_ent const prime_tab[] = {
+  {          7, 0x24924925, 0x9999999b, 2 },
+  {         13, 0x3b13b13c, 0x745d1747, 3 },
+  {         31, 0x08421085, 0x1a7b9612, 4 },
+  {         61, 0x0c9714fc, 0x15b1e5f8, 5 },
+  {        127, 0x02040811, 0x0624dd30, 6 },
+  {        251, 0x05197f7e, 0x073260a5, 7 },
+  {        509, 0x01824366, 0x02864fc8, 8 },
+  {       1021, 0x00c0906d, 0x014191f7, 9 },
+  {       2039, 0x0121456f, 0x0161e69e, 10 },
+  {       4093, 0x00300902, 0x00501908, 11 },
+  {       8191, 0x00080041, 0x00180241, 12 },
+  {      16381, 0x000c0091, 0x00140191, 13 },
+  {      32749, 0x002605a5, 0x002a06e6, 14 },
+  {      65521, 0x000f00e2, 0x00110122, 15 },
+  {     131071, 0x00008001, 0x00018003, 16 },
+  {     262139, 0x00014002, 0x0001c004, 17 },
+  {     524287, 0x00002001, 0x00006001, 18 },
+  {    1048573, 0x00003001, 0x00005001, 19 },
+  {    2097143, 0x00004801, 0x00005801, 20 },
+  {    4194301, 0x00000c01, 0x00001401, 21 },
+  {    8388593, 0x00001e01, 0x00002201, 22 },
+  {   16777213, 0x00000301, 0x00000501, 23 },
+  {   33554393, 0x00001381, 0x00001481, 24 },
+  {   67108859, 0x00000141, 0x000001c1, 25 },
+  {  134217689, 0x000004e1, 0x00000521, 26 },
+  {  268435399, 0x00000391, 0x000003b1, 27 },
+  {  536870909, 0x00000019, 0x00000029, 28 },
+  { 1073741789, 0x0000008d, 0x00000095, 29 },
+  { 2147483647, 0x00000003, 0x00000007, 30 },
+  /* Avoid "decimal constant so large it is unsigned" for 4294967291.  */
+  { 0xfffffffb, 0x00000006, 0x00000008, 31 }
+};
+
+/* The following function returns an index into the above table of the
+   nearest prime number which is greater than N, and near a power of two. */
+
+static unsigned int
+higher_prime_index (unsigned long n)
+{
+  unsigned int low = 0;
+  unsigned int high = sizeof(prime_tab) / sizeof(prime_tab[0]);
+
+  while (low != high)
+    {
+      unsigned int mid = low + (high - low) / 2;
+      if (n > prime_tab[mid].prime)
+	low = mid + 1;
+      else
+	high = mid;
+    }
+
+  /* If we've run out of primes, abort.  */
+  if (n > prime_tab[low].prime)
+    abort ();
+
+  return low;
+}
+
+/* Return the current size of given hash table.  */
+
+static inline size_t
+htab_size (htab_t htab)
+{
+  return htab->size;
+}
+
+/* Return the current number of elements in given hash table. */
+
+static inline size_t
+htab_elements (htab_t htab)
+{
+  return htab->n_elements - htab->n_deleted;
+}
+
+/* Return X % Y.  */
+
+static inline hashval_t
+htab_mod_1 (hashval_t x, hashval_t y, hashval_t inv, int shift)
+{
+  /* The multiplicative inverses computed above are for 32-bit types, and
+     require that we be able to compute a highpart multiply.  */
+  if (sizeof (hashval_t) * __CHAR_BIT__ <= 32)
+    {
+      hashval_t t1, t2, t3, t4, q, r;
+
+      t1 = ((unsigned long long)x * inv) >> 32;
+      t2 = x - t1;
+      t3 = t2 >> 1;
+      t4 = t1 + t3;
+      q  = t4 >> shift;
+      r  = x - (q * y);
+
+      return r;
+    }
+
+  /* Otherwise just use the native division routines.  */
+  return x % y;
+}
+
+/* Compute the primary hash for HASH given HTAB's current size.  */
+
+static inline hashval_t
+htab_mod (hashval_t hash, htab_t htab)
+{
+  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
+  return htab_mod_1 (hash, p->prime, p->inv, p->shift);
+}
+
+/* Compute the secondary hash for HASH given HTAB's current size.  */
+
+static inline hashval_t
+htab_mod_m2 (hashval_t hash, htab_t htab)
+{
+  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
+  return 1 + htab_mod_1 (hash, p->prime - 2, p->inv_m2, p->shift);
+}
+
+/* Create hash table of size SIZE.  */
+
+static htab_t
+htab_create (size_t size)
+{
+  htab_t result;
+  unsigned int size_prime_index;
+
+  size_prime_index = higher_prime_index (size);
+  size = prime_tab[size_prime_index].prime;
+
+  result = (htab_t) htab_alloc (sizeof (struct htab)
+				+ size * sizeof (hash_entry_type));
+  result->size = size;
+  result->n_elements = 0;
+  result->n_deleted = 0;
+  result->size_prime_index = size_prime_index;
+  memset (result->entries, 0, size * sizeof (hash_entry_type));
+  return result;
+}
+
+/* Similar to htab_find_slot, but without several unwanted side effects:
+    - Does not call htab_eq when it finds an existing entry.
+    - Does not change the count of elements in the hash table.
+   This function also assumes there are no deleted entries in the table.
+   HASH is the hash value for the element to be inserted.  */
+
+static hash_entry_type *
+find_empty_slot_for_expand (htab_t htab, hashval_t hash)
+{
+  hashval_t index = htab_mod (hash, htab);
+  size_t size = htab_size (htab);
+  hash_entry_type *slot = htab->entries + index;
+  hashval_t hash2;
+
+  if (*slot == HTAB_EMPTY_ENTRY)
+    return slot;
+  else if (*slot == HTAB_DELETED_ENTRY)
+    abort ();
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      slot = htab->entries + index;
+      if (*slot == HTAB_EMPTY_ENTRY)
+	return slot;
+      else if (*slot == HTAB_DELETED_ENTRY)
+	abort ();
+    }
+}
+
+/* The following function changes the size of the memory allocated for
+   the entries and re-inserts the table elements.  The occupancy
+   of the table after the call will be about 50%.  Naturally the hash
+   table must already exist.  Note also that the locations of the
+   table entries change.  */
+
+static htab_t
+htab_expand (htab_t htab)
+{
+  htab_t nhtab;
+  hash_entry_type *olimit;
+  hash_entry_type *p;
+  size_t osize, elts;
+
+  osize = htab->size;
+  olimit = htab->entries + osize;
+  elts = htab_elements (htab);
+
+  /* Resize only when the table, after removal of unused elements, is
+     either too full or too empty.  */
+  if (elts * 2 > osize || (elts * 8 < osize && osize > 32))
+    nhtab = htab_create (elts * 2);
+  else
+    nhtab = htab_create (osize - 1);
+  nhtab->n_elements = htab->n_elements - htab->n_deleted;
+
+  p = htab->entries;
+  do
+    {
+      hash_entry_type x = *p;
+
+      if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY)
+	*find_empty_slot_for_expand (nhtab, htab_hash (x)) = x;
+
+      p++;
+    }
+  while (p < olimit);
+
+  htab_free (htab);
+  return nhtab;
+}
+
+/* This function searches for a hash table entry equal to the given
+   element.  It cannot be used to insert or delete an element.  */
+
+static hash_entry_type
+htab_find (htab_t htab, const hash_entry_type element)
+{
+  hashval_t index, hash2, hash = htab_hash (element);
+  size_t size;
+  hash_entry_type entry;
+
+  size = htab_size (htab);
+  index = htab_mod (hash, htab);
+
+  entry = htab->entries[index];
+  if (entry == HTAB_EMPTY_ENTRY
+      || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
+    return entry;
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      entry = htab->entries[index];
+      if (entry == HTAB_EMPTY_ENTRY
+	  || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
+	return entry;
+    }
+}
+
+/* This function searches for a hash table slot containing an entry
+   equal to the given element.  To delete an entry, call this with
+   insert=NO_INSERT, then call htab_clear_slot on the slot returned
+   (possibly after doing some checks).  To insert an entry, call this
+   with insert=INSERT, then write the value you want into the returned
+   slot.  */
+
+static hash_entry_type *
+htab_find_slot (htab_t *htabp, const hash_entry_type element,
+		enum insert_option insert)
+{
+  hash_entry_type *first_deleted_slot;
+  hashval_t index, hash2, hash = htab_hash (element);
+  size_t size;
+  hash_entry_type entry;
+  htab_t htab = *htabp;
+
+  size = htab_size (htab);
+  if (insert == INSERT && size * 3 <= htab->n_elements * 4)
+    {
+      htab = *htabp = htab_expand (htab);
+      size = htab_size (htab);
+    }
+
+  index = htab_mod (hash, htab);
+
+  first_deleted_slot = NULL;
+
+  entry = htab->entries[index];
+  if (entry == HTAB_EMPTY_ENTRY)
+    goto empty_entry;
+  else if (entry == HTAB_DELETED_ENTRY)
+    first_deleted_slot = &htab->entries[index];
+  else if (htab_eq (entry, element))
+    return &htab->entries[index];
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      entry = htab->entries[index];
+      if (entry == HTAB_EMPTY_ENTRY)
+	goto empty_entry;
+      else if (entry == HTAB_DELETED_ENTRY)
+	{
+	  if (!first_deleted_slot)
+	    first_deleted_slot = &htab->entries[index];
+	}
+      else if (htab_eq (entry, element))
+	return &htab->entries[index];
+    }
+
+ empty_entry:
+  if (insert == NO_INSERT)
+    return NULL;
+
+  if (first_deleted_slot)
+    {
+      htab->n_deleted--;
+      *first_deleted_slot = HTAB_EMPTY_ENTRY;
+      return first_deleted_slot;
+    }
+
+  htab->n_elements++;
+  return &htab->entries[index];
+}
+
+/* This function clears a specified slot in a hash table.  It is
+   useful when you've already done the lookup and don't want to do it
+   again.  */
+
+static inline void
+htab_clear_slot (htab_t htab, hash_entry_type *slot)
+{
+  if (slot < htab->entries || slot >= htab->entries + htab_size (htab)
+      || *slot == HTAB_EMPTY_ENTRY || *slot == HTAB_DELETED_ENTRY)
+    abort ();
+
+  *slot = HTAB_DELETED_ENTRY;
+  htab->n_deleted++;
+}
+
+/* Returns a hash code for pointer P.  Simplified version of evahash.  */
+
+static inline hashval_t
+hash_pointer (const void *p)
+{
+  uintptr_t v = (uintptr_t) p;
+  if (sizeof (v) > sizeof (hashval_t))
+    v ^= v >> (sizeof (uintptr_t) / 2 * __CHAR_BIT__);
+  return v;
+}
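
The header's contract (define hash_entry_type plus the htab_alloc and
htab_free functions before inclusion, the htab_hash/htab_eq inlines
after) can be illustrated with a minimal hypothetical client; the key
type and helpers below are examples, not libgomp code:

#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>

struct key { uintptr_t addr; };
typedef struct key *hash_entry_type;

static void *htab_alloc (size_t size) { return malloc (size); }
static void htab_free (void *ptr) { free (ptr); }

#include "hashtab.h"

static inline hashval_t
htab_hash (hash_entry_type e)
{
  return hash_pointer ((void *) e->addr);
}

static inline bool
htab_eq (hash_entry_type a, hash_entry_type b)
{
  return a->addr == b->addr;
}

/* Create a table, insert K, and look it up again.  */
static bool
insert_and_find (struct key *k)
{
  htab_t tab = htab_create (16);
  hash_entry_type *slot = htab_find_slot (&tab, k, INSERT);
  *slot = k;
  return htab_find (tab, k) == k;
}
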
--- libgomp/work.c	(revision 210461)
+++ libgomp/work.c	(revision 210462)
@@ -221,7 +221,10 @@ gomp_work_share_end (void)
   if (gomp_barrier_last_thread (bstate))
     {
       if (__builtin_expect (thr->ts.last_work_share != NULL, 1))
-	free_work_share (team, thr->ts.last_work_share);
+	{
+	  team->work_shares_to_free = thr->ts.work_share;
+	  free_work_share (team, thr->ts.last_work_share);
+	}
     }
 
   gomp_team_barrier_wait_end (&team->barrier, bstate);
@@ -229,6 +232,32 @@ gomp_work_share_end (void)
 }
 
 /* The current thread is done with its current work sharing construct.
+   This version implies a cancellable barrier at the end of the work-share.  */
+
+bool
+gomp_work_share_end_cancel (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  gomp_barrier_state_t bstate;
+
+  /* Cancellable work sharing constructs cannot be orphaned.  */
+  bstate = gomp_barrier_wait_cancel_start (&team->barrier);
+
+  if (gomp_barrier_last_thread (bstate))
+    {
+      if (__builtin_expect (thr->ts.last_work_share != NULL, 1))
+	{
+	  team->work_shares_to_free = thr->ts.work_share;
+	  free_work_share (team, thr->ts.last_work_share);
+	}
+    }
+  thr->ts.last_work_share = NULL;
+
+  return gomp_team_barrier_wait_cancel_end (&team->barrier, bstate);
+}
+
+/* The current thread is done with its current work sharing construct.
    This version does NOT imply a barrier at the end of the work-share.  */
 
 void
@@ -259,6 +288,9 @@ gomp_work_share_end_nowait (void)
 #endif
 
   if (completed == team->nthreads)
-    free_work_share (team, thr->ts.last_work_share);
+    {
+      team->work_shares_to_free = thr->ts.work_share;
+      free_work_share (team, thr->ts.last_work_share);
+    }
   thr->ts.last_work_share = NULL;
 }
--- libgomp/config/linux/proc.c	(revision 210461)
+++ libgomp/config/linux/proc.c	(revision 210462)
@@ -30,6 +30,7 @@
 #endif
 #include "libgomp.h"
 #include "proc.h"
+#include <errno.h>
 #include <stdlib.h>
 #include <unistd.h>
 #ifdef HAVE_GETLOADAVG
@@ -39,19 +40,28 @@
 #endif
 
 #ifdef HAVE_PTHREAD_AFFINITY_NP
+unsigned long gomp_cpuset_size;
+static unsigned long gomp_get_cpuset_size;
+cpu_set_t *gomp_cpusetp;
+
 unsigned long
-gomp_cpuset_popcount (cpu_set_t *cpusetp)
+gomp_cpuset_popcount (unsigned long cpusetsize, cpu_set_t *cpusetp)
 {
-#ifdef CPU_COUNT
-  /* glibc 2.6 and above provide a macro for this.  */
-  return CPU_COUNT (cpusetp);
+#ifdef CPU_COUNT_S
+  /* glibc 2.7 and above provide a macro for this.  */
+  return CPU_COUNT_S (cpusetsize, cpusetp);
 #else
+#ifdef CPU_COUNT
+  if (cpusetsize == sizeof (cpu_set_t))
+    /* glibc 2.6 and above provide a macro for this.  */
+    return CPU_COUNT (cpusetp);
+#endif
   size_t i;
   unsigned long ret = 0;
-  extern int check[sizeof (cpusetp->__bits[0]) == sizeof (unsigned long int)];
+  extern int check[sizeof (cpusetp->__bits[0]) == sizeof (unsigned long int)
+		   ? 1 : -1] __attribute__((unused));
 
-  (void) check;
-  for (i = 0; i < sizeof (*cpusetp) / sizeof (cpusetp->__bits[0]); i++)
+  for (i = 0; i < cpusetsize / sizeof (cpusetp->__bits[0]); i++)
     {
       unsigned long int mask = cpusetp->__bits[i];
       if (mask == 0)
@@ -70,16 +80,63 @@ void
 gomp_init_num_threads (void)
 {
 #ifdef HAVE_PTHREAD_AFFINITY_NP
-  cpu_set_t cpuset;
+#if defined (_SC_NPROCESSORS_CONF) && defined (CPU_ALLOC_SIZE)
+  gomp_cpuset_size = sysconf (_SC_NPROCESSORS_CONF);
+  gomp_cpuset_size = CPU_ALLOC_SIZE (gomp_cpuset_size);
+#else
+  gomp_cpuset_size = sizeof (cpu_set_t);
+#endif
 
-  if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset), &cpuset) == 0)
+  gomp_cpusetp = (cpu_set_t *) gomp_malloc (gomp_cpuset_size);
+  do
     {
-      /* Count only the CPUs this process can use.  */
-      gomp_global_icv.nthreads_var = gomp_cpuset_popcount (&cpuset);
-      if (gomp_global_icv.nthreads_var == 0)
-	gomp_global_icv.nthreads_var = 1;
-      return;
+      int ret = pthread_getaffinity_np (pthread_self (), gomp_cpuset_size,
+					gomp_cpusetp);
+      if (ret == 0)
+	{
+	  /* Count only the CPUs this process can use.  */
+	  gomp_global_icv.nthreads_var
+	    = gomp_cpuset_popcount (gomp_cpuset_size, gomp_cpusetp);
+	  if (gomp_global_icv.nthreads_var == 0)
+	    break;
+	  gomp_get_cpuset_size = gomp_cpuset_size;
+#ifdef CPU_ALLOC_SIZE
+	  unsigned long i;
+	  for (i = gomp_cpuset_size * 8; i; i--)
+	    if (CPU_ISSET_S (i - 1, gomp_cpuset_size, gomp_cpusetp))
+	      break;
+	  gomp_cpuset_size = CPU_ALLOC_SIZE (i);
+#endif
+	  return;
+	}
+      if (ret != EINVAL)
+	break;
+#ifdef CPU_ALLOC_SIZE
+      if (gomp_cpuset_size < sizeof (cpu_set_t))
+	gomp_cpuset_size = sizeof (cpu_set_t);
+      else
+	gomp_cpuset_size = gomp_cpuset_size * 2;
+      if (gomp_cpuset_size < 8 * sizeof (cpu_set_t))
+	gomp_cpusetp
+	  = (cpu_set_t *) gomp_realloc (gomp_cpusetp, gomp_cpuset_size);
+      else
+	{
+	  /* Avoid gomp_fatal if a too large memory allocation would be
+	     requested, e.g. the kernel returning EINVAL all the time.  */
+	  void *p = realloc (gomp_cpusetp, gomp_cpuset_size);
+	  if (p == NULL)
+	    break;
+	  gomp_cpusetp = (cpu_set_t *) p;
+	}
+#else
+      break;
+#endif
     }
+  while (1);
+  gomp_cpuset_size = 0;
+  gomp_global_icv.nthreads_var = 1;
+  free (gomp_cpusetp);
+  gomp_cpusetp = NULL;
 #endif
 #ifdef _SC_NPROCESSORS_ONLN
   gomp_global_icv.nthreads_var = sysconf (_SC_NPROCESSORS_ONLN);
@@ -90,15 +147,14 @@ static int
 get_num_procs (void)
 {
 #ifdef HAVE_PTHREAD_AFFINITY_NP
-  cpu_set_t cpuset;
-
-  if (gomp_cpu_affinity == NULL)
+  if (gomp_places_list == NULL)
     {
       /* Count only the CPUs this process can use.  */
-      if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset),
-				  &cpuset) == 0)
+      if (gomp_cpusetp
+	  && pthread_getaffinity_np (pthread_self (), gomp_get_cpuset_size,
+				     gomp_cpusetp) == 0)
 	{
-	  int ret = gomp_cpuset_popcount (&cpuset);
+	  int ret = gomp_cpuset_popcount (gomp_get_cpuset_size, gomp_cpusetp);
 	  return ret != 0 ? ret : 1;
 	}
     }
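
A standalone sketch of the dynamically sized cpuset query that the new
gomp_init_num_threads loop above performs; this is illustrative code
under glibc's CPU_* macros, not libgomp code:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>

static long
count_usable_cpus (void)
{
  size_t size = CPU_ALLOC_SIZE (sysconf (_SC_NPROCESSORS_CONF));
  cpu_set_t *set = malloc (size);
  if (set == NULL)
    return 1;
  for (;;)
    {
      int ret = pthread_getaffinity_np (pthread_self (), size, set);
      if (ret == 0)
	{
	  long n = CPU_COUNT_S (size, set);
	  free (set);
	  return n ? n : 1;
	}
      if (ret != EINVAL)
	break;
      /* Kernel mask is larger than our buffer: grow and retry.  */
      size *= 2;
      void *p = realloc (set, size);
      if (p == NULL)
	break;
      set = p;
    }
  free (set);
  return 1;
}
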
--- libgomp/config/linux/bar.c	(revision 210461)
+++ libgomp/config/linux/bar.c	(revision 210462)
@@ -33,11 +33,11 @@
 void
 gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 {
-  if (__builtin_expect ((state & 1) != 0, 0))
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
     {
       /* Next time we'll be awaiting TOTAL threads again.  */
       bar->awaited = bar->total;
-      __atomic_store_n (&bar->generation, bar->generation + 4,
+      __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
 			MEMMODEL_RELEASE);
       futex_wake ((int *) &bar->generation, INT_MAX);
     }
@@ -66,7 +66,7 @@ void
 gomp_barrier_wait_last (gomp_barrier_t *bar)
 {
   gomp_barrier_state_t state = gomp_barrier_wait_start (bar);
-  if (state & 1)
+  if (state & BAR_WAS_LAST)
     gomp_barrier_wait_end (bar, state);
 }
 
@@ -81,40 +81,43 @@ gomp_team_barrier_wait_end (gomp_barrier
 {
   unsigned int generation, gen;
 
-  if (__builtin_expect ((state & 1) != 0, 0))
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
     {
       /* Next time we'll be awaiting TOTAL threads again.  */
       struct gomp_thread *thr = gomp_thread ();
       struct gomp_team *team = thr->ts.team;
 
       bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
       if (__builtin_expect (team->task_count, 0))
 	{
 	  gomp_barrier_handle_tasks (state);
-	  state &= ~1;
+	  state &= ~BAR_WAS_LAST;
 	}
       else
 	{
-	  __atomic_store_n (&bar->generation, state + 3, MEMMODEL_RELEASE);
+	  state &= ~BAR_CANCELLED;
+	  state += BAR_INCR - BAR_WAS_LAST;
+	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
 	  futex_wake ((int *) &bar->generation, INT_MAX);
 	  return;
 	}
     }
 
   generation = state;
+  state &= ~BAR_CANCELLED;
   do
     {
       do_wait ((int *) &bar->generation, generation);
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
-      if (__builtin_expect (gen & 1, 0))
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
 	{
 	  gomp_barrier_handle_tasks (state);
 	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
 	}
-      if ((gen & 2) != 0)
-	generation |= 2;
+      generation |= gen & BAR_WAITING_FOR_TASK;
     }
-  while (gen != state + 4);
+  while (gen != state + BAR_INCR);
 }
 
 void
@@ -122,3 +125,86 @@ gomp_team_barrier_wait (gomp_barrier_t *
 {
   gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
 }
+
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    bar->awaited_final = bar->total;
+  gomp_team_barrier_wait_end (bar, state);
+}
+
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+				   gomp_barrier_state_t state)
+{
+  unsigned int generation, gen;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      /* BAR_CANCELLED should never be set in state here, because
+	 cancellation means that at least one of the threads has been
+	 cancelled, thus on a cancellable barrier we should never see
+	 all threads arrive.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  state &= ~BAR_WAS_LAST;
+	}
+      else
+	{
+	  state += BAR_INCR - BAR_WAS_LAST;
+	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+	  futex_wake ((int *) &bar->generation, INT_MAX);
+	  return false;
+	}
+    }
+
+  if (__builtin_expect (state & BAR_CANCELLED, 0))
+    return true;
+
+  generation = state;
+  do
+    {
+      do_wait ((int *) &bar->generation, generation);
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & BAR_CANCELLED, 0))
+	return true;
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	}
+      generation |= gen & BAR_WAITING_FOR_TASK;
+    }
+  while (gen != state + BAR_INCR);
+
+  return false;
+}
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+  gomp_mutex_lock (&team->task_lock);
+  if (team->barrier.generation & BAR_CANCELLED)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      return;
+    }
+  team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
+  futex_wake ((int *) &team->barrier.generation, INT_MAX);
+}
--- libgomp/config/linux/proc.h	(revision 210461)
+++ libgomp/config/linux/proc.h	(revision 210462)
@@ -28,7 +28,10 @@
 #include <sched.h>
 
 #ifdef HAVE_PTHREAD_AFFINITY_NP
-extern unsigned long gomp_cpuset_popcount (cpu_set_t *);
+extern unsigned long gomp_cpuset_size attribute_hidden;
+extern cpu_set_t *gomp_cpusetp attribute_hidden;
+extern unsigned long gomp_cpuset_popcount (unsigned long, cpu_set_t *)
+     attribute_hidden;
 #endif
 
 #endif /* GOMP_PROC_H */
--- libgomp/config/linux/affinity.c	(revision 210461)
+++ libgomp/config/linux/affinity.c	(revision 210462)
@@ -29,90 +29,327 @@
 #endif
 #include "libgomp.h"
 #include "proc.h"
+#include <errno.h>
 #include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
 #include <unistd.h>
 
 #ifdef HAVE_PTHREAD_AFFINITY_NP
 
-static unsigned int affinity_counter;
+#ifndef CPU_ALLOC_SIZE
+#define CPU_ISSET_S(idx, size, set) CPU_ISSET(idx, set)
+#define CPU_ZERO_S(size, set) CPU_ZERO(set)
+#define CPU_SET_S(idx, size, set) CPU_SET(idx, set)
+#define CPU_CLR_S(idx, size, set) CPU_CLR(idx, set)
+#endif
 
 void
 gomp_init_affinity (void)
 {
-  cpu_set_t cpuset, cpusetnew;
-  size_t idx, widx;
-  unsigned long cpus = 0;
-
-  if (pthread_getaffinity_np (pthread_self (), sizeof (cpuset), &cpuset))
-    {
-      gomp_error ("could not get CPU affinity set");
-      free (gomp_cpu_affinity);
-      gomp_cpu_affinity = NULL;
-      gomp_cpu_affinity_len = 0;
-      return;
-    }
-
-  CPU_ZERO (&cpusetnew);
-  if (gomp_cpu_affinity_len == 0)
-    {
-      unsigned long count = gomp_cpuset_popcount (&cpuset);
-      if (count >= 65536)
-	count = 65536;
-      gomp_cpu_affinity = malloc (count * sizeof (unsigned short));
-      if (gomp_cpu_affinity == NULL)
+  if (gomp_places_list == NULL)
+    {
+      if (!gomp_affinity_init_level (1, ULONG_MAX, true))
+	return;
+    }
+
+  struct gomp_thread *thr = gomp_thread ();
+  pthread_setaffinity_np (pthread_self (), gomp_cpuset_size,
+			  (cpu_set_t *) gomp_places_list[0]);
+  thr->place = 1;
+  thr->ts.place_partition_off = 0;
+  thr->ts.place_partition_len = gomp_places_list_len;
+}
+
+void
+gomp_init_thread_affinity (pthread_attr_t *attr, unsigned int place)
+{
+  pthread_attr_setaffinity_np (attr, gomp_cpuset_size,
+			       (cpu_set_t *) gomp_places_list[place]);
+}
+
+void **
+gomp_affinity_alloc (unsigned long count, bool quiet)
+{
+  unsigned long i;
+  void **ret;
+  char *p;
+
+  if (gomp_cpusetp == NULL)
+    {
+      if (!quiet)
+	gomp_error ("Could not get CPU affinity set");
+      return NULL;
+    }
+
+  ret = malloc (count * sizeof (void *) + count * gomp_cpuset_size);
+  if (ret == NULL)
+    {
+      if (!quiet)
+	gomp_error ("Out of memory trying to allocate places list");
+      return NULL;
+    }
+
+  p = (char *) (ret + count);
+  for (i = 0; i < count; i++, p += gomp_cpuset_size)
+    ret[i] = p;
+  return ret;
+}
+
+void
+gomp_affinity_init_place (void *p)
+{
+  cpu_set_t *cpusetp = (cpu_set_t *) p;
+  CPU_ZERO_S (gomp_cpuset_size, cpusetp);
+}
+
+bool
+gomp_affinity_add_cpus (void *p, unsigned long num,
+			unsigned long len, long stride, bool quiet)
+{
+  cpu_set_t *cpusetp = (cpu_set_t *) p;
+  unsigned long max = 8 * gomp_cpuset_size;
+  for (;;)
+    {
+      if (num >= max)
 	{
-	  gomp_error ("not enough memory to store CPU affinity list");
-	  return;
+	  if (!quiet)
+	    gomp_error ("Logical CPU number %lu out of range", num);
+	  return false;
 	}
-      for (widx = idx = 0; widx < count && idx < 65536; idx++)
-	if (CPU_ISSET (idx, &cpuset))
+      CPU_SET_S (num, gomp_cpuset_size, cpusetp);
+      if (--len == 0)
+	return true;
+      if ((stride < 0 && num + stride > num)
+	  || (stride > 0 && num + stride < num))
+	{
+	  if (!quiet)
+	    gomp_error ("Logical CPU number %lu+%ld out of range",
+			num, stride);
+	  return false;
+	}
+      num += stride;
+    }
+}
+
+bool
+gomp_affinity_remove_cpu (void *p, unsigned long num)
+{
+  cpu_set_t *cpusetp = (cpu_set_t *) p;
+  if (num >= 8 * gomp_cpuset_size)
+    {
+      gomp_error ("Logical CPU number %lu out of range", num);
+      return false;
+    }
+  if (!CPU_ISSET_S (num, gomp_cpuset_size, cpusetp))
+    {
+      gomp_error ("Logical CPU %lu to be removed is not in the set", num);
+      return false;
+    }
+  CPU_CLR_S (num, gomp_cpuset_size, cpusetp);
+  return true;
+}
+
+bool
+gomp_affinity_copy_place (void *p, void *q, long stride)
+{
+  unsigned long i, max = 8 * gomp_cpuset_size;
+  cpu_set_t *destp = (cpu_set_t *) p;
+  cpu_set_t *srcp = (cpu_set_t *) q;
+
+  CPU_ZERO_S (gomp_cpuset_size, destp);
+  for (i = 0; i < max; i++)
+    if (CPU_ISSET_S (i, gomp_cpuset_size, srcp))
+      {
+	if ((stride < 0 && i + stride > i)
+	    || (stride > 0 && (i + stride < i || i + stride >= max)))
+	  {
+	    gomp_error ("Logical CPU number %lu+%ld out of range", i, stride);
+	    return false;
+	  }
+	CPU_SET_S (i + stride, gomp_cpuset_size, destp);
+      }
+  return true;
+}
+
+bool
+gomp_affinity_same_place (void *p, void *q)
+{
+#ifdef CPU_EQUAL_S
+  return CPU_EQUAL_S (gomp_cpuset_size, (cpu_set_t *) p, (cpu_set_t *) q);
+#else
+  return memcmp (p, q, gomp_cpuset_size) == 0;
+#endif
+}
+
+bool
+gomp_affinity_finalize_place_list (bool quiet)
+{
+  unsigned long i, j;
+
+  for (i = 0, j = 0; i < gomp_places_list_len; i++)
+    {
+      cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[i];
+      bool nonempty = false;
+#ifdef CPU_AND_S
+      CPU_AND_S (gomp_cpuset_size, cpusetp, cpusetp, gomp_cpusetp);
+      nonempty = gomp_cpuset_popcount (gomp_cpuset_size, cpusetp) != 0;
+#else
+      unsigned long k, max = gomp_cpuset_size / sizeof (cpusetp->__bits[0]);
+      for (k = 0; k < max; k++)
+	if ((cpusetp->__bits[k] &= gomp_cpusetp->__bits[k]) != 0)
+	  nonempty = true;
+#endif
+      if (nonempty)
+	gomp_places_list[j++] = gomp_places_list[i];
+    }
+
+  if (j == 0)
+    {
+      if (!quiet)
+	gomp_error ("None of the places contain usable logical CPUs");
+      return false;
+    }
+  else if (j < gomp_places_list_len)
+    {
+      if (!quiet)
+	gomp_error ("Number of places reduced from %ld to %ld because some "
+		    "places didn't contain any usable logical CPUs",
+		    gomp_places_list_len, j);
+      gomp_places_list_len = j;
+    }
+  return true;
+}
+
+bool
+gomp_affinity_init_level (int level, unsigned long count, bool quiet)
+{
+  unsigned long i, max = 8 * gomp_cpuset_size;
+
+  if (gomp_cpusetp)
+    {
+      unsigned long maxcount
+	= gomp_cpuset_popcount (gomp_cpuset_size, gomp_cpusetp);
+      if (count > maxcount)
+	count = maxcount;
+    }
+  gomp_places_list = gomp_affinity_alloc (count, quiet);
+  gomp_places_list_len = 0;
+  if (gomp_places_list == NULL)
+    return false;
+  /* SMT (threads).  */
+  if (level == 1)
+    {
+      for (i = 0; i < max && gomp_places_list_len < count; i++)
+	if (CPU_ISSET_S (i, gomp_cpuset_size, gomp_cpusetp))
 	  {
-	    cpus++;
-	    gomp_cpu_affinity[widx++] = idx;
+	    gomp_affinity_init_place (gomp_places_list[gomp_places_list_len]);
+	    gomp_affinity_add_cpus (gomp_places_list[gomp_places_list_len],
+				    i, 1, 0, true);
+	    ++gomp_places_list_len;
 	  }
+      return true;
     }
   else
-    for (widx = idx = 0; idx < gomp_cpu_affinity_len; idx++)
-      if (gomp_cpu_affinity[idx] < CPU_SETSIZE
-	  && CPU_ISSET (gomp_cpu_affinity[idx], &cpuset))
+    {
+      char name[sizeof ("/sys/devices/system/cpu/cpu/topology/"
+			"thread_siblings_list") + 3 * sizeof (unsigned long)];
+      size_t prefix_len = sizeof ("/sys/devices/system/cpu/cpu") - 1;
+      cpu_set_t *copy = gomp_alloca (gomp_cpuset_size);
+      FILE *f;
+      char *line = NULL;
+      size_t linelen = 0;
+
+      memcpy (name, "/sys/devices/system/cpu/cpu", prefix_len);
+      memcpy (copy, gomp_cpusetp, gomp_cpuset_size);
+      for (i = 0; i < max && gomp_places_list_len < count; i++)
+	if (CPU_ISSET_S (i, gomp_cpuset_size, copy))
+	  {
+	    sprintf (name + prefix_len, "%lu/topology/%s_siblings_list",
+		     i, level == 2 ? "thread" : "core");
+	    f = fopen (name, "r");
+	    if (f != NULL)
+	      {
+		if (getline (&line, &linelen, f) > 0)
+		  {
+		    char *p = line;
+		    bool seen_i = false;
+		    void *pl = gomp_places_list[gomp_places_list_len];
+		    gomp_affinity_init_place (pl);
+		    while (*p && *p != '\n')
+		      {
+			unsigned long first, last;
+			errno = 0;
+			first = strtoul (p, &p, 10);
+			if (errno)
+			  break;
+			last = first;
+			if (*p == '-')
+			  {
+			    errno = 0;
+			    last = strtoul (p + 1, &p, 10);
+			    if (errno || last < first)
+			      break;
+			  }
+			for (; first <= last; first++)
+			  if (CPU_ISSET_S (first, gomp_cpuset_size, copy)
+			      && gomp_affinity_add_cpus (pl, first, 1, 0,
+							 true))
+			    {
+			      CPU_CLR_S (first, gomp_cpuset_size, copy);
+			      if (first == i)
+				seen_i = true;
+			    }
+			if (*p == ',')
+			  ++p;
+		      }
+		    if (seen_i)
+		      gomp_places_list_len++;
+		  }
+		fclose (f);
+	      }
+	  }
+      if (gomp_places_list_len == 0)
 	{
-	  if (! CPU_ISSET (gomp_cpu_affinity[idx], &cpusetnew))
-	    {
-	      cpus++;
-	      CPU_SET (gomp_cpu_affinity[idx], &cpusetnew);
-	    }
-	  gomp_cpu_affinity[widx++] = gomp_cpu_affinity[idx];
+	  if (!quiet)
+	    gomp_error ("Error reading %s topology",
+			level == 2 ? "core" : "socket");
+	  free (gomp_places_list);
+	  gomp_places_list = NULL;
+	  return false;
 	}
-
-  if (widx == 0)
-    {
-      gomp_error ("no CPUs left for affinity setting");
-      free (gomp_cpu_affinity);
-      gomp_cpu_affinity = NULL;
-      gomp_cpu_affinity_len = 0;
-      return;
-    }
-
-  gomp_cpu_affinity_len = widx;
-  if (cpus < gomp_available_cpus)
-    gomp_available_cpus = cpus;
-  CPU_ZERO (&cpuset);
-  CPU_SET (gomp_cpu_affinity[0], &cpuset);
-  pthread_setaffinity_np (pthread_self (), sizeof (cpuset), &cpuset);
-  affinity_counter = 1;
+      return true;
+    }
+  return false;
 }
 
 void
-gomp_init_thread_affinity (pthread_attr_t *attr)
+gomp_affinity_print_place (void *p)
 {
-  unsigned int cpu;
-  cpu_set_t cpuset;
+  unsigned long i, max = 8 * gomp_cpuset_size, len;
+  cpu_set_t *cpusetp = (cpu_set_t *) p;
+  bool notfirst = false;
 
-  cpu = __atomic_fetch_add (&affinity_counter, 1, MEMMODEL_RELAXED);
-  cpu %= gomp_cpu_affinity_len;
-  CPU_ZERO (&cpuset);
-  CPU_SET (gomp_cpu_affinity[cpu], &cpuset);
-  pthread_attr_setaffinity_np (attr, sizeof (cpu_set_t), &cpuset);
+  for (i = 0, len = 0; i < max; i++)
+    if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp))
+      {
+	if (len == 0)
+	  {
+	    if (notfirst)
+	      fputc (',', stderr);
+	    notfirst = true;
+	    fprintf (stderr, "%lu", i);
+	  }
+	++len;
+      }
+    else
+      {
+	if (len > 1)
+	  fprintf (stderr, ":%lu", len);
+	len = 0;
+      }
+  if (len > 1)
+    fprintf (stderr, ":%lu", len);
 }
 
 #else
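
The topology files read above hold comma-separated CPU ranges such as
"0-3,8-11"; a standalone sketch of parsing that format the same way the
strtoul loop does (hypothetical helper, not libgomp code):

#include <errno.h>
#include <stdlib.h>

/* Parse a cpulist string like "0-3,8-11" and invoke CB for each CPU.  */
static void
for_each_cpu (const char *line, void (*cb) (unsigned long cpu))
{
  const char *p = line;
  while (*p && *p != '\n')
    {
      char *end;
      errno = 0;
      unsigned long first = strtoul (p, &end, 10);
      if (errno || end == p)
	break;
      unsigned long last = first;
      p = end;
      if (*p == '-')
	{
	  errno = 0;
	  last = strtoul (p + 1, &end, 10);
	  if (errno || last < first)
	    break;
	  p = end;
	}
      for (; first <= last; first++)
	cb (first);
      if (*p == ',')
	++p;
    }
}
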
--- libgomp/config/linux/bar.h	(revision 210461)
+++ libgomp/config/linux/bar.h	(revision 210462)
@@ -38,13 +38,25 @@ typedef struct
   unsigned total __attribute__((aligned (64)));
   unsigned generation;
   unsigned awaited __attribute__((aligned (64)));
+  unsigned awaited_final;
 } gomp_barrier_t;
+
 typedef unsigned int gomp_barrier_state_t;
 
+/* The generation field contains a counter in the high bits, with a few
+   low bits dedicated to flags.  Note that TASK_PENDING and WAS_LAST can
+   share space because WAS_LAST is never stored back to generation.  */
+#define BAR_TASK_PENDING	1
+#define BAR_WAS_LAST		1
+#define BAR_WAITING_FOR_TASK	2
+#define BAR_CANCELLED		4
+#define BAR_INCR		8
+
 static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
 {
   bar->total = count;
   bar->awaited = count;
+  bar->awaited_final = count;
   bar->generation = 0;
 }
 
@@ -62,27 +74,55 @@ extern void gomp_barrier_wait (gomp_barr
 extern void gomp_barrier_wait_last (gomp_barrier_t *);
 extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
 extern void gomp_team_barrier_wait (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_final (gomp_barrier_t *);
 extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
 					gomp_barrier_state_t);
+extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
+extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
+					       gomp_barrier_state_t);
 extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
+struct gomp_team;
+extern void gomp_team_barrier_cancel (struct gomp_team *);
 
 static inline gomp_barrier_state_t
 gomp_barrier_wait_start (gomp_barrier_t *bar)
 {
-  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) & ~3;
+  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+  ret &= -BAR_INCR | BAR_CANCELLED;
   /* A memory barrier is needed before exiting from the various forms
      of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section
      2.8.6 flush Construct, which says there is an implicit flush during
      a barrier region.  This is a convenient place to add the barrier,
      so we use MEMMODEL_ACQ_REL here rather than MEMMODEL_ACQUIRE.  */
-  ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0;
+  if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0)
+    ret |= BAR_WAS_LAST;
+  return ret;
+}
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
+{
+  return gomp_barrier_wait_start (bar);
+}
+
+/* This is like gomp_barrier_wait_start, except it decrements
+   bar->awaited_final rather than bar->awaited and should be used
+   for the gomp_team_end barrier only.  */
+static inline gomp_barrier_state_t
+gomp_barrier_wait_final_start (gomp_barrier_t *bar)
+{
+  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+  ret &= -BAR_INCR | BAR_CANCELLED;
+  /* See above gomp_barrier_wait_start comment.  */
+  if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_ACQ_REL) == 0)
+    ret |= BAR_WAS_LAST;
   return ret;
 }
 
 static inline bool
 gomp_barrier_last_thread (gomp_barrier_state_t state)
 {
-  return state & 1;
+  return state & BAR_WAS_LAST;
 }
 
 /* All the inlines below must be called with team->task_lock
@@ -91,31 +131,37 @@ gomp_barrier_last_thread (gomp_barrier_s
 static inline void
 gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
 {
-  bar->generation |= 1;
+  bar->generation |= BAR_TASK_PENDING;
 }
 
 static inline void
 gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
 {
-  bar->generation &= ~1;
+  bar->generation &= ~BAR_TASK_PENDING;
 }
 
 static inline void
 gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
 {
-  bar->generation |= 2;
+  bar->generation |= BAR_WAITING_FOR_TASK;
 }
 
 static inline bool
 gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
 {
-  return (bar->generation & 2) != 0;
+  return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
+}
+
+static inline bool
+gomp_team_barrier_cancelled (gomp_barrier_t *bar)
+{
+  return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
 }
 
 static inline void
 gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
 {
-  bar->generation = (state & ~3) + 4;
+  bar->generation = (state & -BAR_INCR) + BAR_INCR;
 }
 
 #endif /* GOMP_BARRIER_H */
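
The flag layout above packs a counter into the high bits of the
generation word, with the low three bits used as flags and the counter
advancing in steps of BAR_INCR; a tiny decoding sketch, purely
illustrative and not part of libgomp:

#define BAR_TASK_PENDING	1
#define BAR_WAITING_FOR_TASK	2
#define BAR_CANCELLED		4
#define BAR_INCR		8

static inline unsigned int
bar_counter (unsigned int generation)
{
  /* -BAR_INCR == ~(BAR_INCR - 1): mask off the flag bits.  */
  return generation & -BAR_INCR;
}

static inline int
bar_has_pending_tasks (unsigned int generation)
{
  return (generation & BAR_TASK_PENDING) != 0;
}
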
--- libgomp/config/posix/bar.c	(revision 210461)
+++ libgomp/config/posix/bar.c	(revision 210462)
@@ -42,6 +42,7 @@ gomp_barrier_init (gomp_barrier_t *bar,
   bar->total = count;
   bar->arrived = 0;
   bar->generation = 0;
+  bar->cancellable = false;
 }
 
 void
@@ -72,7 +73,7 @@ gomp_barrier_wait_end (gomp_barrier_t *b
 {
   unsigned int n;
 
-  if (state & 1)
+  if (state & BAR_WAS_LAST)
     {
       n = --bar->arrived;
       if (n > 0)
@@ -113,12 +114,14 @@ gomp_team_barrier_wait_end (gomp_barrier
 {
   unsigned int n;
 
-  if (state & 1)
+  state &= ~BAR_CANCELLED;
+  if (state & BAR_WAS_LAST)
     {
       n = --bar->arrived;
       struct gomp_thread *thr = gomp_thread ();
       struct gomp_team *team = thr->ts.team;
 
+      team->work_share_cancelled = 0;
       if (team->task_count)
 	{
 	  gomp_barrier_handle_tasks (state);
@@ -128,7 +131,7 @@ gomp_team_barrier_wait_end (gomp_barrier
 	  return;
 	}
 
-      bar->generation = state + 3;
+      bar->generation = state + BAR_INCR - BAR_WAS_LAST;
       if (n > 0)
 	{
 	  do
@@ -141,13 +144,18 @@ gomp_team_barrier_wait_end (gomp_barrier
   else
     {
       gomp_mutex_unlock (&bar->mutex1);
+      int gen;
       do
 	{
 	  gomp_sem_wait (&bar->sem1);
-	  if (bar->generation & 1)
-	    gomp_barrier_handle_tasks (state);
+	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	  if (gen & BAR_TASK_PENDING)
+	    {
+	      gomp_barrier_handle_tasks (state);
+	      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	    }
 	}
-      while (bar->generation != state + 4);
+      while (gen != state + BAR_INCR);
 
 #ifdef HAVE_SYNC_BUILTINS
       n = __sync_add_and_fetch (&bar->arrived, -1);
@@ -162,6 +170,81 @@ gomp_team_barrier_wait_end (gomp_barrier
     }
 }
 
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+				   gomp_barrier_state_t state)
+{
+  unsigned int n;
+
+  if (state & BAR_WAS_LAST)
+    {
+      bar->cancellable = false;
+      n = --bar->arrived;
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      team->work_share_cancelled = 0;
+      if (team->task_count)
+	{
+	  gomp_barrier_handle_tasks (state);
+	  if (n > 0)
+	    gomp_sem_wait (&bar->sem2);
+	  gomp_mutex_unlock (&bar->mutex1);
+	  return false;
+	}
+
+      bar->generation = state + BAR_INCR - BAR_WAS_LAST;
+      if (n > 0)
+	{
+	  do
+	    gomp_sem_post (&bar->sem1);
+	  while (--n != 0);
+	  gomp_sem_wait (&bar->sem2);
+	}
+      gomp_mutex_unlock (&bar->mutex1);
+    }
+  else
+    {
+      if (state & BAR_CANCELLED)
+	{
+	  gomp_mutex_unlock (&bar->mutex1);
+	  return true;
+	}
+      bar->cancellable = true;
+      gomp_mutex_unlock (&bar->mutex1);
+      int gen;
+      do
+	{
+	  gomp_sem_wait (&bar->sem1);
+	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	  if (gen & BAR_CANCELLED)
+	    break;
+	  if (gen & BAR_TASK_PENDING)
+	    {
+	      gomp_barrier_handle_tasks (state);
+	      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	      if (gen & BAR_CANCELLED)
+		break;
+	    }
+	}
+      while (gen != state + BAR_INCR);
+
+#ifdef HAVE_SYNC_BUILTINS
+      n = __sync_add_and_fetch (&bar->arrived, -1);
+#else
+      gomp_mutex_lock (&bar->mutex2);
+      n = --bar->arrived;
+      gomp_mutex_unlock (&bar->mutex2);
+#endif
+
+      if (n == 0)
+	gomp_sem_post (&bar->sem2);
+      if (gen & BAR_CANCELLED)
+	return true;
+    }
+  return false;
+}
+
 void
 gomp_team_barrier_wait (gomp_barrier_t *barrier)
 {
@@ -176,3 +259,40 @@ gomp_team_barrier_wake (gomp_barrier_t *
   while (count-- > 0)
     gomp_sem_post (&bar->sem1);
 }
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_cancel_start (bar);
+  return gomp_team_barrier_wait_cancel_end (bar, state);
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+  if (team->barrier.generation & BAR_CANCELLED)
+    return;
+  gomp_mutex_lock (&team->barrier.mutex1);
+  gomp_mutex_lock (&team->task_lock);
+  if (team->barrier.generation & BAR_CANCELLED)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      gomp_mutex_unlock (&team->barrier.mutex1);
+      return;
+    }
+  team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
+  if (team->barrier.cancellable)
+    {
+      int n = team->barrier.arrived;
+      if (n > 0)
+	{
+	  do
+	    gomp_sem_post (&team->barrier.sem1);
+	  while (--n != 0);
+	  gomp_sem_wait (&team->barrier.sem2);
+	}
+      team->barrier.cancellable = false;
+    }
+  gomp_mutex_unlock (&team->barrier.mutex1);
+}
--- libgomp/config/posix/affinity.c	(revision 210461)
+++ libgomp/config/posix/affinity.c	(revision 210462)
@@ -32,7 +32,84 @@ gomp_init_affinity (void)
 }
 
 void
-gomp_init_thread_affinity (pthread_attr_t *attr)
+gomp_init_thread_affinity (pthread_attr_t *attr, unsigned int place)
 {
   (void) attr;
+  (void) place;
+}
+
+void **
+gomp_affinity_alloc (unsigned long count, bool quiet)
+{
+  (void) count;
+  if (!quiet)
+    gomp_error ("Affinity not supported on this configuration");
+  return NULL;
+}
+
+void
+gomp_affinity_init_place (void *p)
+{
+  (void) p;
+}
+
+bool
+gomp_affinity_add_cpus (void *p, unsigned long num,
+			unsigned long len, long stride, bool quiet)
+{
+  (void) p;
+  (void) num;
+  (void) len;
+  (void) stride;
+  (void) quiet;
+  return false;
+}
+
+bool
+gomp_affinity_remove_cpu (void *p, unsigned long num)
+{
+  (void) p;
+  (void) num;
+  return false;
+}
+
+bool
+gomp_affinity_copy_place (void *p, void *q, long stride)
+{
+  (void) p;
+  (void) q;
+  (void) stride;
+  return false;
+}
+
+bool
+gomp_affinity_same_place (void *p, void *q)
+{
+  (void) p;
+  (void) q;
+  return false;
+}
+
+bool
+gomp_affinity_finalize_place_list (bool quiet)
+{
+  (void) quiet;
+  return false;
+}
+
+bool
+gomp_affinity_init_level (int level, unsigned long count, bool quiet)
+{
+  (void) level;
+  (void) count;
+  (void) quiet;
+  if (!quiet)
+    gomp_error ("Affinity not supported on this configuration");
+  return false;
+}
+
+void
+gomp_affinity_print_place (void *p)
+{
+  (void) p;
 }
--- libgomp/config/posix/bar.h	(revision 210461)
+++ libgomp/config/posix/bar.h	(revision 210462)
@@ -43,9 +43,20 @@ typedef struct
   unsigned total;
   unsigned arrived;
   unsigned generation;
+  bool cancellable;
 } gomp_barrier_t;
+
 typedef unsigned int gomp_barrier_state_t;
 
+/* The generation field contains a counter in the high bits, with a few
+   low bits dedicated to flags.  Note that TASK_PENDING and WAS_LAST can
+   share space because WAS_LAST is never stored back to generation.  */
+#define BAR_TASK_PENDING	1
+#define BAR_WAS_LAST		1
+#define BAR_WAITING_FOR_TASK	2
+#define BAR_CANCELLED		4
+#define BAR_INCR		8
+
 extern void gomp_barrier_init (gomp_barrier_t *, unsigned);
 extern void gomp_barrier_reinit (gomp_barrier_t *, unsigned);
 extern void gomp_barrier_destroy (gomp_barrier_t *);
@@ -55,22 +66,47 @@ extern void gomp_barrier_wait_end (gomp_
 extern void gomp_team_barrier_wait (gomp_barrier_t *);
 extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
 					gomp_barrier_state_t);
+extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
+extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
+					       gomp_barrier_state_t);
 extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
+struct gomp_team;
+extern void gomp_team_barrier_cancel (struct gomp_team *);
 
 static inline gomp_barrier_state_t
 gomp_barrier_wait_start (gomp_barrier_t *bar)
 {
   unsigned int ret;
   gomp_mutex_lock (&bar->mutex1);
-  ret = bar->generation & ~3;
-  ret += ++bar->arrived == bar->total;
+  ret = bar->generation & (-BAR_INCR | BAR_CANCELLED);
+  if (++bar->arrived == bar->total)
+    ret |= BAR_WAS_LAST;
+  return ret;
+}
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
+{
+  unsigned int ret;
+  gomp_mutex_lock (&bar->mutex1);
+  ret = bar->generation & (-BAR_INCR | BAR_CANCELLED);
+  if (ret & BAR_CANCELLED)
+    return ret;
+  if (++bar->arrived == bar->total)
+    ret |= BAR_WAS_LAST;
   return ret;
 }
 
+static inline void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_team_barrier_wait (bar);
+}
+
 static inline bool
 gomp_barrier_last_thread (gomp_barrier_state_t state)
 {
-  return state & 1;
+  return state & BAR_WAS_LAST;
 }
 
 static inline void
@@ -85,31 +121,37 @@ gomp_barrier_wait_last (gomp_barrier_t *
 static inline void
 gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
 {
-  bar->generation |= 1;
+  bar->generation |= BAR_TASK_PENDING;
 }
 
 static inline void
 gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
 {
-  bar->generation &= ~1;
+  bar->generation &= ~BAR_TASK_PENDING;
 }
 
 static inline void
 gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
 {
-  bar->generation |= 2;
+  bar->generation |= BAR_WAITING_FOR_TASK;
 }
 
 static inline bool
 gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
 {
-  return (bar->generation & 2) != 0;
+  return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
+}
+
+static inline bool
+gomp_team_barrier_cancelled (gomp_barrier_t *bar)
+{
+  return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
 }
 
 static inline void
 gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
 {
-  bar->generation = (state & ~3) + 4;
+  bar->generation = (state & -BAR_INCR) + BAR_INCR;
 }
 
 #endif /* GOMP_BARRIER_H */
--- libgomp/barrier.c	(revision 210461)
+++ libgomp/barrier.c	(revision 210462)
@@ -39,3 +39,15 @@ GOMP_barrier (void)
 
   gomp_team_barrier_wait (&team->barrier);
 }
+
+bool
+GOMP_barrier_cancel (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+
+  /* The compiler emits a call to GOMP_barrier_cancel when it sees that
+     the barrier is within a construct that can cancel.  Thus we should
+     never have an orphaned cancellable barrier.  */
+  return gomp_team_barrier_wait_cancel (&team->barrier);
+}
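
On the user side a cancellable barrier is simply a barrier inside a
construct that can be cancelled; a sketch of such code, assuming the
lowering described in the comment above (cancellation also has to be
enabled with OMP_CANCELLATION=true at run time):

#include <stdbool.h>

void
work (bool *failed)
{
  #pragma omp parallel
  {
    if (*failed)
      {
	#pragma omp cancel parallel
      }
    /* Inside a cancellable region this barrier is lowered to
       GOMP_barrier_cancel (); a true return means the thread leaves
       the parallel region early.  */
    #pragma omp barrier
    /* ... work after the barrier, skipped when cancelled ... */
  }
}
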
--- libgomp/target.c	(revision 0)
+++ libgomp/target.c	(revision 210462)
@@ -0,0 +1,96 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+   Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+   This file is part of the GNU OpenMP Library (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file handles the target, target data, target update and teams
+   constructs; for now everything falls back to host execution.  */
+
+#include "libgomp.h"
+#include <limits.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+attribute_hidden int
+gomp_get_num_devices (void)
+{
+  return 0;
+}
+
+/* Called when encountering a target directive.  If DEVICE
+   is -1, it means use the device-var ICV.  If it is -2 (or any other value
+   larger than the last available hw device), use host fallback.
+   FN is the address of the host code, OPENMP_TARGET contains the value of
+   the __OPENMP_TARGET__ symbol in the shared library or binary that invokes
+   GOMP_target.  HOSTADDRS, SIZES and KINDS are arrays
+   with MAPNUM entries, holding the addresses of the host objects, the
+   sizes of the host objects (resp., for the pointer kind, the pointer bias
+   and an assumed sizeof (void *) size) and the mapping kinds.  */
+
+void
+GOMP_target (int device, void (*fn) (void *), const void *openmp_target,
+	     size_t mapnum, void **hostaddrs, size_t *sizes,
+	     unsigned char *kinds)
+{
+  /* Host fallback.  */
+  struct gomp_thread old_thr, *thr = gomp_thread ();
+  old_thr = *thr;
+  memset (thr, '\0', sizeof (*thr));
+  if (gomp_places_list)
+    {
+      thr->place = old_thr.place;
+      thr->ts.place_partition_len = gomp_places_list_len;
+    }
+  fn (hostaddrs);
+  gomp_free_thread (thr);
+  *thr = old_thr;
+}
+
+void
+GOMP_target_data (int device, const void *openmp_target, size_t mapnum,
+		  void **hostaddrs, size_t *sizes, unsigned char *kinds)
+{
+}
+
+void
+GOMP_target_end_data (void)
+{
+}
+
+void
+GOMP_target_update (int device, const void *openmp_target, size_t mapnum,
+		    void **hostaddrs, size_t *sizes, unsigned char *kinds)
+{
+}
+
+void
+GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
+{
+  if (thread_limit)
+    {
+      struct gomp_task_icv *icv = gomp_icv (true);
+      icv->thread_limit_var
+	= thread_limit > INT_MAX ? UINT_MAX : thread_limit;
+    }
+  (void) num_teams;
+}
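
All of the entry points above fall back to host execution for now.  A hedged
sketch of the calling convention the host fallback in GOMP_target relies on:
the outlined region body receives the array of host addresses.  The names
tgt_body, host_fallback_example and the mapped variable x are invented for
this example, and the kind value is a placeholder:

  #include <stddef.h>

  /* Prototype matching the definition of GOMP_target above.  */
  extern void GOMP_target (int, void (*) (void *), const void *,
			   size_t, void **, size_t *, unsigned char *);

  /* Outlined body of a hypothetical "#pragma omp target map(tofrom: x)"
     region; GOMP_target's host fallback calls FN with HOSTADDRS.  */
  static void
  tgt_body (void *data)
  {
    void **hostaddrs = (void **) data;
    int *x = (int *) hostaddrs[0];	/* the single mapped object */
    *x += 1;
  }

  static void
  host_fallback_example (int *x)
  {
    void *hostaddrs[1] = { x };
    size_t sizes[1] = { sizeof (int) };
    unsigned char kinds[1] = { 0 };	/* placeholder mapping kind */
    /* DEVICE == -1 selects the device-var ICV; with no devices this
       takes the host fallback path in GOMP_target above.  */
    GOMP_target (-1, tgt_body, NULL, 1, hostaddrs, sizes, kinds);
  }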
--- libgomp/parallel.c	(revision 210461)
+++ libgomp/parallel.c	(revision 210462)
@@ -37,18 +37,19 @@
 unsigned
 gomp_resolve_num_threads (unsigned specified, unsigned count)
 {
-  struct gomp_thread *thread = gomp_thread();
+  struct gomp_thread *thr = gomp_thread ();
   struct gomp_task_icv *icv;
   unsigned threads_requested, max_num_threads, num_threads;
-  unsigned long remaining;
+  unsigned long busy;
+  struct gomp_thread_pool *pool;
 
   icv = gomp_icv (false);
 
   if (specified == 1)
     return 1;
-  else if (thread->ts.active_level >= 1 && !icv->nest_var)
+  else if (thr->ts.active_level >= 1 && !icv->nest_var)
     return 1;
-  else if (thread->ts.active_level >= gomp_max_active_levels_var)
+  else if (thr->ts.active_level >= gomp_max_active_levels_var)
     return 1;
 
   /* If NUM_THREADS not specified, use nthreads_var.  */
@@ -72,30 +73,46 @@ gomp_resolve_num_threads (unsigned speci
 	max_num_threads = count;
     }
 
-  /* ULONG_MAX stands for infinity.  */
-  if (__builtin_expect (gomp_thread_limit_var == ULONG_MAX, 1)
+  /* UINT_MAX stands for infinity.  */
+  if (__builtin_expect (icv->thread_limit_var == UINT_MAX, 1)
       || max_num_threads == 1)
     return max_num_threads;
 
+  /* The threads_busy counter lives in thread_pool; if there
+     isn't a thread_pool yet, there must be just one thread
+     in the contention group.  If thr->ts.team is NULL, this isn't
+     a nested parallel, so again there is just one thread in the
+     contention group and no need to handle it atomically.  */
+  pool = thr->thread_pool;
+  if (thr->ts.team == NULL)
+    {
+      num_threads = max_num_threads;
+      if (num_threads > icv->thread_limit_var)
+	num_threads = icv->thread_limit_var;
+      if (pool)
+	pool->threads_busy = num_threads;
+      return num_threads;
+    }
+
 #ifdef HAVE_SYNC_BUILTINS
   do
     {
-      remaining = gomp_remaining_threads_count;
+      busy = pool->threads_busy;
       num_threads = max_num_threads;
-      if (num_threads > remaining)
-	num_threads = remaining + 1;
+      if (icv->thread_limit_var - busy + 1 < num_threads)
+	num_threads = icv->thread_limit_var - busy + 1;
     }
-  while (__sync_val_compare_and_swap (&gomp_remaining_threads_count,
-				      remaining, remaining - num_threads + 1)
-	 != remaining);
+  while (__sync_val_compare_and_swap (&pool->threads_busy,
+				      busy, busy + num_threads - 1)
+	 != busy);
 #else
-  gomp_mutex_lock (&gomp_remaining_threads_lock);
+  gomp_mutex_lock (&gomp_managed_threads_lock);
   num_threads = max_num_threads;
-  remaining = gomp_remaining_threads_count;
-  if (num_threads > remaining)
-    num_threads = remaining + 1;
-  gomp_remaining_threads_count -= num_threads - 1;
-  gomp_mutex_unlock (&gomp_remaining_threads_lock);
+  busy = pool->threads_busy;
+  if (icv->thread_limit_var - busy + 1 < num_threads)
+    num_threads = icv->thread_limit_var - busy + 1;
+  pool->threads_busy += num_threads - 1;
+  gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
 
   return num_threads;
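
The threads_busy accounting caps a nested parallel at thread_limit_var for
the whole contention group; the requesting thread is already counted in
threads_busy, hence the "+ 1".  A small standalone sketch of the clamp with
made-up example numbers:

  /* Example: thread_limit_var = 8, threads_busy = 5, max_num_threads = 6.
     At most 8 - 5 + 1 = 4 threads may run the region (3 new ones plus the
     requesting thread); threads_busy then grows by num_threads - 1 = 3,
     reaching the limit of 8.  */
  static unsigned
  clamp_to_thread_limit (unsigned max_num_threads, unsigned long busy,
			 unsigned thread_limit_var)
  {
    unsigned num_threads = max_num_threads;
    if (thread_limit_var - busy + 1 < num_threads)
      num_threads = thread_limit_var - busy + 1;
    return num_threads;		/* 4 for the values above */
  }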
@@ -105,13 +122,14 @@ void
 GOMP_parallel_start (void (*fn) (void *), void *data, unsigned num_threads)
 {
   num_threads = gomp_resolve_num_threads (num_threads, 0);
-  gomp_team_start (fn, data, num_threads, gomp_new_team (num_threads));
+  gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads));
 }
 
 void
 GOMP_parallel_end (void)
 {
-  if (__builtin_expect (gomp_thread_limit_var != ULONG_MAX, 0))
+  struct gomp_task_icv *icv = gomp_icv (false);
+  if (__builtin_expect (icv->thread_limit_var != UINT_MAX, 0))
     {
       struct gomp_thread *thr = gomp_thread ();
       struct gomp_team *team = thr->ts.team;
@@ -119,20 +137,98 @@ GOMP_parallel_end (void)
       gomp_team_end ();
       if (nthreads > 1)
 	{
+	  /* If not nested, there is just one thread in the
+	     contention group left, no need for atomicity.  */
+	  if (thr->ts.team == NULL)
+	    thr->thread_pool->threads_busy = 1;
+	  else
+	    {
 #ifdef HAVE_SYNC_BUILTINS
-	  __sync_fetch_and_add (&gomp_remaining_threads_count,
-				nthreads - 1);
+	      __sync_fetch_and_add (&thr->thread_pool->threads_busy,
+				    1UL - nthreads);
 #else
-	  gomp_mutex_lock (&gomp_remaining_threads_lock);
-	  gomp_remaining_threads_count += nthreads - 1;
-	  gomp_mutex_unlock (&gomp_remaining_threads_lock);
+	      gomp_mutex_lock (&gomp_managed_threads_lock);
+	      thr->thread_pool->threads_busy -= nthreads - 1;
+	      gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
+	    }
 	}
     }
   else
     gomp_team_end ();
 }
+ialias (GOMP_parallel_end)
+
+void
+GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, unsigned int flags)
+{
+  num_threads = gomp_resolve_num_threads (num_threads, 0);
+  gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads));
+  fn (data);
+  ialias_call (GOMP_parallel_end) ();
+}
+
+bool
+GOMP_cancellation_point (int which)
+{
+  if (!gomp_cancel_var)
+    return false;
 
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  if (which & (GOMP_CANCEL_LOOP | GOMP_CANCEL_SECTIONS))
+    {
+      if (team == NULL)
+	return false;
+      return team->work_share_cancelled != 0;
+    }
+  else if (which & GOMP_CANCEL_TASKGROUP)
+    {
+      if (thr->task->taskgroup && thr->task->taskgroup->cancelled)
+	return true;
+      /* FALLTHRU into the GOMP_CANCEL_PARALLEL case,
+	 as #pragma omp cancel parallel also cancels all explicit
+	 tasks.  */
+    }
+  if (team)
+    return gomp_team_barrier_cancelled (&team->barrier);
+  return false;
+}
+ialias (GOMP_cancellation_point)
+
+bool
+GOMP_cancel (int which, bool do_cancel)
+{
+  if (!gomp_cancel_var)
+    return false;
+
+  if (!do_cancel)
+    return ialias_call (GOMP_cancellation_point) (which);
+
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  if (which & (GOMP_CANCEL_LOOP | GOMP_CANCEL_SECTIONS))
+    {
+      /* In an orphaned worksharing region, all we want to cancel
+	 is the current thread.  */
+      if (team != NULL)
+	team->work_share_cancelled = 1;
+      return true;
+    }
+  else if (which & GOMP_CANCEL_TASKGROUP)
+    {
+      if (thr->task->taskgroup && !thr->task->taskgroup->cancelled)
+	{
+	  gomp_mutex_lock (&team->task_lock);
+	  thr->task->taskgroup->cancelled = true;
+	  gomp_mutex_unlock (&team->task_lock);
+	}
+      return true;
+    }
+  team->team_cancelled = 1;
+  gomp_team_barrier_cancel (team);
+  return true;
+}
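
GOMP_cancel implements #pragma omp cancel (do_cancel true), falling back to
GOMP_cancellation_point when the construct's if clause evaluates to false;
both only have an effect at run time when cancellation is enabled
(OMP_CANCELLATION=true), matching the gomp_cancel_var checks above.  A hedged
source-level example exercising the GOMP_CANCEL_TASKGROUP path; the mapping
noted in the comments is illustrative, not a statement of the exact calls the
compiler generates:

  void
  search (int *data, int n)
  {
  #pragma omp parallel
  #pragma omp single
  #pragma omp taskgroup
    {
      int i;
      for (i = 0; i < n; i++)
  #pragma omp task firstprivate (i)
	{
	  if (data[i] == 42)
	    {
  #pragma omp cancel taskgroup	/* roughly GOMP_cancel (GOMP_CANCEL_TASKGROUP, true) */
	    }
  #pragma omp cancellation point taskgroup	/* roughly GOMP_cancellation_point */
	  /* ... per-task work, skipped once the taskgroup is cancelled ... */
	}
    }
  }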
 
 /* The public OpenMP API for thread and team related inquiries.  */