commit 374d37118ae1274077a425261ef1428151eb6d7c
Author: Sultan Alsawaf <sultan@openresty.com>
Date:   Tue Nov 10 10:03:34 2020 -0800

    stp_utrace: disable IRQs when holding the bucket spin lock

    This lock can be acquired from inside an IRQ, leading to a deadlock:

    WARNING: inconsistent lock state
    4.14.35-1902.6.6.el7uek.x86_64.debug #2 Tainted: G           OE
    --------------------------------
    inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
    sh/15779 [HC1[1]:SC0[0]:HE0:SE1] takes:
     (&(lock)->rlock#3){?.+.}, at: [<ffffffffc0c080b0>] _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
    {HARDIRQ-ON-W} state was registered at:
      lock_acquire+0xe0/0x238
      _raw_spin_lock+0x3d/0x7a
      utrace_task_alloc+0xa4/0xe3 [orxray_lj_lua_fgraph_XXXXXXX]
      utrace_attach_task+0x136/0x194 [orxray_lj_lua_fgraph_XXXXXXX]
      __stp_utrace_attach+0x57/0x216 [orxray_lj_lua_fgraph_XXXXXXX]
      stap_start_task_finder+0x12e/0x33f [orxray_lj_lua_fgraph_XXXXXXX]
      systemtap_module_init+0x114d/0x11f0 [orxray_lj_lua_fgraph_XXXXXXX]
      _stp_handle_start+0xea/0x1c5 [orxray_lj_lua_fgraph_XXXXXXX]
      _stp_ctl_write_cmd+0x28d/0x2d1 [orxray_lj_lua_fgraph_XXXXXXX]
      full_proxy_write+0x67/0xbb
      __vfs_write+0x3a/0x170
      vfs_write+0xc7/0x1c0
      SyS_write+0x58/0xbf
      do_syscall_64+0x7e/0x22c
      entry_SYSCALL_64_after_hwframe+0x16e/0x0
    irq event stamp: 9454
    hardirqs last  enabled at (9453): [<ffffffffa696c960>] _raw_write_unlock_irqrestore+0x40/0x67
    hardirqs last disabled at (9454): [<ffffffffa6a05417>] apic_timer_interrupt+0x1c7/0x1d1
    softirqs last  enabled at (9202): [<ffffffffa6c00361>] __do_softirq+0x361/0x4e5
    softirqs last disabled at (9195): [<ffffffffa60aeb76>] irq_exit+0xf6/0x102

    other info that might help us debug this:
     Possible unsafe locking scenario:

           CPU0
           ----
      lock(&(lock)->rlock#3);
      <Interrupt>
        lock(&(lock)->rlock#3);

     *** DEADLOCK ***

    no locks held by sh/15779.

    stack backtrace:
    CPU: 16 PID: 15779 Comm: sh Tainted: G           OE   4.14.35-1902.6.6.el7uek.x86_64.debug #2
    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
    Call Trace:
     <IRQ>
     dump_stack+0x81/0xb6
     print_usage_bug+0x1fc/0x20d
     ? check_usage_backwards+0x130/0x12b
     mark_lock+0x1f8/0x27b
     __lock_acquire+0x6e7/0x165a
     ? sched_clock_local+0x18/0x81
     ? perf_swevent_hrtimer+0x136/0x151
     lock_acquire+0xe0/0x238
     ? _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
     _raw_spin_lock_irqsave+0x55/0x97
     ? _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_ctl_get_buffer+0x69/0x215 [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_ctl_send+0x4e/0x169 [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_vlog+0xac/0x143 [orxray_lj_lua_fgraph_XXXXXXX]
     ? _stp_utrace_probe_cb+0xa4/0xa4 [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_warn+0x6a/0x88 [orxray_lj_lua_fgraph_XXXXXXX]
     function___global_warn__overload_0+0x60/0xac [orxray_lj_lua_fgraph_XXXXXXX]
     probe_67+0xce/0x10e [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_hrtimer_notify_function+0x2db/0x55f [orxray_lj_lua_fgraph_XXXXXXX]
     __hrtimer_run_queues+0x132/0x5c5
     hrtimer_interrupt+0xb7/0x1ca
     smp_apic_timer_interrupt+0xa5/0x35a
     apic_timer_interrupt+0x1cc/0x1d1
     </IRQ>

diff --git a/runtime/stp_utrace.c b/runtime/stp_utrace.c
index e2880f1e4..46ba48923 100644
--- a/runtime/stp_utrace.c
+++ b/runtime/stp_utrace.c
@@ -490,9 +490,9 @@ static int utrace_exit(void)
 		rcu_read_lock();
 		stap_hlist_for_each_entry_rcu(utrace, node, &bucket->head, hlist) {
 			utrace->freed = true;
-			stp_spin_lock(&bucket->lock);
+			stp_spin_lock_irqsave(&bucket->lock, flags);
 			hlist_del_rcu(&utrace->hlist);
-			stp_spin_unlock(&bucket->lock);
+			stp_spin_unlock_irqrestore(&bucket->lock, flags);
 
 			utrace_cleanup(utrace);
 		}
@@ -724,6 +724,7 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket,
 					struct task_struct *task)
 {
 	struct utrace *utrace;
+	unsigned long flags;
 
 	utrace = kmem_cache_zalloc(utrace_cachep, STP_ALLOC_FLAGS);
 	if (unlikely(!utrace))
@@ -739,9 +740,9 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket,
 	atomic_set(&utrace->resume_work_added, 0);
 	atomic_set(&utrace->report_work_added, 0);
 
-	stp_spin_lock(&bucket->lock);
+	stp_spin_lock_irqsave(&bucket->lock, flags);
 	hlist_add_head_rcu(&utrace->hlist, &bucket->head);
-	stp_spin_unlock(&bucket->lock);
+	stp_spin_unlock_irqrestore(&bucket->lock, flags);
 	return utrace;
 }
 
@@ -768,15 +769,17 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket,
  */
 static void utrace_free(struct utrace_bucket *bucket, struct utrace *utrace)
 {
+	unsigned long flags;
+
 	if (unlikely(!utrace))
 		return;
 
 	/* Remove this utrace from the mapping list of tasks to
 	 * struct utrace. */
 	utrace->freed = true;
-	stp_spin_lock(&bucket->lock);
+	stp_spin_lock_irqsave(&bucket->lock, flags);
 	hlist_del_rcu(&utrace->hlist);
-	stp_spin_unlock(&bucket->lock);
+	stp_spin_unlock_irqrestore(&bucket->lock, flags);
 
 	/* Free the utrace struct. */
 #ifdef STP_TF_DEBUG
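The change above follows the usual kernel idiom for a lock that is also taken from interrupt context. As a minimal, stand-alone sketch of that idiom (illustrative only: generic spinlock API and made-up names, not the stp_spin_* wrappers or SystemTap code used in the diff):

    #include <linux/spinlock.h>
    #include <linux/rculist.h>

    static DEFINE_SPINLOCK(bucket_lock);

    /* Called from both task context and hard-IRQ context, so the lock must be
     * taken with the _irqsave variant; a plain spin_lock() here would let an
     * interrupt on the same CPU re-enter and spin on the held lock forever. */
    static void bucket_add(struct hlist_head *head, struct hlist_node *node)
    {
    	unsigned long flags;

    	spin_lock_irqsave(&bucket_lock, flags);
    	hlist_add_head_rcu(node, head);
    	spin_unlock_irqrestore(&bucket_lock, flags);
    }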
commit 6a092f5ae824d4ce972c10b8681b9272e2fd67f3
Author: Sultan Alsawaf <sultan@openresty.com>
Date:   Tue Nov 17 11:03:53 2020 -0800

    task_finder: call _stp_vma_done() upon error to fix memory leak

    The memory allocated inside stap_initialize_vma_map() is not freed upon
    error when the task finder is started because a call to _stp_vma_done()
    in the error path is missing. Add it to fix the leak.

diff --git a/task_finder.cxx b/task_finder.cxx
index d08d44a75..7c45e728b 100644
--- a/task_finder.cxx
+++ b/task_finder.cxx
@@ -66,6 +66,7 @@ task_finder_derived_probe_group::emit_module_init (systemtap_session& s)
 
   s.op->newline() << "if (rc) {";
   s.op->newline(1) << "stap_stop_task_finder();";
+  s.op->newline() << "_stp_vma_done();";
   s.op->newline(-1) << "}";
   s.op->newline(-1) << "}";
 }
commit 3c4f82ca024df4f8e213f7c77418493262d4a4d7
Author: Sultan Alsawaf <sultan@openresty.com>
Date:   Tue Nov 24 10:50:10 2020 -0800

    runtime_context: factor out RCU usage using a rw lock

    We can factor out the RCU insanity in here by just adding in a rw lock
    and using that to synchronize _stp_runtime_contexts_free() with any code
    that has the runtime context held.

diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
index 41fecba81..18566957a 100644
--- a/runtime/linux/runtime_context.h
+++ b/runtime/linux/runtime_context.h
@@ -11,15 +11,14 @@
 #ifndef _LINUX_RUNTIME_CONTEXT_H_
 #define _LINUX_RUNTIME_CONTEXT_H_
 
-#ifndef __rcu
-#define __rcu
-#endif
-
-static struct context __rcu *contexts[NR_CPUS] = { NULL };
+/* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */
+static DEFINE_RWLOCK(_stp_context_lock);
+static DEFINE_PER_CPU(struct context *, contexts);
+static atomic_t _stp_context_stop = ATOMIC_INIT(0);
 
 static int _stp_runtime_contexts_alloc(void)
 {
-	int cpu;
+	unsigned int cpu;
 
 	for_each_possible_cpu(cpu) {
 		/* Module init, so in user context, safe to use
@@ -31,91 +30,67 @@ static int _stp_runtime_contexts_alloc(void)
 				    (unsigned long) sizeof (struct context));
 			return -ENOMEM;
 		}
-		rcu_assign_pointer(contexts[cpu], c);
+		per_cpu(contexts, cpu) = c;
 	}
 	return 0;
 }
 
 /* We should be free of all probes by this time, but for example the timer for
  * _stp_ctl_work_callback may still be running and looking for contexts.  We
- * use RCU-sched synchronization to be sure its safe to free them.  */
+ * use _stp_context_stop and a write lock to be sure its safe to free them.  */
 static void _stp_runtime_contexts_free(void)
 {
-	// Note that 'free_contexts' is static because it is
-	// (probably) too big to fit on a kernel function's stack.
-	static struct context *free_contexts[NR_CPUS] = { NULL };
-	int cpu;
+	unsigned long flags;
+	unsigned int cpu;
 
-	/* First, save all the pointers.  */
-	rcu_read_lock_sched();
-	for_each_possible_cpu(cpu) {
-		free_contexts[cpu] = rcu_dereference_sched(contexts[cpu]);
-	}
-	rcu_read_unlock_sched();
+	/* Sync to make sure existing readers are done */
+	atomic_set(&_stp_context_stop, 1);
+	write_lock_irqsave(&_stp_context_lock, flags);
+	write_unlock_irqrestore(&_stp_context_lock, flags);
 
-	/* Now clear all pointers to prevent new readers.  */
-	for_each_possible_cpu(cpu) {
-		rcu_assign_pointer(contexts[cpu], NULL);
-	}
-
-	/* Sync to make sure existing readers are done.  */
-	stp_synchronize_sched();
-
-	/* Now we can actually free the contexts.  */
-	for_each_possible_cpu(cpu) {
-		struct context *c = free_contexts[cpu];
-		if (c != NULL) {
-			free_contexts[cpu] = NULL;
-			_stp_vfree(c);
-		}
-	}
+	/* Now we can actually free the contexts */
+	for_each_possible_cpu(cpu)
+		_stp_vfree(per_cpu(contexts, cpu));
 }
 
 static inline struct context * _stp_runtime_get_context(void)
 {
-	// RHBZ1788662 rcu operations are rejected in idle-cpu contexts
-	// in effect: skip probe if it's in rcu-idle state
-#if defined(STAPCONF_RCU_IS_WATCHING) || LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) // linux commit #5c173eb8
-        if (! rcu_is_watching())
-		return 0;
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0) // linux commit #9b2e4f18
-        if (! rcu_is_cpu_idle())
-		return 0;
-#else
-	; // XXX older kernels didn't put tracepoints in idle-cpu
-#endif
-	return rcu_dereference_sched(contexts[smp_processor_id()]);
+	if (atomic_read(&_stp_context_stop))
+		return NULL;
+
+	return per_cpu(contexts, smp_processor_id());
 }
 
 static struct context * _stp_runtime_entryfn_get_context(void)
+	__acquires(&_stp_context_lock)
 {
 	struct context* __restrict__ c = NULL;
-	preempt_disable ();
+
+	if (!read_trylock(&_stp_context_lock))
+		return NULL;
+
 	c = _stp_runtime_get_context();
 	if (c != NULL) {
-		if (atomic_inc_return(&c->busy) == 1) {
-			// NB: Notice we're not re-enabling preemption
+		if (!atomic_cmpxchg(&c->busy, 0, 1)) {
+			// NB: Notice we're not releasing _stp_context_lock
 			// here. We exepect the calling code to call
 			// _stp_runtime_entryfn_get_context() and
 			// _stp_runtime_entryfn_put_context() as a
 			// pair.
 			return c;
 		}
-		atomic_dec(&c->busy);
 	}
-	preempt_enable_no_resched();
+	read_unlock(&_stp_context_lock);
 	return NULL;
 }
 
 static inline void _stp_runtime_entryfn_put_context(struct context *c)
+	__releases(&_stp_context_lock)
 {
 	if (c) {
-		if (c == _stp_runtime_get_context())
-			atomic_dec(&c->busy);
-		/* else, warn about bad state? */
-		preempt_enable_no_resched();
+		atomic_set(&c->busy, 0);
+		read_unlock(&_stp_context_lock);
 	}
-	return;
 }
 
 static void _stp_runtime_context_wait(void)
@@ -130,9 +105,13 @@ static void _stp_runtime_context_wait(void)
 		int i;
 
 		holdon = 0;
-		rcu_read_lock_sched();
+		read_lock(&_stp_context_lock);
+		if (atomic_read(&_stp_context_stop)) {
+			read_unlock(&_stp_context_lock);
+			break;
+		}
 		for_each_possible_cpu(i) {
-			struct context *c = rcu_dereference_sched(contexts[i]);
+			struct context *c = per_cpu(contexts, i);
 			if (c != NULL
 			    && atomic_read (& c->busy)) {
 				holdon = 1;
@@ -146,7 +125,7 @@ static void _stp_runtime_context_wait(void)
 				}
 			}
 		}
-		rcu_read_unlock_sched();
+		read_unlock(&_stp_context_lock);
 
 		/*
 		 * Just in case things are really really stuck, a
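A condensed sketch of the scheme this commit adopts (illustrative only; generic names, the real code is the runtime_context.h diff above): lookups hold the read lock for as long as the context is in use, and teardown flips a stop flag and cycles the write lock so it cannot free a context that a reader still holds.

    static DEFINE_RWLOCK(ctx_lock);
    static DEFINE_PER_CPU(void *, ctx_ptr);        /* vmalloc'd at init (omitted) */
    static atomic_t ctx_stop = ATOMIC_INIT(0);

    static void *ctx_get(void)                     /* caller pairs with read_unlock() */
    {
    	void *c = NULL;

    	if (read_trylock(&ctx_lock)) {             /* trylock: safe from IRQ context */
    		if (!atomic_read(&ctx_stop))
    			c = per_cpu(ctx_ptr, smp_processor_id());
    		if (!c)
    			read_unlock(&ctx_lock);
    	}
    	return c;
    }

    static void ctx_free_all(void)
    {
    	unsigned int cpu;

    	atomic_set(&ctx_stop, 1);                  /* new lookups now fail ...       */
    	write_lock(&ctx_lock);                     /* ... and in-flight readers drain */
    	write_unlock(&ctx_lock);
    	for_each_possible_cpu(cpu)
    		vfree(per_cpu(ctx_ptr, cpu));
    }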
commit bb25d64f7b6c98ef2fc8b711f19bd6271866d727
Author: Sultan Alsawaf <sultan@openresty.com>
Date:   Tue Dec 1 09:54:07 2020 -0800

    runtime_context: synchronize _stp_context_stop more strictly

    We're only reading _stp_context_stop while the read lock is held, so we
    can move the modification of it to inside the write lock to ensure
    strict memory ordering. As such, it no longer needs to be an atomic_t
    variable.

    We also don't need to disable IRQs when holding the write lock because
    only read_trylock is used from IRQ context, not read_lock, so there's no
    possibility of a deadlock occurring.

diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
index 18566957a..e716e6d39 100644
--- a/runtime/linux/runtime_context.h
+++ b/runtime/linux/runtime_context.h
@@ -14,7 +14,7 @@
 /* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */
 static DEFINE_RWLOCK(_stp_context_lock);
 static DEFINE_PER_CPU(struct context *, contexts);
-static atomic_t _stp_context_stop = ATOMIC_INIT(0);
+static bool _stp_context_stop;
 
 static int _stp_runtime_contexts_alloc(void)
 {
@@ -40,13 +40,12 @@ static int _stp_runtime_contexts_alloc(void)
  * use _stp_context_stop and a write lock to be sure its safe to free them.  */
 static void _stp_runtime_contexts_free(void)
 {
-	unsigned long flags;
 	unsigned int cpu;
 
 	/* Sync to make sure existing readers are done */
-	atomic_set(&_stp_context_stop, 1);
-	write_lock_irqsave(&_stp_context_lock, flags);
-	write_unlock_irqrestore(&_stp_context_lock, flags);
+	write_lock(&_stp_context_lock);
+	_stp_context_stop = true;
+	write_unlock(&_stp_context_lock);
 
 	/* Now we can actually free the contexts */
 	for_each_possible_cpu(cpu)
@@ -55,7 +54,7 @@ static void _stp_runtime_contexts_free(void)
 
 static inline struct context * _stp_runtime_get_context(void)
 {
-	if (atomic_read(&_stp_context_stop))
+	if (_stp_context_stop)
 		return NULL;
 
 	return per_cpu(contexts, smp_processor_id());
@@ -106,7 +105,7 @@ static void _stp_runtime_context_wait(void)
 
 		holdon = 0;
 		read_lock(&_stp_context_lock);
-		if (atomic_read(&_stp_context_stop)) {
+		if (_stp_context_stop) {
 			read_unlock(&_stp_context_lock);
 			break;
 		}
commit 0cc239e6f0fff79cb584fc857d3220402558db37
Author: Sultan Alsawaf <sultan@openresty.com>
Date:   Tue Dec 1 18:47:04 2020 -0800

    runtime_context: replace _stp_context_lock with an atomic variable

    We can't use any lock primitives here, such as spin locks or rw locks,
    because lock_acquire() has tracepoints inside of it. This can cause a
    deadlock, so we have to roll our own synchronization mechanism using an
    atomic variable.

diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
index e716e6d39..7dd240e1a 100644
--- a/runtime/linux/runtime_context.h
+++ b/runtime/linux/runtime_context.h
@@ -11,10 +11,9 @@
 #ifndef _LINUX_RUNTIME_CONTEXT_H_
 #define _LINUX_RUNTIME_CONTEXT_H_
 
-/* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */
-static DEFINE_RWLOCK(_stp_context_lock);
+/* Can't use a lock primitive for this because lock_acquire() has tracepoints */
+static atomic_t _stp_contexts_busy_ctr = ATOMIC_INIT(0);
 static DEFINE_PER_CPU(struct context *, contexts);
-static bool _stp_context_stop;
 
 static int _stp_runtime_contexts_alloc(void)
 {
@@ -37,15 +36,14 @@ static int _stp_runtime_contexts_alloc(void)
 
 /* We should be free of all probes by this time, but for example the timer for
  * _stp_ctl_work_callback may still be running and looking for contexts.  We
- * use _stp_context_stop and a write lock to be sure its safe to free them.  */
+ * use _stp_contexts_busy_ctr to be sure its safe to free them.  */
 static void _stp_runtime_contexts_free(void)
 {
 	unsigned int cpu;
 
 	/* Sync to make sure existing readers are done */
-	write_lock(&_stp_context_lock);
-	_stp_context_stop = true;
-	write_unlock(&_stp_context_lock);
+	while (atomic_cmpxchg(&_stp_contexts_busy_ctr, 0, INT_MAX))
+		cpu_relax();
 
 	/* Now we can actually free the contexts */
 	for_each_possible_cpu(cpu)
@@ -54,24 +52,20 @@ static void _stp_runtime_contexts_free(void)
 
 static inline struct context * _stp_runtime_get_context(void)
 {
-	if (_stp_context_stop)
-		return NULL;
-
 	return per_cpu(contexts, smp_processor_id());
 }
 
 static struct context * _stp_runtime_entryfn_get_context(void)
-	__acquires(&_stp_context_lock)
 {
 	struct context* __restrict__ c = NULL;
 
-	if (!read_trylock(&_stp_context_lock))
+	if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
 		return NULL;
 
 	c = _stp_runtime_get_context();
 	if (c != NULL) {
 		if (!atomic_cmpxchg(&c->busy, 0, 1)) {
-			// NB: Notice we're not releasing _stp_context_lock
+			// NB: Notice we're not releasing _stp_contexts_busy_ctr
 			// here. We exepect the calling code to call
 			// _stp_runtime_entryfn_get_context() and
 			// _stp_runtime_entryfn_put_context() as a
@@ -79,16 +73,15 @@ static struct context * _stp_runtime_entryfn_get_context(void)
 			return c;
 		}
 	}
-	read_unlock(&_stp_context_lock);
+	atomic_dec(&_stp_contexts_busy_ctr);
 	return NULL;
 }
 
 static inline void _stp_runtime_entryfn_put_context(struct context *c)
-	__releases(&_stp_context_lock)
 {
 	if (c) {
 		atomic_set(&c->busy, 0);
-		read_unlock(&_stp_context_lock);
+		atomic_dec(&_stp_contexts_busy_ctr);
 	}
 }
 
@@ -104,11 +97,9 @@ static void _stp_runtime_context_wait(void)
 		int i;
 
 		holdon = 0;
-		read_lock(&_stp_context_lock);
-		if (_stp_context_stop) {
-			read_unlock(&_stp_context_lock);
+		if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
 			break;
-		}
+
 		for_each_possible_cpu(i) {
 			struct context *c = per_cpu(contexts, i);
 			if (c != NULL
@@ -124,7 +115,7 @@ static void _stp_runtime_context_wait(void)
 				}
 			}
 		}
-		read_unlock(&_stp_context_lock);
+		atomic_dec(&_stp_contexts_busy_ctr);
 
 		/*
 		 * Just in case things are really really stuck, a
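The resulting primitive is essentially a hand-rolled reader/writer gate built from a single atomic counter, which avoids lock_acquire() and its tracepoints entirely. Sketched on its own it looks roughly like this (illustrative only, generic names):

    static atomic_t busy_ctr = ATOMIC_INIT(0);

    static bool gate_enter(void)               /* the "read lock"; never blocks */
    {
    	/* Fails only once teardown has latched the counter at INT_MAX. */
    	return atomic_add_unless(&busy_ctr, 1, INT_MAX);
    }

    static void gate_exit(void)
    {
    	atomic_dec(&busy_ctr);
    }

    static void gate_shut_down(void)           /* the "write lock"; teardown only */
    {
    	/* Succeeds only when no readers remain, then latches INT_MAX so
    	 * every later gate_enter() fails. */
    	while (atomic_cmpxchg(&busy_ctr, 0, INT_MAX))
    		cpu_relax();
    }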
commit fbab0ea35e6af0d6599c6de3708b24008bf03ae6
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 2 11:07:11 2020 -0800
b8e312
b8e312
    runtime_context: disable preempt while holding the context
b8e312
    
b8e312
    After the context lock was converted to an atomic in the previous
b8e312
    commit, the preempt disable logic disappeared. Add it back.
b8e312
b8e312
diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
b8e312
index 7dd240e1a..7a1532e54 100644
b8e312
--- a/runtime/linux/runtime_context.h
b8e312
+++ b/runtime/linux/runtime_context.h
b8e312
@@ -34,6 +34,24 @@ static int _stp_runtime_contexts_alloc(void)
b8e312
 	return 0;
b8e312
 }
b8e312
 
b8e312
+static bool _stp_runtime_context_trylock(void)
b8e312
+{
b8e312
+	bool locked;
b8e312
+
b8e312
+	preempt_disable();
b8e312
+	locked = atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX);
b8e312
+	if (!locked)
b8e312
+		preempt_enable_no_resched();
b8e312
+
b8e312
+	return locked;
b8e312
+}
b8e312
+
b8e312
+static void _stp_runtime_context_unlock(void)
b8e312
+{
b8e312
+	atomic_dec(&_stp_contexts_busy_ctr);
b8e312
+	preempt_enable_no_resched();
b8e312
+}
b8e312
+
b8e312
 /* We should be free of all probes by this time, but for example the timer for
b8e312
  * _stp_ctl_work_callback may still be running and looking for contexts.  We
b8e312
  * use _stp_contexts_busy_ctr to be sure its safe to free them.  */
b8e312
@@ -59,7 +77,7 @@ static struct context * _stp_runtime_entryfn_get_context(void)
b8e312
 {
b8e312
 	struct context* __restrict__ c = NULL;
b8e312
 
b8e312
-	if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
b8e312
+	if (!_stp_runtime_context_trylock())
b8e312
 		return NULL;
b8e312
 
b8e312
 	c = _stp_runtime_get_context();
b8e312
@@ -73,7 +91,7 @@ static struct context * _stp_runtime_entryfn_get_context(void)
b8e312
 			return c;
b8e312
 		}
b8e312
 	}
b8e312
-	atomic_dec(&_stp_contexts_busy_ctr);
b8e312
+	_stp_runtime_context_unlock();
b8e312
 	return NULL;
b8e312
 }
b8e312
 
b8e312
@@ -81,7 +99,7 @@ static inline void _stp_runtime_entryfn_put_context(struct context *c)
b8e312
 {
b8e312
 	if (c) {
b8e312
 		atomic_set(&c->busy, 0);
b8e312
-		atomic_dec(&_stp_contexts_busy_ctr);
b8e312
+		_stp_runtime_context_unlock();
b8e312
 	}
b8e312
 }
b8e312
 
b8e312
@@ -97,7 +115,7 @@ static void _stp_runtime_context_wait(void)
b8e312
 		int i;
b8e312
 
b8e312
 		holdon = 0;
b8e312
-		if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
b8e312
+		if (!_stp_runtime_context_trylock())
b8e312
 			break;
b8e312
 
b8e312
 		for_each_possible_cpu(i) {
b8e312
@@ -115,7 +133,7 @@ static void _stp_runtime_context_wait(void)
b8e312
 				}
b8e312
 			}
b8e312
 		}
b8e312
-		atomic_dec(&_stp_contexts_busy_ctr);
b8e312
+		_stp_runtime_context_unlock();
b8e312
 
b8e312
 		/*
b8e312
 		 * Just in case things are really really stuck, a
b8e312
commit aedc044d5d38cb2fa6144d0a3345d06847862f1b
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 2 11:27:47 2020 -0800
b8e312
b8e312
    task_finder_vma: add kfree_rcu() compat for old kernels
b8e312
    
b8e312
    Newer RHEL 6 kernels have kfree_rcu(), but older ones do not. Using
b8e312
    kfree_rcu() is beneficial because it lets the RCU subsystem know that
b8e312
    the queued RCU callback is low-priority, and can be deferred, hence why
b8e312
    we don't replace kfree_rcu() with call_rcu() outright. Luckily,
b8e312
    kfree_rcu() is a macro so we can just #ifdef with it.
b8e312
b8e312
diff --git a/runtime/task_finder_vma.c b/runtime/task_finder_vma.c
b8e312
index 7f0f6ed56..dc77a80f5 100644
b8e312
--- a/runtime/task_finder_vma.c
b8e312
+++ b/runtime/task_finder_vma.c
b8e312
@@ -87,6 +87,15 @@ __stp_tf_vma_new_entry(void)
b8e312
 	return entry;
b8e312
 }
b8e312
 
b8e312
+#ifndef kfree_rcu
b8e312
+static void __stp_tf_vma_free_entry(struct rcu_head *rcu)
b8e312
+{
b8e312
+	struct __stp_tf_vma_entry *entry = container_of(rcu, typeof(*entry), rcu);
b8e312
+
b8e312
+	kfree(entry);
b8e312
+}
b8e312
+#endif
b8e312
+
b8e312
 // __stp_tf_vma_put_entry(): Put a specified number of references on the entry.
b8e312
 static void
b8e312
 __stp_tf_vma_put_entry(struct __stp_tf_vma_bucket *bucket,
b8e312
@@ -106,7 +115,11 @@ __stp_tf_vma_put_entry(struct __stp_tf_vma_bucket *bucket,
b8e312
 	hlist_del_rcu(&entry->hlist);
b8e312
 	stp_spin_unlock_irqrestore(&bucket->lock, flags);
b8e312
 
b8e312
+#ifdef kfree_rcu
b8e312
 	kfree_rcu(entry, rcu);
b8e312
+#else
b8e312
+	call_rcu(&entry->rcu, __stp_tf_vma_free_entry);
b8e312
+#endif
b8e312
 }
b8e312
 
b8e312
 // stap_initialize_vma_map():  Initialize the free list.  Grabs the
b8e312
commit 6a27888b118b7a94650a68aae028957cdd5fb5f5
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 2 18:09:17 2020 -0800
b8e312
b8e312
    REVERTME: tapset-timers: work around on-the-fly deadlocks caused by mutex_trylock
b8e312
    
b8e312
    The following deadlock exists due to tracepoints existing inside a lock
b8e312
    that is used both inside probe context and outside probe context:
b8e312
     #0 [ffff88017f6d7a08] kvm_wait at ffffffff81079f5a
b8e312
     #1 [ffff88017f6d7a30] __pv_queued_spin_lock_slowpath at ffffffff8114f51e
b8e312
     #2 [ffff88017f6d7a70] queued_spin_lock_slowpath at ffffffff810e842b
b8e312
     #3 [ffff88017f6d7a80] mutex_trylock at ffffffff81882b1b
b8e312
     #4 [ffff88017f6d7ab8] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #5 [ffff88017f6d7ad8] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #6 [ffff88017f6d7b10] probe_7879 at ffffffffc0a98c85 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #7 [ffff88017f6d7b38] enter_real_tracepoint_probe_1543 at ffffffffc0c3b757 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #8 [ffff88017f6d7b70] enter_tracepoint_probe_1543 at ffffffffc09b117e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #9 [ffff88017f6d7b80] lock_acquire at ffffffff811460ba
b8e312
    #10 [ffff88017f6d7be8] mutex_trylock at ffffffff81882a27
b8e312
    #11 [ffff88017f6d7c20] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #12 [ffff88017f6d7c40] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #13 [ffff88017f6d7c78] _stp_vlog at ffffffffc09b8d32 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #14 [ffff88017f6d7cd8] _stp_dbug at ffffffffc09ba43b [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #15 [ffff88017f6d7d38] systemtap_module_refresh at ffffffffc09ba51d [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #16 [ffff88017f6d7d50] module_refresher at ffffffffc09ba53e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #17 [ffff88017f6d7d60] process_one_work at ffffffff810da9cc
b8e312
    #18 [ffff88017f6d7de8] worker_thread at ffffffff810dafe6
b8e312
    #19 [ffff88017f6d7e48] kthread at ffffffff810e44cf
b8e312
    #20 [ffff88017f6d7f50] ret_from_fork_nospec_begin at ffffffff818958dd
b8e312
    
b8e312
    Note the deadlock due to _stp_transport_trylock_relay_inode recursing
b8e312
    onto itself via mutex_trylock.
b8e312
    
b8e312
    This is a temporary fix for the issue until a proper patch is made to
b8e312
    remove the mutex_trylock from __stp_print_flush. This should be reverted
b8e312
    when that patch lands (it will have something to do with bulkmode).
b8e312
b8e312
diff --git a/tapset-timers.cxx b/tapset-timers.cxx
b8e312
index 10da17cda..503498c85 100644
b8e312
--- a/tapset-timers.cxx
b8e312
+++ b/tapset-timers.cxx
b8e312
@@ -391,11 +391,11 @@ hrtimer_derived_probe_group::emit_module_refresh (systemtap_session& s)
b8e312
   s.op->newline(+1) <<   "struct stap_hrtimer_probe* stp = &stap_hrtimer_probes[i];";
b8e312
   // timer disabled, but condition says enabled?
b8e312
   s.op->newline( 0) <<   "if (!stp->enabled && stp->probe->cond_enabled) {";
b8e312
-  s.op->newline(+1) <<     "dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
+  s.op->newline(+1) <<     "//dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
   s.op->newline( 0) <<     "_stp_hrtimer_start(stp);";
b8e312
   // timer enabled, but condition says disabled?
b8e312
   s.op->newline(-1) <<   "} else if (stp->enabled && !stp->probe->cond_enabled) {";
b8e312
-  s.op->newline(+1) <<     "dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
+  s.op->newline(+1) <<     "//dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
   s.op->newline( 0) <<     "_stp_hrtimer_cancel(stp);";
b8e312
   s.op->newline(-1) <<   "}";
b8e312
   s.op->newline( 0) <<   "stp->enabled = stp->probe->cond_enabled;";
b8e312
commit 7187dcf39412fcb25c432d318be8e49a6051f055
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Thu Dec 3 12:57:34 2020 -0800
b8e312
b8e312
    runtime: fix print races in IRQ context and during print cleanup
b8e312
    
b8e312
    Prints can race when there's a print called from IRQ context or a print
b8e312
    called while print cleanup takes place, which can lead to garbled print
b8e312
    messages, out-of-bounds memory accesses, and memory use-after-free. This
b8e312
    is one example of racy modification of the print buffer len in IRQ
b8e312
    context which caused a panic due to an out-of-bounds memory access:
b8e312
    
b8e312
    BUG: unable to handle kernel paging request at ffffe8ffff621000
b8e312
    IP: [<ffffffffc05da0f3>] _stp_vsprint_memory+0x83/0x950 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
    PGD 174b90067 PUD 174b8f067 PMD 174b93067 PTE 0
b8e312
    Oops: 0002 [#1] SMP
b8e312
    CPU: 12 PID: 3468 Comm: cat Kdump: loaded Tainted: G           OE  ------------   3.10.0-1127.19.1.el7.x86_64.debug #1
b8e312
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014
b8e312
    task: ffff88001f4f0000 ti: ffff88004ea5c000 task.ti: ffff88004ea5c000
b8e312
    RIP: 0010:[<ffffffffc05da0f3>]  [<ffffffffc05da0f3>] _stp_vsprint_memory+0x83/0x950 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
    RSP: 0018:ffff88004ea5f9a8  EFLAGS: 00010082
b8e312
    RAX: ffffe8ffff621001 RBX: ffffe8ffff620ff2 RCX: fffffffffffffffe
b8e312
    RDX: 000000000000006e RSI: ffffffffffffffff RDI: ffffc90002c23730
b8e312
    RBP: ffff88004ea5fa28 R08: 00000000ffffffff R09: 0000000000000073
b8e312
    R10: ffffc90002c243d7 R11: 0000000000000001 R12: ffffc90002c2373f
b8e312
    R13: ffffe8ffff621004 R14: 0000000000000012 R15: 00000000fffffffe
b8e312
    FS:  00007f8a9b1d4740(0000) GS:ffff880179e00000(0000) knlGS:0000000000000000
b8e312
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
b8e312
    CR2: ffffe8ffff621000 CR3: 00000000b3e3c000 CR4: 0000000000360fe0
b8e312
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
b8e312
    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
b8e312
    Call Trace:
b8e312
     [<ffffffff8103eb89>] ? sched_clock+0x9/0x10
b8e312
     [<ffffffff8114036f>] ? lock_release_holdtime.part.30+0xf/0x1a0
b8e312
     [<ffffffffc05dcb80>] function___global_trace__overload_0+0x5b0/0x1220 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
     [<ffffffffc05d8993>] ? stp_lock_probe+0x53/0xe0 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
     [<ffffffff8188d879>] ? kretprobe_trampoline_holder+0x9/0x9
b8e312
     [<ffffffffc05e0662>] probe_7118+0x82/0xe0 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
     [<ffffffffc05de866>] enter_kretprobe_common+0x256/0x490 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
     [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
b8e312
     [<ffffffffc05dead0>] enter_kretprobe_probe+0x10/0x20 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
     [<ffffffff8188e1d8>] trampoline_handler+0x148/0x220
b8e312
     [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
b8e312
     [<ffffffff8188d89e>] kretprobe_trampoline+0x25/0x57
b8e312
     [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
b8e312
     [<ffffffff8188d879>] kretprobe_trampoline_holder+0x9/0x9
b8e312
     [<ffffffff81384702>] ? security_inode_permission+0x22/0x30
b8e312
     [<ffffffff813489a0>] ? sysctl_head_finish+0x50/0x50
b8e312
     [<ffffffff812ac11d>] vfs_open+0x5d/0xb0
b8e312
     [<ffffffff812bb74a>] ? may_open+0x5a/0x120
b8e312
     [<ffffffff812c0af5>] do_last+0x285/0x15b0
b8e312
     [<ffffffff812bf18e>] ? link_path_walk+0x27e/0x8c0
b8e312
     [<ffffffff812c1ef0>] path_openat+0xd0/0x5d0
b8e312
     [<ffffffff8107a7f3>] ? kvm_clock_read+0x33/0x40
b8e312
     [<ffffffff812c38ad>] do_filp_open+0x4d/0xb0
b8e312
     [<ffffffff81889497>] ? _raw_spin_unlock+0x27/0x40
b8e312
     [<ffffffff812d5a9b>] ? __alloc_fd+0xfb/0x270
b8e312
     [<ffffffff812ad784>] do_sys_open+0x124/0x220
b8e312
     [<ffffffff812ad89e>] SyS_open+0x1e/0x20
b8e312
     [<ffffffff8188d879>] kretprobe_trampoline_holder+0x9/0x9
b8e312
    
b8e312
    This patch resolves the IRQ print races by disabling IRQs on the local
b8e312
    CPU when accessing said CPU's print buffer, and resolves the cleanup
b8e312
    races with a lock. We also protect against data corruption and panics
b8e312
    from prints inside NMIs now by checking if the current CPU was accessing
b8e312
    the log buffer when an NMI fired; in this case, the NMI's prints will be
b8e312
    dropped, as there is no way to safely service them without creating a
b8e312
    dedicated log buffer for them. This is achieved by forbidding reentrancy
b8e312
    with respect to _stp_print_trylock_irqsave() when the runtime context
b8e312
    isn't held. Reentrancy is otherwise allowed when the runtime context is
b8e312
    held because the runtime context provides reentrancy protection.
b8e312
b8e312
diff --git a/runtime/linux/io.c b/runtime/linux/io.c
b8e312
index 74a032c52..122708e2a 100644
b8e312
--- a/runtime/linux/io.c
b8e312
+++ b/runtime/linux/io.c
b8e312
@@ -20,9 +20,6 @@
b8e312
 
b8e312
 #define WARN_STRING "WARNING: "
b8e312
 #define ERR_STRING "ERROR: "
b8e312
-#if (STP_LOG_BUF_LEN < 10) /* sizeof(WARN_STRING) */
b8e312
-#error "STP_LOG_BUF_LEN is too short"
b8e312
-#endif
b8e312
 
b8e312
 enum code { INFO=0, WARN, ERROR, DBUG };
b8e312
 
b8e312
@@ -31,25 +28,37 @@ static void _stp_vlog (enum code type, const char *func, int line, const char *f
b8e312
 
b8e312
 static void _stp_vlog (enum code type, const char *func, int line, const char *fmt, va_list args)
b8e312
 {
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
+	size_t bytes_avail;
b8e312
 	int num;
b8e312
-	char *buf = per_cpu_ptr(Stp_lbuf, get_cpu());
b8e312
+	char *buf;
b8e312
 	int start = 0;
b8e312
 
b8e312
+	if (!_stp_print_trylock_irqsave(&flags))
b8e312
+		return;
b8e312
+
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	bytes_avail = STP_BUFFER_SIZE - log->len;
b8e312
+	if (unlikely(!bytes_avail))
b8e312
+		goto err_unlock;
b8e312
+
b8e312
+	buf = &log->buf[log->len];
b8e312
 	if (type == DBUG) {
b8e312
-		start = _stp_snprintf(buf, STP_LOG_BUF_LEN, "%s:%d: ", func, line);
b8e312
+		start = _stp_snprintf(buf, bytes_avail, "%s:%d: ", func, line);
b8e312
 	} else if (type == WARN) {
b8e312
-		/* This strcpy() is OK, since we know STP_LOG_BUF_LEN
b8e312
-		 * is > sizeof(WARN_STRING). */
b8e312
-		strcpy (buf, WARN_STRING);
b8e312
-		start = sizeof(WARN_STRING) - 1;
b8e312
+		strncpy(buf, WARN_STRING, bytes_avail);
b8e312
+		start = min(bytes_avail, sizeof(WARN_STRING) - 1);
b8e312
 	} else if (type == ERROR) {
b8e312
-		/* This strcpy() is OK, since we know STP_LOG_BUF_LEN
b8e312
-		 * is > sizeof(ERR_STRING) (which is < sizeof(WARN_STRING). */
b8e312
-		strcpy (buf, ERR_STRING);
b8e312
-		start = sizeof(ERR_STRING) - 1;
b8e312
+		strncpy(buf, ERR_STRING, bytes_avail);
b8e312
+		start = min(bytes_avail, sizeof(ERR_STRING) - 1);
b8e312
 	}
b8e312
 
b8e312
-	num = vscnprintf (buf + start, STP_LOG_BUF_LEN - start - 1, fmt, args);
b8e312
+	bytes_avail -= start;
b8e312
+	if (unlikely(!bytes_avail))
b8e312
+		goto err_unlock;
b8e312
+
b8e312
+	num = vscnprintf(buf + start, bytes_avail - 1, fmt, args);
b8e312
 	if (num + start) {
b8e312
 		if (buf[num + start - 1] != '\n') {
b8e312
 			buf[num + start] = '\n';
b8e312
@@ -66,12 +75,13 @@ static void _stp_vlog (enum code type, const char *func, int line, const char *f
b8e312
 		if (type != DBUG) {
b8e312
 			_stp_ctl_send(STP_OOB_DATA, buf, start + num + 1);
b8e312
 		} else {
b8e312
-			_stp_print(buf);
b8e312
-			_stp_print_flush();
b8e312
+			log->len += start + num;
b8e312
+			__stp_print_flush(log);
b8e312
 		}
b8e312
 #endif
b8e312
 	}
b8e312
-	put_cpu();
b8e312
+err_unlock:
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 /** Prints warning.
b8e312
diff --git a/runtime/linux/print.c b/runtime/linux/print.c
b8e312
index 777bca8b0..2aa2f1c8d 100644
b8e312
--- a/runtime/linux/print.c
b8e312
+++ b/runtime/linux/print.c
b8e312
@@ -35,84 +35,179 @@
b8e312
  * @{
b8e312
  */
b8e312
 
b8e312
-typedef struct __stp_pbuf {
b8e312
-	uint32_t len;			/* bytes used in the buffer */
b8e312
+struct _stp_log {
b8e312
+	unsigned int len; /* Bytes used in the buffer */
b8e312
 	char buf[STP_BUFFER_SIZE];
b8e312
-} _stp_pbuf;
b8e312
+	atomic_t reentrancy_lock;
b8e312
+};
b8e312
+#include "print_flush.c"
b8e312
 
b8e312
-static void *Stp_pbuf = NULL;
b8e312
+static struct _stp_log *_stp_log_pcpu;
b8e312
+
b8e312
+/*
b8e312
+ * An atomic counter is used to synchronize every possible print buffer usage
b8e312
+ * with the _stp_print_cleanup() function. The cleanup function sets the counter
b8e312
+ * to INT_MAX after waiting for everything using the print buffer to finish. We
b8e312
+ * cannot use a lock primitive to implement this because lock_acquire() contains
b8e312
+ * tracepoints and print statements are used both inside and outside of probes.
b8e312
+ * If the lock were only used inside probes, the runtime context would protect
b8e312
+ * us from recursing into the lock_acquire() tracepoints and deadlocking. We
b8e312
+ * instead use _stp_print_ctr as if it were a read-write lock.
b8e312
+ */
b8e312
+static atomic_t _stp_print_ctr = ATOMIC_INIT(0);
b8e312
 
b8e312
-/** private buffer for _stp_vlog() */
b8e312
-#ifndef STP_LOG_BUF_LEN
b8e312
-#define STP_LOG_BUF_LEN 256
b8e312
-#endif
b8e312
+/*
b8e312
+ * This disables IRQs to make per-CPU print buffer accesses atomic. There is a
b8e312
+ * reentrancy protection mechanism specifically for NMIs, since they can violate
b8e312
+ * our atomic guarantee. Reentrancy is otherwise allowed within code sections
b8e312
+ * that have the runtime context held (via _stp_runtime_entryfn_get_context()).
b8e312
+ */
b8e312
+static bool _stp_print_trylock_irqsave(unsigned long *flags)
b8e312
+{
b8e312
+	bool context_held = false;
b8e312
+	struct _stp_log *log;
b8e312
+
b8e312
+	local_irq_save(*flags);
b8e312
+	if (!atomic_add_unless(&_stp_print_ctr, 1, INT_MAX))
b8e312
+		goto irq_restore;
b8e312
+
b8e312
+	/*
b8e312
+	 * Check the per-CPU reentrancy lock for contention, unless the runtime
b8e312
+	 * context is already held, in which case we already have reentrancy
b8e312
+	 * protection. Otherwise, if the reentrancy lock is contented, that
b8e312
+	 * means we're either inside an NMI that fired while the current CPU was
b8e312
+	 * accessing the log buffer, or something is trying to nest calls to
b8e312
+	 * _stp_print_trylock_irqsave(). Our only choice is to reject the log
b8e312
+	 * access attempt in this case because log buffer corruption and panics
b8e312
+	 * could ensue if we're inside an NMI.
b8e312
+	 */
b8e312
+	if (_stp_runtime_context_trylock()) {
b8e312
+		struct context *c = _stp_runtime_get_context();
b8e312
+		context_held = c && atomic_read(&c->busy);
b8e312
+		_stp_runtime_context_unlock();
b8e312
+	}
b8e312
 
b8e312
-typedef char _stp_lbuf[STP_LOG_BUF_LEN];
b8e312
-static void *Stp_lbuf = NULL;
b8e312
+	/* Fall back onto the reentrancy lock if the context isn't held */
b8e312
+	if (!context_held) {
b8e312
+		log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+		if (atomic_cmpxchg(&log->reentrancy_lock, 0, 1))
b8e312
+			goto print_unlock;
b8e312
+	}
b8e312
+
b8e312
+	return true;
b8e312
+
b8e312
+print_unlock:
b8e312
+	atomic_dec(&_stp_print_ctr);
b8e312
+irq_restore:
b8e312
+	local_irq_restore(*flags);
b8e312
+	return false;
b8e312
+}
b8e312
+
b8e312
+static void _stp_print_unlock_irqrestore(unsigned long *flags)
b8e312
+{
b8e312
+	bool context_held = false;
b8e312
+	struct _stp_log *log;
b8e312
+
b8e312
+	if (_stp_runtime_context_trylock()) {
b8e312
+		struct context *c = _stp_runtime_get_context();
b8e312
+		context_held = c && atomic_read(&c->busy);
b8e312
+		_stp_runtime_context_unlock();
b8e312
+	}
b8e312
+
b8e312
+	if (!context_held) {
b8e312
+		log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+		atomic_set(&log->reentrancy_lock, 0);
b8e312
+	}
b8e312
+
b8e312
+	atomic_dec(&_stp_print_ctr);
b8e312
+	local_irq_restore(*flags);
b8e312
+}
b8e312
 
b8e312
 /* create percpu print and io buffers */
b8e312
 static int _stp_print_init (void)
b8e312
 {
b8e312
-	Stp_pbuf = _stp_alloc_percpu(sizeof(_stp_pbuf));
b8e312
-	if (unlikely(Stp_pbuf == 0))
b8e312
-		return -1;
b8e312
-
b8e312
-	/* now initialize IO buffer used in io.c */
b8e312
-	Stp_lbuf = _stp_alloc_percpu(sizeof(_stp_lbuf));
b8e312
-	if (unlikely(Stp_lbuf == 0)) {
b8e312
-		_stp_free_percpu(Stp_pbuf);
b8e312
-		return -1;
b8e312
+	unsigned int cpu;
b8e312
+
b8e312
+	_stp_log_pcpu = _stp_alloc_percpu(sizeof(*_stp_log_pcpu));
b8e312
+	if (!_stp_log_pcpu)
b8e312
+		return -ENOMEM;
b8e312
+
b8e312
+	for_each_possible_cpu(cpu) {
b8e312
+		struct _stp_log *log = per_cpu_ptr(_stp_log_pcpu, cpu);
b8e312
+
b8e312
+		log->reentrancy_lock = (atomic_t)ATOMIC_INIT(0);
b8e312
 	}
b8e312
 	return 0;
b8e312
 }
b8e312
 
b8e312
 static void _stp_print_cleanup (void)
b8e312
 {
b8e312
-	if (Stp_pbuf)
b8e312
-		_stp_free_percpu(Stp_pbuf);
b8e312
-	if (Stp_lbuf)
b8e312
-		_stp_free_percpu(Stp_lbuf);
b8e312
-}
b8e312
+	unsigned int cpu;
b8e312
 
b8e312
-#include "print_flush.c"
b8e312
+	/* Wait for the loggers to finish modifying the print buffers */
b8e312
+	while (atomic_cmpxchg(&_stp_print_ctr, 0, INT_MAX))
b8e312
+		cpu_relax();
b8e312
+
b8e312
+	for_each_possible_cpu(cpu) {
b8e312
+		struct _stp_log *log = per_cpu_ptr(_stp_log_pcpu, cpu);
b8e312
+
b8e312
+		/*
b8e312
+		 * Flush anything that could be left in the print buffer. It is
b8e312
+		 * safe to do this without any kind of synchronization mechanism
b8e312
+		 * because nothing is using this print buffer anymore.
b8e312
+		 */
b8e312
+		__stp_print_flush(log);
b8e312
+	}
b8e312
+
b8e312
+	_stp_free_percpu(_stp_log_pcpu);
b8e312
+}
b8e312
 
b8e312
 static inline void _stp_print_flush(void)
b8e312
 {
b8e312
-	stp_print_flush(per_cpu_ptr(Stp_pbuf, smp_processor_id()));
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
+
b8e312
+	if (!_stp_print_trylock_irqsave(&flags))
b8e312
+		return;
b8e312
+
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	__stp_print_flush(log);
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
+
b8e312
 #ifndef STP_MAXBINARYARGS
b8e312
 #define STP_MAXBINARYARGS 127
b8e312
 #endif
b8e312
 
b8e312
 
b8e312
-/** Reserves space in the output buffer for direct I/O.
b8e312
+/** Reserves space in the output buffer for direct I/O. Must be called with
b8e312
+ * _stp_print_trylock_irqsave() held.
b8e312
  */
b8e312
 static void * _stp_reserve_bytes (int numbytes)
b8e312
 {
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-	int size = STP_BUFFER_SIZE - pb->len;
b8e312
-	void * ret;
b8e312
+	struct _stp_log *log;
b8e312
+	char *ret;
b8e312
 
b8e312
 	if (unlikely(numbytes == 0 || numbytes > STP_BUFFER_SIZE))
b8e312
 		return NULL;
b8e312
 
b8e312
-	if (unlikely(numbytes > size))
b8e312
-		_stp_print_flush();
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	if (unlikely(numbytes > (STP_BUFFER_SIZE - log->len)))
b8e312
+		__stp_print_flush(log);
b8e312
 
b8e312
-	ret = pb->buf + pb->len;
b8e312
-	pb->len += numbytes;
b8e312
+	ret = &log->buf[log->len];
b8e312
+	log->len += numbytes;
b8e312
 	return ret;
b8e312
 }
b8e312
 
b8e312
 
b8e312
 static void _stp_unreserve_bytes (int numbytes)
b8e312
 {
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-
b8e312
-	if (unlikely(numbytes == 0 || numbytes > pb->len))
b8e312
-		return;
b8e312
+	struct _stp_log *log;
b8e312
 
b8e312
-	pb->len -= numbytes;
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	if (numbytes <= log->len)
b8e312
+		log->len -= numbytes;
b8e312
 }
b8e312
 
b8e312
 /** Write 64-bit args directly into the output stream.
b8e312
@@ -123,22 +218,25 @@ static void _stp_unreserve_bytes (int numbytes)
b8e312
  */
b8e312
 static void _stp_print_binary (int num, ...)
b8e312
 {
b8e312
+	unsigned long flags;
b8e312
 	va_list vargs;
b8e312
 	int i;
b8e312
 	int64_t *args;
b8e312
-	
b8e312
+
b8e312
 	if (unlikely(num > STP_MAXBINARYARGS))
b8e312
 		num = STP_MAXBINARYARGS;
b8e312
 
b8e312
-	args = _stp_reserve_bytes(num * sizeof(int64_t));
b8e312
+	if (!_stp_print_trylock_irqsave(&flags))
b8e312
+		return;
b8e312
 
b8e312
-	if (likely(args != NULL)) {
b8e312
+	args = _stp_reserve_bytes(num * sizeof(int64_t));
b8e312
+	if (args) {
b8e312
 		va_start(vargs, num);
b8e312
-		for (i = 0; i < num; i++) {
b8e312
+		for (i = 0; i < num; i++)
b8e312
 			args[i] = va_arg(vargs, int64_t);
b8e312
-		}
b8e312
 		va_end(vargs);
b8e312
 	}
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 /** Print into the print buffer.
b8e312
@@ -149,6 +247,7 @@ static void _stp_print_binary (int num, ...)
b8e312
 static void _stp_printf (const char *fmt, ...)
b8e312
 {
b8e312
 	va_list args;
b8e312
+
b8e312
 	va_start(args, fmt);
b8e312
 	_stp_vsnprintf(NULL, 0, fmt, args);
b8e312
 	va_end(args);
b8e312
@@ -160,37 +259,36 @@ static void _stp_printf (const char *fmt, ...)
b8e312
 
b8e312
 static void _stp_print (const char *str)
b8e312
 {
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-	char *end = pb->buf + STP_BUFFER_SIZE;
b8e312
-	char *ptr = pb->buf + pb->len;
b8e312
-	char *instr = (char *)str;
b8e312
-
b8e312
-	while (ptr < end && *instr)
b8e312
-		*ptr++ = *instr++;
b8e312
-
b8e312
-	/* Did loop terminate due to lack of buffer space? */
b8e312
-	if (unlikely(*instr)) {
b8e312
-		/* Don't break strings across subbufs. */
b8e312
-		/* Restart after flushing. */
b8e312
-		_stp_print_flush();
b8e312
-		end = pb->buf + STP_BUFFER_SIZE;
b8e312
-		ptr = pb->buf + pb->len;
b8e312
-		instr = (char *)str;
b8e312
-		while (ptr < end && *instr)
b8e312
-			*ptr++ = *instr++;
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
+
b8e312
+	if (!_stp_print_trylock_irqsave(&flags))
b8e312
+		return;
b8e312
+
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	while (1) {
b8e312
+		while (log->len < STP_BUFFER_SIZE && *str)
b8e312
+			log->buf[log->len++] = *str++;
b8e312
+		if (likely(!*str))
b8e312
+			break;
b8e312
+		__stp_print_flush(log);
b8e312
 	}
b8e312
-	pb->len = ptr - pb->buf;
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 static void _stp_print_char (const char c)
b8e312
 {
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-	int size = STP_BUFFER_SIZE - pb->len;
b8e312
-	if (unlikely(1 >= size))
b8e312
-		_stp_print_flush();
b8e312
-	
b8e312
-	pb->buf[pb->len] = c;
b8e312
-	pb->len ++;
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
+
b8e312
+	if (!_stp_print_trylock_irqsave(&flags))
b8e312
+		return;
b8e312
+
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	if (unlikely(log->len == STP_BUFFER_SIZE))
b8e312
+		__stp_print_flush(log);
b8e312
+	log->buf[log->len++] = c;
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 static void _stp_print_kernel_info(char *sname, char *vstr, int ctx, int num_probes)
b8e312
diff --git a/runtime/print.h b/runtime/print.h
b8e312
index ede71f033..ffdea594d 100644
b8e312
--- a/runtime/print.h
b8e312
+++ b/runtime/print.h
b8e312
@@ -10,6 +10,9 @@
b8e312
 #ifndef _STP_PRINT_H_
b8e312
 #define _STP_PRINT_H_
b8e312
 
b8e312
+/* The lock must be held with IRQs disabled to do any printing */
b8e312
+static bool _stp_print_trylock_irqsave(unsigned long *flags);
b8e312
+static void _stp_print_unlock_irqrestore(unsigned long *flags);
b8e312
 static int _stp_print_init(void);
b8e312
 static void _stp_print_cleanup(void);
b8e312
 static void *_stp_reserve_bytes(int numbytes);
b8e312
diff --git a/runtime/print_flush.c b/runtime/print_flush.c
b8e312
index cf40a2645..acd6a32d9 100644
b8e312
--- a/runtime/print_flush.c
b8e312
+++ b/runtime/print_flush.c
b8e312
@@ -13,40 +13,31 @@
b8e312
  * is filled, or this is called. This MUST be called before returning
b8e312
  * from a probe or accumulated output in the print buffer will be lost.
b8e312
  *
b8e312
- * @note Preemption must be disabled to use this.
b8e312
+ * @note Interrupts must be disabled to use this.
b8e312
  */
b8e312
 
b8e312
-static STP_DEFINE_SPINLOCK(_stp_print_lock);
b8e312
-
b8e312
-void stp_print_flush(_stp_pbuf *pb)
b8e312
+static void __stp_print_flush(struct _stp_log *log)
b8e312
 {
b8e312
-	size_t len = pb->len;
b8e312
+	size_t len = log->len;
b8e312
 	void *entry = NULL;
b8e312
 
b8e312
 	/* check to see if there is anything in the buffer */
b8e312
 	if (likely(len == 0))
b8e312
 		return;
b8e312
 
b8e312
-	pb->len = 0;
b8e312
-
b8e312
-	if (unlikely(_stp_transport_get_state() != STP_TRANSPORT_RUNNING))
b8e312
-		return;
b8e312
+	log->len = 0;
b8e312
 
b8e312
 	dbug_trans(1, "len = %zu\n", len);
b8e312
 
b8e312
 #ifdef STP_BULKMODE
b8e312
 #ifdef NO_PERCPU_HEADERS
b8e312
 	{
b8e312
-		struct context* __restrict__ c = NULL;
b8e312
-		char *bufp = pb->buf;
b8e312
+		char *bufp = log->buf;
b8e312
 		int inode_locked;
b8e312
 
b8e312
-		c = _stp_runtime_entryfn_get_context();
b8e312
-
b8e312
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
 			atomic_inc (&_stp_transport_failures);
b8e312
 #ifndef STP_TRANSPORT_RISKY
b8e312
-			_stp_runtime_entryfn_put_context(c);
b8e312
 			return;
b8e312
 #endif
b8e312
 		}
b8e312
@@ -70,26 +61,20 @@ void stp_print_flush(_stp_pbuf *pb)
b8e312
 
b8e312
 		if (inode_locked)
b8e312
 			_stp_transport_unlock_relay_inode();
b8e312
-
b8e312
-		_stp_runtime_entryfn_put_context(c);
b8e312
 	}
b8e312
 
b8e312
 #else  /* !NO_PERCPU_HEADERS */
b8e312
 
b8e312
 	{
b8e312
-		struct context* __restrict__ c = NULL;
b8e312
-		char *bufp = pb->buf;
b8e312
+		char *bufp = log->buf;
b8e312
 		struct _stp_trace t = {	.sequence = _stp_seq_inc(),
b8e312
 					.pdu_len = len};
b8e312
 		size_t bytes_reserved;
b8e312
 		int inode_locked;
b8e312
 
b8e312
-		c = _stp_runtime_entryfn_get_context();
b8e312
-
b8e312
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
 			atomic_inc (&_stp_transport_failures);
b8e312
 #ifndef STP_TRANSPORT_RISKY
b8e312
-			_stp_runtime_entryfn_put_context(c);
b8e312
 			return;
b8e312
 #endif
b8e312
 		}
b8e312
@@ -124,48 +109,24 @@ void stp_print_flush(_stp_pbuf *pb)
b8e312
 
b8e312
 		if (inode_locked)
b8e312
 			_stp_transport_unlock_relay_inode();
b8e312
-
b8e312
-		_stp_runtime_entryfn_put_context(c);
b8e312
 	}
b8e312
 #endif /* !NO_PERCPU_HEADERS */
b8e312
 
b8e312
 #else  /* !STP_BULKMODE */
b8e312
 
b8e312
 	{
b8e312
-		unsigned long flags;
b8e312
-		struct context* __restrict__ c = NULL;
b8e312
-		char *bufp = pb->buf;
b8e312
+		char *bufp = log->buf;
b8e312
 		int inode_locked;
b8e312
 
b8e312
-		/* Prevent probe reentrancy on _stp_print_lock.
b8e312
-		 *
b8e312
-		 * Since stp_print_flush may be called from probe context, we
b8e312
-		 * have to make sure that its lock, _stp_print_lock, can't
b8e312
-		 * possibly be held outside probe context too.  We ensure this
b8e312
-		 * by grabbing the context here, so any probe triggered by this
b8e312
-		 * region will appear reentrant and be skipped rather than
b8e312
-		 * deadlock.  Failure to get_context just means we're already
b8e312
-		 * in a probe, which is fine.
b8e312
-		 *
b8e312
-		 * (see also _stp_ctl_send for a similar situation)
b8e312
-                 *
b8e312
-                 * A better solution would be to replace this
b8e312
-                 * concurrency-control-laden effort with a lockless
b8e312
-                 * algorithm.
b8e312
-		 */
b8e312
-		c = _stp_runtime_entryfn_get_context();
b8e312
-
b8e312
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
 			atomic_inc (&_stp_transport_failures);
b8e312
 #ifndef STP_TRANSPORT_RISKY
b8e312
 			dbug_trans(0, "discarding %zu bytes of data\n", len);
b8e312
-			_stp_runtime_entryfn_put_context(c);
b8e312
 			return;
b8e312
 #endif
b8e312
 		}
b8e312
 
b8e312
 		dbug_trans(1, "calling _stp_data_write...\n");
b8e312
-		stp_spin_lock_irqsave(&_stp_print_lock, flags);
b8e312
 		while (len > 0) {
b8e312
 			size_t bytes_reserved;
b8e312
 
b8e312
@@ -182,12 +143,9 @@ void stp_print_flush(_stp_pbuf *pb)
b8e312
 			    break;
b8e312
 			}
b8e312
 		}
b8e312
-		stp_spin_unlock_irqrestore(&_stp_print_lock, flags);
b8e312
 
b8e312
 		if (inode_locked)
b8e312
 			_stp_transport_unlock_relay_inode();
b8e312
-
b8e312
-		_stp_runtime_entryfn_put_context(c);
b8e312
 	}
b8e312
 #endif /* !STP_BULKMODE */
b8e312
 }
b8e312
diff --git a/runtime/stack.c b/runtime/stack.c
b8e312
index 241ccf793..da23d4395 100644
b8e312
--- a/runtime/stack.c
b8e312
+++ b/runtime/stack.c
b8e312
@@ -690,13 +690,20 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c,
b8e312
 	 * then call _stp_stack_print,
b8e312
 	 * then copy the result into the output string
b8e312
 	 * and clear the print buffer. */
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-	_stp_print_flush();
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
 
b8e312
-	_stp_stack_kernel_print(c, sym_flags);
b8e312
+	if (!_stp_print_trylock_irqsave(&flags)) {
b8e312
+		*str = '\0';
b8e312
+		return;
b8e312
+	}
b8e312
 
b8e312
-	strlcpy(str, pb->buf, size < (int)pb->len ? size : (int)pb->len);
b8e312
-	pb->len = 0;
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	__stp_print_flush(log);
b8e312
+	_stp_stack_kernel_print(c, sym_flags);
b8e312
+	strlcpy(str, log->buf, min_t(int, size, log->len));
b8e312
+	log->len = 0;
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 static void _stp_stack_user_sprint(char *str, int size, struct context* c,
b8e312
@@ -707,13 +714,20 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c,
b8e312
 	 * then call _stp_stack_print,
b8e312
 	 * then copy the result into the output string
b8e312
 	 * and clear the print buffer. */
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-	_stp_print_flush();
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
 
b8e312
-	_stp_stack_user_print(c, sym_flags);
b8e312
+	if (!_stp_print_trylock_irqsave(&flags)) {
b8e312
+		*str = '\0';
b8e312
+		return;
b8e312
+	}
b8e312
 
b8e312
-	strlcpy(str, pb->buf, size < (int)pb->len ? size : (int)pb->len);
b8e312
-	pb->len = 0;
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	__stp_print_flush(log);
b8e312
+	_stp_stack_user_print(c, sym_flags);
b8e312
+	strlcpy(str, log->buf, min_t(int, size, log->len));
b8e312
+	log->len = 0;
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 #endif /* _STACK_C_ */
b8e312
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
b8e312
index 57955334b..44e69b68c 100644
b8e312
--- a/runtime/transport/transport.c
b8e312
+++ b/runtime/transport/transport.c
b8e312
@@ -540,8 +540,8 @@ static void _stp_transport_close(void)
b8e312
 		   current->pid);
b8e312
 	_stp_cleanup_and_exit(0);
b8e312
 	_stp_unregister_ctl_channel();
b8e312
+	_stp_print_cleanup(); /* Requires the transport, so free this first */
b8e312
 	_stp_transport_fs_close();
b8e312
-	_stp_print_cleanup();	/* free print buffers */
b8e312
 	_stp_mem_debug_done();
b8e312
 
b8e312
 	dbug_trans(1, "---- CLOSED ----\n");
b8e312
diff --git a/runtime/vsprintf.c b/runtime/vsprintf.c
b8e312
index 28fd18f16..417d9f7f3 100644
b8e312
--- a/runtime/vsprintf.c
b8e312
+++ b/runtime/vsprintf.c
b8e312
@@ -542,6 +542,8 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
b8e312
 				   number of chars for from string */
b8e312
 	int qualifier;		/* 'h', 'l', or 'L' for integer fields */
b8e312
 	int num_bytes = 0;
b8e312
+	unsigned long irqflags = 0;
b8e312
+	bool got_print_lock = false;
b8e312
 
b8e312
 	/* Reject out-of-range values early */
b8e312
 	if (unlikely((int) size < 0))
b8e312
@@ -724,11 +726,14 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
b8e312
 	    num_bytes = STP_BUFFER_SIZE;
b8e312
 	  }
b8e312
 
b8e312
+	  if (!_stp_print_trylock_irqsave(&irqflags))
b8e312
+	    return 0;
b8e312
 	  str = (char*)_stp_reserve_bytes(num_bytes);
b8e312
 	  if (str == NULL) {
b8e312
 	    _stp_error("Couldn't reserve any print buffer space\n");
b8e312
-	    return 0;
b8e312
+	    goto err_unlock;
b8e312
 	  }
b8e312
+	  got_print_lock = true;
b8e312
 	  size = num_bytes;
b8e312
 	  end = str + size - 1;
b8e312
 
b8e312
@@ -820,8 +825,10 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
b8e312
 					field_width, precision,
b8e312
 					*fmt, flags);
b8e312
 			if (unlikely(str == NULL)) {
b8e312
-				if (num_bytes > 0)
b8e312
+				if (num_bytes > 0) {
b8e312
 					_stp_unreserve_bytes(num_bytes);
b8e312
+					goto err_unlock;
b8e312
+				}
b8e312
 				return 0;
b8e312
 			}
b8e312
 			continue;
b8e312
@@ -923,7 +930,14 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
b8e312
                   /* don't write out a null byte if the buf size is zero */
b8e312
                   *end = '\0';
b8e312
 	}
b8e312
+
b8e312
+	if (got_print_lock)
b8e312
+		_stp_print_unlock_irqrestore(&irqflags);
b8e312
 	return str-buf;
b8e312
+
b8e312
+err_unlock:
b8e312
+	_stp_print_unlock_irqrestore(&irqflags);
b8e312
+	return 0;
b8e312
 }
b8e312
 
b8e312
 #endif /* _VSPRINTF_C_ */
b8e312
diff --git a/translate.cxx b/translate.cxx
b8e312
index 53f1d0725..f0195486c 100644
b8e312
--- a/translate.cxx
b8e312
+++ b/translate.cxx
b8e312
@@ -1354,6 +1354,9 @@ c_unparser::emit_compiled_printfs ()
b8e312
       o->newline() << "unsigned long ptr_value;";
b8e312
       o->newline() << "int num_bytes;";
b8e312
 
b8e312
+      if (print_to_stream)
b8e312
+	  o->newline() << "unsigned long irqflags;";
b8e312
+
b8e312
       o->newline() << "(void) width;";
b8e312
       o->newline() << "(void) precision;";
b8e312
       o->newline() << "(void) ptr_value;";
b8e312
@@ -1452,7 +1455,9 @@ c_unparser::emit_compiled_printfs ()
b8e312
 	    }
b8e312
 
b8e312
 	  o->newline() << "num_bytes = clamp(num_bytes, 0, STP_BUFFER_SIZE);";
b8e312
-	  o->newline() << "str = (char*)_stp_reserve_bytes(num_bytes);";
b8e312
+	  o->newline() << "if (!_stp_print_trylock_irqsave(&irqflags))";
b8e312
+	  o->newline(1) << "return;";
b8e312
+	  o->newline(-1) << "str = (char*)_stp_reserve_bytes(num_bytes);";
b8e312
 	  o->newline() << "end = str ? str + num_bytes - 1 : 0;";
b8e312
         }
b8e312
       else // !print_to_stream
b8e312
@@ -1547,8 +1552,14 @@ c_unparser::emit_compiled_printfs ()
b8e312
 	      o->newline() << "if (unlikely(str == NULL)) {";
b8e312
 	      o->indent(1);
b8e312
 	      if (print_to_stream)
b8e312
+                {
b8e312
 		  o->newline() << "_stp_unreserve_bytes(num_bytes);";
b8e312
-	      o->newline() << "return;";
b8e312
+	          o->newline() << "goto err_unlock;";
b8e312
+                }
b8e312
+              else
b8e312
+                {
b8e312
+	          o->newline() << "return;";
b8e312
+                }
b8e312
 	      o->newline(-1) << "}";
b8e312
 	      break;
b8e312
 
b8e312
@@ -1575,6 +1586,11 @@ c_unparser::emit_compiled_printfs ()
b8e312
 
b8e312
       o->newline(-1) << "}";
b8e312
 
b8e312
+      if (print_to_stream)
b8e312
+        {
b8e312
+          o->newline(-1) << "err_unlock:";
b8e312
+          o->newline(1) << "_stp_print_unlock_irqrestore(&irqflags);";
b8e312
+        }
b8e312
       o->newline(-1) << "}";
b8e312
     }
b8e312
   o->newline() << "#endif // STP_LEGACY_PRINT";
b8e312
commit e8c7a2067ec7fc6315ee9bc34a010ec5f0369c5c
b8e312
Author: Frank Ch. Eigler <fche@redhat.com>
b8e312
Date:   Fri Dec 4 19:33:22 2020 -0500
b8e312
b8e312
    testsuite pr14536.stp: toughen
b8e312
    
b8e312
    This test case stresses nesting of heavy duty processing (backtrace
b8e312
    printing) within kernel interrupt processing paths.  It seems to
b8e312
    sometimes trigger problems - so let's make the test harder, so that
b8e312
    latent problems are more likely to show up.  Instead of quitting after the
b8e312
    first irq_* function hit, stick around for 10 seconds.
b8e312
b8e312
diff --git a/testsuite/systemtap.base/pr14546.stp b/testsuite/systemtap.base/pr14546.stp
b8e312
index 3e59a6f3a..e4c205a8f 100644
b8e312
--- a/testsuite/systemtap.base/pr14546.stp
b8e312
+++ b/testsuite/systemtap.base/pr14546.stp
b8e312
@@ -2,5 +2,6 @@ probe kernel.function("irq_*").call {
b8e312
       x = 10; y = 10; z = 10; w = 10
b8e312
       $1
b8e312
       assert(!(x != 10 || y != 10 || z != 10 || w != 10), "memory got corrupted by " . @1)
b8e312
-      exit()
b8e312
 }
b8e312
+
b8e312
+probe timer.s(10) { exit () }
b8e312
commit cd6399e621646856824ea96b11605a0f52011272
b8e312
Author: Frank Ch. Eigler <fche@redhat.com>
b8e312
Date:   Fri Dec 4 21:33:21 2020 -0500
b8e312
b8e312
    dyninst transport: add _stp_print_*lock_irq* stubs
b8e312
    
b8e312
    Recent code on the transport/linux side needs a few new (stub)
b8e312
    functions and type decls.
b8e312
b8e312
diff --git a/runtime/dyninst/print.c b/runtime/dyninst/print.c
b8e312
index 9d91224ba..c78def272 100644
b8e312
--- a/runtime/dyninst/print.c
b8e312
+++ b/runtime/dyninst/print.c
b8e312
@@ -107,4 +107,18 @@ static void _stp_print_char (const char c)
b8e312
 	}
b8e312
 }
b8e312
 
b8e312
+
b8e312
+/* no-op stub synchronization */
b8e312
+static bool _stp_print_trylock_irqsave(unsigned long *flags)
b8e312
+{
b8e312
+        (void) flags;
b8e312
+        return true;
b8e312
+}
b8e312
+
b8e312
+static void _stp_print_unlock_irqrestore(unsigned long *flags)
b8e312
+{
b8e312
+        (void) flags;
b8e312
+}
b8e312
+
b8e312
+
b8e312
 #endif /* _STAPDYN_PRINT_C_ */
b8e312
diff --git a/runtime/dyninst/runtime_defines.h b/runtime/dyninst/runtime_defines.h
b8e312
index 5c3dec519..d00c76a21 100644
b8e312
--- a/runtime/dyninst/runtime_defines.h
b8e312
+++ b/runtime/dyninst/runtime_defines.h
b8e312
@@ -7,3 +7,6 @@
b8e312
 #define STAPCONF_PAGEFAULT_DISABLE  1
b8e312
 #define pagefault_disable()
b8e312
 #define pagefault_enable()
b8e312
+
b8e312
+typedef int bool;
b8e312
+enum { false=0, true=1 };
b8e312
commit fd93cf71df80f7bb5aae707ea5a5875727a85770
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 9 12:55:10 2020 -0800
b8e312
b8e312
    PR26844: fix off-by-one error when copying printed backtraces
b8e312
    
b8e312
    Since log->buf isn't null-terminated, log->len represents the total
b8e312
    number of bytes present in the log buffer for copying. The use of
b8e312
    strlcpy() here with log->len as its size results in log->len - 1 bytes
b8e312
    being copied, with the log->len'th byte of the output buffer being set
b8e312
    to zero to terminate the string. Use memcpy() instead to remedy this,
b8e312
    while ensuring that the output buffer still has room for the
b8e312
    terminating NUL byte.
b8e312
b8e312
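    A minimal sketch of the difference (illustrative only; assumes log->buf
    holds the four bytes "abcd" with log->len == 4):

        char str[16];
        int bytes;

        /* strlcpy() bounded by log->len copies only log->len - 1 bytes
         * ("abc") and writes the NUL at str[3], silently dropping 'd'. */
        strlcpy(str, log->buf, 4);

        /* memcpy() plus explicit termination, as the fix below does,
         * copies all log->len bytes and still leaves room for the NUL. */
        bytes = min_t(int, sizeof(str) - 1, 4);
        memcpy(str, log->buf, bytes);
        str[bytes] = '\0';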
diff --git a/runtime/stack.c b/runtime/stack.c
b8e312
index da23d4395..85883d6c4 100644
b8e312
--- a/runtime/stack.c
b8e312
+++ b/runtime/stack.c
b8e312
@@ -692,6 +692,7 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c,
b8e312
 	 * and clear the print buffer. */
b8e312
 	struct _stp_log *log;
b8e312
 	unsigned long flags;
b8e312
+	int bytes;
b8e312
 
b8e312
 	if (!_stp_print_trylock_irqsave(&flags)) {
b8e312
 		*str = '\0';
b8e312
@@ -701,7 +702,9 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c,
b8e312
 	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
 	__stp_print_flush(log);
b8e312
 	_stp_stack_kernel_print(c, sym_flags);
b8e312
-	strlcpy(str, log->buf, min_t(int, size, log->len));
b8e312
+	bytes = min_t(int, size - 1, log->len);
b8e312
+	memcpy(str, log->buf, bytes);
b8e312
+	str[bytes] = '\0';
b8e312
 	log->len = 0;
b8e312
 	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
@@ -716,6 +719,7 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c,
b8e312
 	 * and clear the print buffer. */
b8e312
 	struct _stp_log *log;
b8e312
 	unsigned long flags;
b8e312
+	int bytes;
b8e312
 
b8e312
 	if (!_stp_print_trylock_irqsave(&flags)) {
b8e312
 		*str = '\0';
b8e312
@@ -725,7 +729,9 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c,
b8e312
 	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
 	__stp_print_flush(log);
b8e312
 	_stp_stack_user_print(c, sym_flags);
b8e312
-	strlcpy(str, log->buf, min_t(int, size, log->len));
b8e312
+	bytes = min_t(int, size - 1, log->len);
b8e312
+	memcpy(str, log->buf, bytes);
b8e312
+	str[bytes] = '\0';
b8e312
 	log->len = 0;
b8e312
 	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
commit 8819e2a04596deb2fe427d261bebcaf3d2620dfb
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 9 17:22:20 2020 -0800
b8e312
b8e312
    always use per-cpu bulkmode relayfs files to communicate with userspace
b8e312
    
b8e312
    Using a mutex_trylock() in __stp_print_flush() leads to a lot of havoc,
b8e312
    for numerous reasons. Firstly, since __stp_print_flush() can be called from IRQ
b8e312
    context, holding the inode mutex from here would make the mutex owner
b8e312
    become nonsense, since mutex locks can only be held in contexts backed
b8e312
    by the scheduler. Secondly, the mutex_trylock implementation has a
b8e312
    spin_lock() inside of it that leads to two issues: IRQs aren't disabled
b8e312
    when acquiring this spin_lock(), so using it from IRQ context can lead
b8e312
    to a deadlock, and since spin locks can have tracepoints via
b8e312
    lock_acquire(), the spin_lock() can recurse on itself inside a stap
b8e312
    probe and deadlock, like so:
b8e312
    
b8e312
     #0 [ffff88017f6d7a08] kvm_wait at ffffffff81079f5a
b8e312
     #1 [ffff88017f6d7a30] __pv_queued_spin_lock_slowpath at ffffffff8114f51e
b8e312
     #2 [ffff88017f6d7a70] queued_spin_lock_slowpath at ffffffff810e842b
b8e312
     #3 [ffff88017f6d7a80] mutex_trylock at ffffffff81882b1b
b8e312
     #4 [ffff88017f6d7ab8] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #5 [ffff88017f6d7ad8] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #6 [ffff88017f6d7b10] probe_7879 at ffffffffc0a98c85 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #7 [ffff88017f6d7b38] enter_real_tracepoint_probe_1543 at ffffffffc0c3b757 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #8 [ffff88017f6d7b70] enter_tracepoint_probe_1543 at ffffffffc09b117e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #9 [ffff88017f6d7b80] lock_acquire at ffffffff811460ba
b8e312
    
b8e312
    The reason the mutex_trylock() was needed in the first place was that
b8e312
    staprun doesn't properly use the relayfs API when reading buffers in
b8e312
    non-bulk mode. It tries to read all CPUs' buffers from a single thread,
b8e312
    when it should be reading each CPU's buffer from a thread running on
b8e312
    said CPU in order to utilize relayfs' synchronization guarantees, which
b8e312
    are made by disabling IRQs on the local CPU when a buffer is modified.
b8e312
    
b8e312
    This change makes staprun always use per-CPU threads to read print
b8e312
    buffers so that we don't need the mutex_trylock() in the print flush
b8e312
    routine, which resolves a wide variety of serious bugs.
b8e312
    
b8e312
    We also need to adjust the transport sub-buffer count to accommodate
b8e312
    frequent print flushing. The sub-buffer size is now reduced to match the
b8e312
    log buffer size, which is 8192 by default, and the number of sub-buffers
b8e312
    is increased to 256. This uses exactly the same amount of memory as
b8e312
    before.
b8e312
b8e312
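    For reference, the arithmetic behind "exactly the same amount of memory"
    (assuming STP_BUFFER_SIZE keeps its default of 8192 bytes):

        old:   8 sub-buffers x (65536*4) bytes =   8 x 256 KiB = 2 MiB per cpu
        new: 256 sub-buffers x     8192  bytes = 256 x   8 KiB = 2 MiB per cpu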
diff --git a/runtime/print_flush.c b/runtime/print_flush.c
b8e312
index acd6a32d9..f4d72d30f 100644
b8e312
--- a/runtime/print_flush.c
b8e312
+++ b/runtime/print_flush.c
b8e312
@@ -18,6 +18,7 @@
b8e312
 
b8e312
 static void __stp_print_flush(struct _stp_log *log)
b8e312
 {
b8e312
+	char *bufp = log->buf;
b8e312
 	size_t len = log->len;
b8e312
 	void *entry = NULL;
b8e312
 
b8e312
@@ -26,126 +27,20 @@ static void __stp_print_flush(struct _stp_log *log)
b8e312
 		return;
b8e312
 
b8e312
 	log->len = 0;
b8e312
-
b8e312
 	dbug_trans(1, "len = %zu\n", len);
b8e312
-
b8e312
-#ifdef STP_BULKMODE
b8e312
-#ifdef NO_PERCPU_HEADERS
b8e312
-	{
b8e312
-		char *bufp = log->buf;
b8e312
-		int inode_locked;
b8e312
-
b8e312
-		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
-			atomic_inc (&_stp_transport_failures);
b8e312
-#ifndef STP_TRANSPORT_RISKY
b8e312
-			return;
b8e312
-#endif
b8e312
-		}
b8e312
-
b8e312
-		while (len > 0) {
b8e312
-			size_t bytes_reserved;
b8e312
-
b8e312
-			bytes_reserved = _stp_data_write_reserve(len, &entry);
b8e312
-			if (likely(entry && bytes_reserved > 0)) {
b8e312
-				memcpy(_stp_data_entry_data(entry), bufp,
b8e312
-				       bytes_reserved);
b8e312
-				_stp_data_write_commit(entry);
b8e312
-				bufp += bytes_reserved;
b8e312
-				len -= bytes_reserved;
b8e312
-			}
b8e312
-			else {
b8e312
-				atomic_inc(&_stp_transport_failures);
b8e312
-				break;
b8e312
-			}
b8e312
-		}
b8e312
-
b8e312
-		if (inode_locked)
b8e312
-			_stp_transport_unlock_relay_inode();
b8e312
-	}
b8e312
-
b8e312
-#else  /* !NO_PERCPU_HEADERS */
b8e312
-
b8e312
-	{
b8e312
-		char *bufp = log->buf;
b8e312
-		struct _stp_trace t = {	.sequence = _stp_seq_inc(),
b8e312
-					.pdu_len = len};
b8e312
+	do {
b8e312
 		size_t bytes_reserved;
b8e312
-		int inode_locked;
b8e312
 
b8e312
-		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
-			atomic_inc (&_stp_transport_failures);
b8e312
-#ifndef STP_TRANSPORT_RISKY
b8e312
-			return;
b8e312
-#endif
b8e312
-		}
b8e312
-
b8e312
-		bytes_reserved = _stp_data_write_reserve(sizeof(struct _stp_trace), &entry);
b8e312
-		if (likely(entry && bytes_reserved > 0)) {
b8e312
-			/* prevent unaligned access by using memcpy() */
b8e312
-			memcpy(_stp_data_entry_data(entry), &t, sizeof(t));
b8e312
+		bytes_reserved = _stp_data_write_reserve(len, &entry);
b8e312
+		if (likely(entry && bytes_reserved)) {
b8e312
+			memcpy(_stp_data_entry_data(entry), bufp,
b8e312
+			       bytes_reserved);
b8e312
 			_stp_data_write_commit(entry);
b8e312
-		}
b8e312
-		else {
b8e312
+			bufp += bytes_reserved;
b8e312
+			len -= bytes_reserved;
b8e312
+		} else {
b8e312
 			atomic_inc(&_stp_transport_failures);
b8e312
-			goto done;
b8e312
+			break;
b8e312
 		}
b8e312
-
b8e312
-		while (len > 0) {
b8e312
-			bytes_reserved = _stp_data_write_reserve(len, &entry);
b8e312
-			if (likely(entry && bytes_reserved > 0)) {
b8e312
-				memcpy(_stp_data_entry_data(entry), bufp,
b8e312
-				       bytes_reserved);
b8e312
-				_stp_data_write_commit(entry);
b8e312
-				bufp += bytes_reserved;
b8e312
-				len -= bytes_reserved;
b8e312
-			}
b8e312
-			else {
b8e312
-				atomic_inc(&_stp_transport_failures);
b8e312
-				break;
b8e312
-			}
b8e312
-		}
b8e312
-
b8e312
-	done:
b8e312
-
b8e312
-		if (inode_locked)
b8e312
-			_stp_transport_unlock_relay_inode();
b8e312
-	}
b8e312
-#endif /* !NO_PERCPU_HEADERS */
b8e312
-
b8e312
-#else  /* !STP_BULKMODE */
b8e312
-
b8e312
-	{
b8e312
-		char *bufp = log->buf;
b8e312
-		int inode_locked;
b8e312
-
b8e312
-		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
-			atomic_inc (&_stp_transport_failures);
b8e312
-#ifndef STP_TRANSPORT_RISKY
b8e312
-			dbug_trans(0, "discarding %zu bytes of data\n", len);
b8e312
-			return;
b8e312
-#endif
b8e312
-		}
b8e312
-
b8e312
-		dbug_trans(1, "calling _stp_data_write...\n");
b8e312
-		while (len > 0) {
b8e312
-			size_t bytes_reserved;
b8e312
-
b8e312
-			bytes_reserved = _stp_data_write_reserve(len, &entry);
b8e312
-			if (likely(entry && bytes_reserved > 0)) {
b8e312
-				memcpy(_stp_data_entry_data(entry), bufp,
b8e312
-				       bytes_reserved);
b8e312
-				_stp_data_write_commit(entry);
b8e312
-				bufp += bytes_reserved;
b8e312
-				len -= bytes_reserved;
b8e312
-			}
b8e312
-			else {
b8e312
-			    atomic_inc(&_stp_transport_failures);
b8e312
-			    break;
b8e312
-			}
b8e312
-		}
b8e312
-
b8e312
-		if (inode_locked)
b8e312
-			_stp_transport_unlock_relay_inode();
b8e312
-	}
b8e312
-#endif /* !STP_BULKMODE */
b8e312
+	} while (len > 0);
b8e312
 }
b8e312
diff --git a/runtime/transport/relay_v2.c b/runtime/transport/relay_v2.c
b8e312
index ff621f71d..2ba5eea7d 100644
b8e312
--- a/runtime/transport/relay_v2.c
b8e312
+++ b/runtime/transport/relay_v2.c
b8e312
@@ -67,7 +67,7 @@ static size_t __stp_relay_switch_subbuf(struct rchan_buf *buf, size_t length)
b8e312
 		return 0;
b8e312
 
b8e312
 	if (unlikely(length > buf->chan->subbuf_size))
b8e312
-		goto toobig;
b8e312
+		length = buf->chan->subbuf_size;
b8e312
 
b8e312
 	if (buf->offset != buf->chan->subbuf_size + 1) {
b8e312
 		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
b8e312
@@ -98,14 +98,7 @@ static size_t __stp_relay_switch_subbuf(struct rchan_buf *buf, size_t length)
b8e312
 	buf->data = new;
b8e312
 	buf->padding[new_subbuf] = 0;
b8e312
 
b8e312
-	if (unlikely(length + buf->offset > buf->chan->subbuf_size))
b8e312
-		goto toobig;
b8e312
-
b8e312
 	return length;
b8e312
-
b8e312
-toobig:
b8e312
-	buf->chan->last_toobig = length;
b8e312
-	return 0;
b8e312
 }
b8e312
 
b8e312
 static void __stp_relay_wakeup_readers(struct rchan_buf *buf)
b8e312
@@ -117,24 +110,17 @@ static void __stp_relay_wakeup_readers(struct rchan_buf *buf)
b8e312
 
b8e312
 static void __stp_relay_wakeup_timer(stp_timer_callback_parameter_t unused)
b8e312
 {
b8e312
-#ifdef STP_BULKMODE
b8e312
 	int i;
b8e312
-#endif
b8e312
 
b8e312
 	if (atomic_read(&_stp_relay_data.wakeup)) {
b8e312
 		struct rchan_buf *buf;
b8e312
 		
b8e312
 		atomic_set(&_stp_relay_data.wakeup, 0);
b8e312
-#ifdef STP_BULKMODE
b8e312
 		for_each_possible_cpu(i) {
b8e312
 			buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
b8e312
 						    i);
b8e312
 			__stp_relay_wakeup_readers(buf);
b8e312
 		}
b8e312
-#else
b8e312
-		buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
b8e312
-		__stp_relay_wakeup_readers(buf);
b8e312
-#endif
b8e312
 	}
b8e312
 
b8e312
 	if (atomic_read(&_stp_relay_data.transport_state) == STP_TRANSPORT_RUNNING)
b8e312
@@ -235,55 +221,8 @@ static void _stp_transport_data_fs_stop(void)
b8e312
 		atomic_set (&_stp_relay_data.transport_state, STP_TRANSPORT_STOPPED);
b8e312
 		del_timer_sync(&_stp_relay_data.timer);
b8e312
 		dbug_trans(0, "flushing...\n");
b8e312
-		if (_stp_relay_data.rchan) {
b8e312
-			struct rchan_buf *buf;
b8e312
-
b8e312
-			/* NB we cannot call relay_flush() directly here since
b8e312
-			 * we need to do inode locking ourselves.
b8e312
-			 */
b8e312
-
b8e312
-#ifdef STP_BULKMODE
b8e312
-			unsigned int i;
b8e312
-			struct rchan *rchan = _stp_relay_data.rchan;
b8e312
-
b8e312
-			for_each_possible_cpu(i) {
b8e312
-				buf = _stp_get_rchan_subbuf(rchan->buf, i);
b8e312
-				if (buf) {
b8e312
-					struct inode *inode = buf->dentry->d_inode;
b8e312
-
b8e312
-					/* NB we are in the syscall context which
b8e312
-					 * allows sleeping. The following inode
b8e312
-					 * locking might sleep. See PR26131. */
b8e312
-					_stp_lock_inode(inode);
b8e312
-
b8e312
-					/* NB we intentionally avoids calling
b8e312
-					 * our own __stp_relay_switch_subbuf()
b8e312
-					 * since here we can sleep. */
b8e312
-					relay_switch_subbuf(buf, 0);
b8e312
-
b8e312
-					_stp_unlock_inode(inode);
b8e312
-				}
b8e312
-			}
b8e312
-#else  /* !STP_BULKMODE */
b8e312
-			buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
b8e312
-
b8e312
-			if (buf != NULL) {
b8e312
-				struct inode *inode = buf->dentry->d_inode;
b8e312
-
b8e312
-				/* NB we are in the syscall context which allows
b8e312
-				 * sleeping. The following inode locking might
b8e312
-				 * sleep. See PR26131. */
b8e312
-				_stp_lock_inode(inode);
b8e312
-
b8e312
-				/* NB we intentionally avoids calling
b8e312
-				 * our own __stp_relay_switch_subbuf()
b8e312
-				 * since here we can sleep. */
b8e312
-				relay_switch_subbuf(buf, 0);
b8e312
-
b8e312
-				_stp_unlock_inode(inode);
b8e312
-			}
b8e312
-#endif
b8e312
-		}
b8e312
+		if (_stp_relay_data.rchan)
b8e312
+			relay_flush(_stp_relay_data.rchan);
b8e312
 	}
b8e312
 }
b8e312
 
b8e312
@@ -308,9 +247,7 @@ static int _stp_transport_data_fs_init(void)
b8e312
 
b8e312
 	/* Create "trace" file. */
b8e312
 	npages = _stp_subbuf_size * _stp_nsubbufs;
b8e312
-#ifdef STP_BULKMODE
b8e312
 	npages *= num_online_cpus();
b8e312
-#endif
b8e312
 	npages >>= PAGE_SHIFT;
b8e312
 	si_meminfo(&si);
b8e312
 #define MB(i) (unsigned long)((i) >> (20 - PAGE_SHIFT))
b8e312
@@ -347,9 +284,7 @@ static int _stp_transport_data_fs_init(void)
b8e312
         {
b8e312
                 u64 relay_mem;
b8e312
                 relay_mem = _stp_subbuf_size * _stp_nsubbufs;
b8e312
-#ifdef STP_BULKMODE
b8e312
                 relay_mem *= num_online_cpus();
b8e312
-#endif
b8e312
                 _stp_allocated_net_memory += relay_mem;
b8e312
                 _stp_allocated_memory += relay_mem;
b8e312
         }
b8e312
@@ -386,12 +321,7 @@ _stp_data_write_reserve(size_t size_request, void **entry)
b8e312
 		return -EINVAL;
b8e312
 
b8e312
 	buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
b8e312
-#ifdef STP_BULKMODE
b8e312
-				    smp_processor_id()
b8e312
-#else
b8e312
-				    0
b8e312
-#endif
b8e312
-				    );
b8e312
+				    smp_processor_id());
b8e312
 	if (unlikely(buf->offset + size_request > buf->chan->subbuf_size)) {
b8e312
 		size_request = __stp_relay_switch_subbuf(buf, size_request);
b8e312
 		if (!size_request)
b8e312
@@ -411,65 +341,10 @@ static unsigned char *_stp_data_entry_data(void *entry)
b8e312
 
b8e312
 static int _stp_data_write_commit(void *entry)
b8e312
 {
b8e312
-	/* Nothing to do here. */
b8e312
-	return 0;
b8e312
-}
b8e312
-
b8e312
-static noinline int _stp_transport_trylock_relay_inode(void)
b8e312
-{
b8e312
-	unsigned i;
b8e312
 	struct rchan_buf *buf;
b8e312
-	struct inode *inode;
b8e312
-#ifdef DEBUG_TRANS
b8e312
-	cycles_t begin;
b8e312
-#endif
b8e312
 
b8e312
 	buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
b8e312
-#ifdef STP_BULKMODE
b8e312
-				    smp_processor_id()
b8e312
-#else
b8e312
-				    0
b8e312
-#endif
b8e312
-				    );
b8e312
-	if (buf == NULL)
b8e312
-		return 0;
b8e312
-
b8e312
-	inode = buf->dentry->d_inode;
b8e312
-
b8e312
-#ifdef DEBUG_TRANS
b8e312
-	begin = get_cycles();
b8e312
-#endif
b8e312
-
b8e312
-	/* NB this bounded spinlock is needed for stream mode. it is observed
b8e312
-	 * that almost all of the iterations needed are less than 50K iterations
b8e312
-	 * or about 300K cycles.
b8e312
-	 */
b8e312
-	for (i = 0; i < 50 * 1000; i++) {
b8e312
-		if (_stp_trylock_inode(inode)) {
b8e312
-			dbug_trans(3, "got inode lock: i=%u: cycles: %llu", i,
b8e312
-				   get_cycles() - begin);
b8e312
-			return 1;
b8e312
-		}
b8e312
-	}
b8e312
-
b8e312
-	dbug_trans(0, "failed to get inode lock: i=%u: cycles: %llu", i,
b8e312
-		   get_cycles() - begin);
b8e312
+				    smp_processor_id());
b8e312
+	__stp_relay_switch_subbuf(buf, 0);
b8e312
 	return 0;
b8e312
 }
b8e312
-
b8e312
-static void _stp_transport_unlock_relay_inode(void)
b8e312
-{
b8e312
-	struct rchan_buf *buf;
b8e312
-
b8e312
-	buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
b8e312
-#ifdef STP_BULKMODE
b8e312
-				    smp_processor_id()
b8e312
-#else
b8e312
-				    0
b8e312
-#endif
b8e312
-				    );
b8e312
-	if (buf == NULL)
b8e312
-		return;
b8e312
-
b8e312
-	_stp_unlock_inode(buf->dentry->d_inode);
b8e312
-}
b8e312
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
b8e312
index 96426eb7b..1be3e9485 100644
b8e312
--- a/runtime/transport/transport.c
b8e312
+++ b/runtime/transport/transport.c
b8e312
@@ -49,7 +49,6 @@ static int _stp_probes_started = 0;
b8e312
  * transport state flag is atomic. */
b8e312
 static atomic_t _stp_transport_state = ATOMIC_INIT(_STP_TS_UNINITIALIZED);
b8e312
 
b8e312
-static inline int _stp_trylock_inode(struct inode *inode);
b8e312
 static inline void _stp_lock_inode(struct inode *inode);
b8e312
 static inline void _stp_unlock_inode(struct inode *inode);
b8e312
 
b8e312
@@ -70,8 +69,8 @@ static inline void _stp_unlock_inode(struct inode *inode);
b8e312
 #include "procfs.c"
b8e312
 #include "control.c"
b8e312
 
b8e312
-static unsigned _stp_nsubbufs = 8;
b8e312
-static unsigned _stp_subbuf_size = 65536*4;
b8e312
+static unsigned _stp_nsubbufs = 256;
b8e312
+static unsigned _stp_subbuf_size = STP_BUFFER_SIZE;
b8e312
 
b8e312
 /* module parameters */
b8e312
 static int _stp_bufsize;
b8e312
@@ -643,23 +642,6 @@ err0:
b8e312
 	return -1;
b8e312
 }
b8e312
 
b8e312
-/* returns 1 when the lock is successfully acquired, 0 otherwise. */
b8e312
-static inline int _stp_trylock_inode(struct inode *inode)
b8e312
-{
b8e312
-#ifdef STAPCONF_INODE_RWSEM
b8e312
-	return inode_trylock(inode);
b8e312
-#else
b8e312
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
b8e312
-	return mutex_trylock(&inode->i_mutex);
b8e312
-#else
b8e312
-	/* NB down_trylock() uses a different convention where 0 means
b8e312
-	 * the lock is successfully acquired.
b8e312
-	 */
b8e312
-	return !down_trylock(&inode->i_sem);
b8e312
-#endif
b8e312
-#endif
b8e312
-}
b8e312
-
b8e312
 static inline void _stp_lock_inode(struct inode *inode)
b8e312
 {
b8e312
 #ifdef STAPCONF_INODE_RWSEM
b8e312
diff --git a/runtime/transport/transport.h b/runtime/transport/transport.h
b8e312
index 51723b7f5..cc09fc0ae 100644
b8e312
--- a/runtime/transport/transport.h
b8e312
+++ b/runtime/transport/transport.h
b8e312
@@ -98,24 +98,6 @@ enum _stp_transport_state {
b8e312
  */
b8e312
 static enum _stp_transport_state _stp_transport_get_state(void);
b8e312
 
b8e312
-/*
b8e312
- * _stp_transport_trylock_relay_inode
b8e312
- *
b8e312
- * This function locks the relay file inode to protect against relay readers
b8e312
- * (i.e., staprun/stapio).
b8e312
- * Returns whether the lock is successfully obtained.
b8e312
- */
b8e312
-static noinline int _stp_transport_trylock_relay_inode(void);
b8e312
-
b8e312
-/*
b8e312
- * _stp_transport_unlock_relay_inode
b8e312
- *
b8e312
- * This function releases the lock obtained by
b8e312
- * _stp_transport_trylock_relay_inode.
b8e312
- * should only call this when the lock is indeed obtained.
b8e312
- */
b8e312
-static void _stp_transport_unlock_relay_inode(void);
b8e312
-
b8e312
 /*
b8e312
  * _stp_transport_data_fs_init
b8e312
  *
b8e312
diff --git a/staprun/relay.c b/staprun/relay.c
b8e312
index 2f5f2e06a..c76e76719 100644
b8e312
--- a/staprun/relay.c
b8e312
+++ b/staprun/relay.c
b8e312
@@ -131,6 +131,7 @@ static void *reader_thread(void *data)
b8e312
 	sigset_t sigs;
b8e312
 	off_t wsize = 0;
b8e312
 	int fnum = 0;
b8e312
+	cpu_set_t cpu_mask;
b8e312
 
b8e312
 	sigemptyset(&sigs);
b8e312
 	sigaddset(&sigs,SIGUSR2);
b8e312
@@ -139,21 +140,18 @@ static void *reader_thread(void *data)
b8e312
 	sigfillset(&sigs);
b8e312
 	sigdelset(&sigs,SIGUSR2);
b8e312
 
b8e312
-	if (bulkmode) {
b8e312
-		cpu_set_t cpu_mask;
b8e312
-		CPU_ZERO(&cpu_mask);
b8e312
-		CPU_SET(cpu, &cpu_mask);
b8e312
-		if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 )
b8e312
-			_perr("sched_setaffinity");
b8e312
+	CPU_ZERO(&cpu_mask);
b8e312
+	CPU_SET(cpu, &cpu_mask);
b8e312
+	if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 )
b8e312
+		_perr("sched_setaffinity");
b8e312
 #ifdef NEED_PPOLL
b8e312
-		/* Without a real ppoll, there is a small race condition that could */
b8e312
-		/* block ppoll(). So use a timeout to prevent that. */
b8e312
-		timeout->tv_sec = 10;
b8e312
-		timeout->tv_nsec = 0;
b8e312
+	/* Without a real ppoll, there is a small race condition that could */
b8e312
+	/* block ppoll(). So use a timeout to prevent that. */
b8e312
+	timeout->tv_sec = 10;
b8e312
+	timeout->tv_nsec = 0;
b8e312
 #else
b8e312
-		timeout = NULL;
b8e312
+	timeout = NULL;
b8e312
 #endif
b8e312
-	}
b8e312
 
b8e312
         if (reader_timeout_ms && timeout) {
b8e312
                 timeout->tv_sec = reader_timeout_ms / 1000;
b8e312
@@ -358,11 +356,6 @@ int init_relayfs(void)
b8e312
 		_err("couldn't open %s.\n", buf);
b8e312
 		return -1;
b8e312
 	}
b8e312
-	if (ncpus > 1 && bulkmode == 0) {
b8e312
-		_err("ncpus=%d, bulkmode = %d\n", ncpus, bulkmode);
b8e312
-		_err("This is inconsistent! Please file a bug report. Exiting now.\n");
b8e312
-		return -1;
b8e312
-	}
b8e312
 
b8e312
         /* PR7097 */
b8e312
         if (load_only)
b8e312
commit d86b64029598f69b47d9cf4295f30b7093f38cfc
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 9 17:22:27 2020 -0800
b8e312
b8e312
    Revert "REVERTME: tapset-timers: work around on-the-fly deadlocks caused by mutex_trylock"
b8e312
    
b8e312
    This reverts commit 6a27888b118b7a94650a68aae028957cdd5fb5f5.
b8e312
    
b8e312
    No longer needed. As promised, we're reverting this.
b8e312
b8e312
diff --git a/tapset-timers.cxx b/tapset-timers.cxx
b8e312
index 503498c85..10da17cda 100644
b8e312
--- a/tapset-timers.cxx
b8e312
+++ b/tapset-timers.cxx
b8e312
@@ -391,11 +391,11 @@ hrtimer_derived_probe_group::emit_module_refresh (systemtap_session& s)
b8e312
   s.op->newline(+1) <<   "struct stap_hrtimer_probe* stp = &stap_hrtimer_probes[i];";
b8e312
   // timer disabled, but condition says enabled?
b8e312
   s.op->newline( 0) <<   "if (!stp->enabled && stp->probe->cond_enabled) {";
b8e312
-  s.op->newline(+1) <<     "//dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
+  s.op->newline(+1) <<     "dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
   s.op->newline( 0) <<     "_stp_hrtimer_start(stp);";
b8e312
   // timer enabled, but condition says disabled?
b8e312
   s.op->newline(-1) <<   "} else if (stp->enabled && !stp->probe->cond_enabled) {";
b8e312
-  s.op->newline(+1) <<     "//dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
+  s.op->newline(+1) <<     "dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
   s.op->newline( 0) <<     "_stp_hrtimer_cancel(stp);";
b8e312
   s.op->newline(-1) <<   "}";
b8e312
   s.op->newline( 0) <<   "stp->enabled = stp->probe->cond_enabled;";
b8e312
commit 3abe2c40b2dae499aff2e31beff121fbe43f7654
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Fri Dec 11 12:31:25 2020 -0800
b8e312
b8e312
    transport: set is_global to zero even when bulkmode is disabled
b8e312
    
b8e312
    This is needed now that we always want per-cpu logger threads. When
b8e312
    is_global is set to a non-zero value, relay won't create per-cpu log
b8e312
    files.
b8e312
b8e312
diff --git a/runtime/transport/debugfs.c b/runtime/transport/debugfs.c
b8e312
index 28a5bf89d..bd20281a0 100644
b8e312
--- a/runtime/transport/debugfs.c
b8e312
+++ b/runtime/transport/debugfs.c
b8e312
@@ -256,13 +256,8 @@ __stp_debugfs_relay_create_buf_file_callback(const char *filename,
b8e312
 	 * cause relay_open() to create a single global buffer rather
b8e312
 	 * than the default set of per-cpu buffers.
b8e312
 	 */
b8e312
-	if (is_global) {
b8e312
-#ifdef STP_BULKMODE
b8e312
+	if (is_global)
b8e312
 		*is_global = 0;
b8e312
-#else
b8e312
-		*is_global = 1;
b8e312
-#endif
b8e312
-	}
b8e312
 
b8e312
 	if (IS_ERR(file)) {
b8e312
 		file = NULL;
b8e312
diff --git a/runtime/transport/procfs.c b/runtime/transport/procfs.c
b8e312
index 262409356..b0a5d5760 100644
b8e312
--- a/runtime/transport/procfs.c
b8e312
+++ b/runtime/transport/procfs.c
b8e312
@@ -328,13 +328,8 @@ __stp_procfs_relay_create_buf_file_callback(const char *filename,
b8e312
   unsigned i = 0;
b8e312
   struct inode* in;
b8e312
   
b8e312
-  if (is_global) {
b8e312
-#ifdef STP_BULKMODE
b8e312
+  if (is_global)
b8e312
           *is_global = 0;
b8e312
-#else
b8e312
-          *is_global = 1;
b8e312
-#endif
b8e312
-  }
b8e312
   
b8e312
   if (parent != _stp_procfs_module_dir_path.dentry)
b8e312
     goto out;
b8e312
commit a26bf7890196395d73ac193b23e271398731745d
b8e312
Author: Frank Ch. Eigler <fche@redhat.com>
b8e312
Date:   Fri Dec 11 15:39:29 2020 -0500
b8e312
b8e312
    relay transport: comment on STP_BULK message
b8e312
    
b8e312
    While we've eliminated any STP_BULKMODE effects from the way relayfs
b8e312
    files are used ("always bulkmode"), staprun/stapio still need to know
b8e312
    whether the user intended "stap -b" or not, so they can save the
b8e312
    stpd_cpu* files separately.
b8e312
b8e312
diff --git a/runtime/transport/control.c b/runtime/transport/control.c
b8e312
index 9343b3c28..d123bef2f 100644
b8e312
--- a/runtime/transport/control.c
b8e312
+++ b/runtime/transport/control.c
b8e312
@@ -88,6 +88,9 @@ static ssize_t _stp_ctl_write_cmd(struct file *file, const char __user *buf, siz
b8e312
 		break;
b8e312
 
b8e312
 	case STP_BULK:
b8e312
+                // NB: this signals the runtime to save separate
b8e312
+                // per-cpu files; our kernel->userspace files are now
b8e312
+                // always bulkmode (trace$N files).
b8e312
 #ifdef STP_BULKMODE
b8e312
                 // no action needed
b8e312
                 break;
b8e312
commit b43eb4ed690bf2421978ed2896667e45e60c3400
b8e312
Author: Cosmin Tanislav <demonsingur@gmail.com>
b8e312
Date:   Thu Dec 10 16:48:54 2020 -0500
b8e312
b8e312
    bugfix: runtime: transport: handle more error cases in module init
b8e312
    
b8e312
    Signed-off-by: Sultan Alsawaf <sultan@openresty.com>
b8e312
b8e312
diff --git a/runtime/transport/relay_v2.c b/runtime/transport/relay_v2.c
b8e312
index 2ba5eea7d..27729f4c8 100644
b8e312
--- a/runtime/transport/relay_v2.c
b8e312
+++ b/runtime/transport/relay_v2.c
b8e312
@@ -277,6 +277,7 @@ static int _stp_transport_data_fs_init(void)
b8e312
 #endif  /* (RELAYFS_CHANNEL_VERSION < 7) */
b8e312
 	if (!_stp_relay_data.rchan) {
b8e312
 		rc = -ENOENT;
b8e312
+		errk("%s: relay_open() failed: %d\n", THIS_MODULE->name, rc);
b8e312
 		goto err;
b8e312
 	}
b8e312
         /* Increment _stp_allocated_memory and _stp_allocated_net_memory to account for buffers
b8e312
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
b8e312
index 1be3e9485..f005e14e2 100644
b8e312
--- a/runtime/transport/transport.c
b8e312
+++ b/runtime/transport/transport.c
b8e312
@@ -552,6 +552,8 @@ static void _stp_transport_close(void)
b8e312
  */
b8e312
 static int _stp_transport_init(void)
b8e312
 {
b8e312
+	int ret;
b8e312
+
b8e312
 	dbug_trans(1, "transport_init\n");
b8e312
 #ifdef STAPCONF_TASK_UID
b8e312
 	_stp_uid = current->uid;
b8e312
@@ -603,20 +605,28 @@ static int _stp_transport_init(void)
b8e312
 		dbug_trans(1, "Using %d subbufs of size %d\n", _stp_nsubbufs, _stp_subbuf_size);
b8e312
 	}
b8e312
 
b8e312
-	if (_stp_transport_fs_init(THIS_MODULE->name) != 0)
b8e312
+	ret = _stp_transport_fs_init(THIS_MODULE->name);
b8e312
+	if (ret)
b8e312
 		goto err0;
b8e312
 
b8e312
 	/* create control channel */
b8e312
-	if (_stp_register_ctl_channel() < 0)
b8e312
+	ret = _stp_register_ctl_channel();
b8e312
+	if (ret < 0)
b8e312
 		goto err1;
b8e312
 
b8e312
 	/* create print buffers */
b8e312
-	if (_stp_print_init() < 0)
b8e312
+	ret = _stp_print_init();
b8e312
+	if (ret < 0) {
b8e312
+		errk("%s: can't create print buffers!", THIS_MODULE->name);
b8e312
 		goto err2;
b8e312
+	}
b8e312
 
b8e312
 	/* set _stp_module_self dynamic info */
b8e312
-	if (_stp_module_update_self() < 0)
b8e312
+	ret = _stp_module_update_self();
b8e312
+	if (ret < 0) {
b8e312
+		errk("%s: can't update dynamic info!", THIS_MODULE->name);
b8e312
 		goto err3;
b8e312
+	}
b8e312
 
b8e312
 	/* start transport */
b8e312
 	_stp_transport_data_fs_start();
b8e312
@@ -639,7 +649,7 @@ err2:
b8e312
 err1:
b8e312
 	_stp_transport_fs_close();
b8e312
 err0:
b8e312
-	return -1;
b8e312
+	return ret;
b8e312
 }
b8e312
 
b8e312
 static inline void _stp_lock_inode(struct inode *inode)
b8e312
commit 341bf33f14062269c52bcebaa309518d9972ca00
b8e312
Author: Frank Ch. Eigler <fche@redhat.com>
b8e312
Date:   Fri Dec 11 18:06:36 2020 -0500
b8e312
b8e312
    staprun: handle more and fewer cpus better
b8e312
    
b8e312
    NR_CPUS was a hard-coded minimum and maximum on the number of CPUs
b8e312
    worth of trace$N files staprun/stapio would open at startup.  While a
b8e312
    constant is useful for array sizing (and so might as well be really
b8e312
    large), the actual iteration should be informed by get_nprocs_conf(3).
b8e312
    
b8e312
    This patch replaces NR_CPUS with MAX_NR_CPUS (now 1024, why not), and
b8e312
    limits open/thread iterations to the actual number of processors.  It
b8e312
    even prints an error if a behemoth >1K-core machine comes into being.
b8e312
b8e312
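    A condensed sketch of the convention (illustrative only; the helper name is
    hypothetical): arrays keep the compile-time MAX_NR_CPUS bound, while the
    open/thread loops are bounded by the CPU count reported at runtime.

        #include <sys/sysinfo.h>

        static int relay_fd[MAX_NR_CPUS];    /* static sizing: compile-time bound */

        static int open_trace_files(void)    /* hypothetical helper */
        {
                int i, nprocs = get_nprocs_conf();  /* runtime bound */

                if (nprocs > MAX_NR_CPUS)
                        return -1;            /* >1K-core behemoth: report and bail */
                for (i = 0; i < nprocs; i++)
                        relay_fd[i] = -1;     /* then try opening trace%d for each */
                return 0;
        }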
diff --git a/staprun/relay.c b/staprun/relay.c
b8e312
index c76e76719..3eb8df34b 100644
b8e312
--- a/staprun/relay.c
b8e312
+++ b/staprun/relay.c
b8e312
@@ -12,16 +12,16 @@
b8e312
 
b8e312
 #include "staprun.h"
b8e312
 
b8e312
-int out_fd[NR_CPUS];
b8e312
+int out_fd[MAX_NR_CPUS];
b8e312
 int monitor_end = 0;
b8e312
-static pthread_t reader[NR_CPUS];
b8e312
-static int relay_fd[NR_CPUS];
b8e312
-static int avail_cpus[NR_CPUS];
b8e312
-static int switch_file[NR_CPUS];
b8e312
-static pthread_mutex_t mutex[NR_CPUS];
b8e312
+static pthread_t reader[MAX_NR_CPUS];
b8e312
+static int relay_fd[MAX_NR_CPUS];
b8e312
+static int avail_cpus[MAX_NR_CPUS];
b8e312
+static int switch_file[MAX_NR_CPUS];
b8e312
+static pthread_mutex_t mutex[MAX_NR_CPUS];
b8e312
 static int bulkmode = 0;
b8e312
 static volatile int stop_threads = 0;
b8e312
-static time_t *time_backlog[NR_CPUS];
b8e312
+static time_t *time_backlog[MAX_NR_CPUS];
b8e312
 static int backlog_order=0;
b8e312
 #define BACKLOG_MASK ((1 << backlog_order) - 1)
b8e312
 #define MONITORLINELENGTH 4096
b8e312
@@ -313,12 +313,19 @@ int init_relayfs(void)
b8e312
 	if (send_request(STP_BULK, rqbuf, sizeof(rqbuf)) == 0)
b8e312
 		bulkmode = 1;
b8e312
 
b8e312
-	/* Try to open a slew of per-cpu trace%d files.  Per PR19241, we
b8e312
-	   need to go through all potentially present CPUs up to NR_CPUS, that
b8e312
-	   we hope is a reasonable limit.  For !bulknode, "trace0" will be
b8e312
-	   typically used. */
b8e312
+	/* Try to open a slew of per-cpu trace%d files.  Per PR19241,
b8e312
+	   we need to go through all potentially present CPUs up to
b8e312
+	   get_nprocs_conf(), up to MAX_NR_CPUS (used for array
b8e312
+	   allocations).  For !bulknode, "trace0" will be typically
b8e312
+	   used, prior to systemtap 4.5; after, all are used. */
b8e312
 
b8e312
-	for (i = 0; i < NR_CPUS; i++) {
b8e312
+        int nprocs = get_nprocs_conf();
b8e312
+        if (nprocs > MAX_NR_CPUS) {
b8e312
+                err("Too many CPUs: get_nprocs_conf()=%d vs MAX_NR_CPUS=%d\n", nprocs, MAX_NR_CPUS);
b8e312
+                return -1;
b8e312
+        }
b8e312
+        
b8e312
+	for (i = 0; i < nprocs; i++) {
b8e312
                 relay_fd[i] = -1;
b8e312
 
b8e312
 #ifdef HAVE_OPENAT
b8e312
@@ -348,7 +355,8 @@ int init_relayfs(void)
b8e312
 		}
b8e312
 	}
b8e312
 	ncpus = cpui;
b8e312
-	dbug(2, "ncpus=%d, bulkmode = %d\n", ncpus, bulkmode);
b8e312
+        /* ncpus could be smaller than nprocs if some cpus are offline */
b8e312
+	dbug(2, "ncpus=%d, nprocs=%d, bulkmode=%d\n", ncpus, nprocs, bulkmode);
b8e312
 	for (i = 0; i < ncpus; i++)
b8e312
 		dbug(2, "cpui=%d, relayfd=%d\n", i, avail_cpus[i]);
b8e312
 
b8e312
diff --git a/staprun/relay_old.c b/staprun/relay_old.c
b8e312
index f0d2e918f..248e6059d 100644
b8e312
--- a/staprun/relay_old.c
b8e312
+++ b/staprun/relay_old.c
b8e312
@@ -14,12 +14,12 @@
b8e312
 #include "staprun.h"
b8e312
 
b8e312
 /* temporary per-cpu output written here for relayfs, filebase0...N */
b8e312
-static int relay_fd[NR_CPUS];
b8e312
-static int proc_fd[NR_CPUS];
b8e312
-static FILE *percpu_tmpfile[NR_CPUS];
b8e312
-static char *relay_buffer[NR_CPUS];
b8e312
-static pthread_t reader[NR_CPUS];
b8e312
-static int switch_file[NR_CPUS];
b8e312
+static int relay_fd[MAX_NR_CPUS];
b8e312
+static int proc_fd[MAX_NR_CPUS];
b8e312
+static FILE *percpu_tmpfile[MAX_NR_CPUS];
b8e312
+static char *relay_buffer[MAX_NR_CPUS];
b8e312
+static pthread_t reader[MAX_NR_CPUS];
b8e312
+static int switch_file[MAX_NR_CPUS];
b8e312
 static int bulkmode = 0;
b8e312
 unsigned subbuf_size = 0;
b8e312
 unsigned n_subbufs = 0;
b8e312
@@ -37,7 +37,7 @@ static struct buf_status
b8e312
 {
b8e312
 	struct _stp_buf_info info;
b8e312
 	unsigned max_backlog; /* max # sub-buffers ready at one time */
b8e312
-} status[NR_CPUS];
b8e312
+} status[MAX_NR_CPUS];
b8e312
 
b8e312
 
b8e312
 /**
b8e312
@@ -461,7 +461,13 @@ int init_oldrelayfs(void)
b8e312
 	relay_fd[0] = -1;
b8e312
 	out_fd[0] = 0;
b8e312
 
b8e312
-	for (i = 0; i < NR_CPUS; i++) {
b8e312
+        int nprocs = get_nprocs_conf();
b8e312
+        if (nprocs > MAX_NR_CPUS) {
b8e312
+                err("Too many CPUs: get_nprocs_conf()=%d vs MAX_NR_CPUS=%d\n", nprocs, MAX_NR_CPUS);
b8e312
+                goto err;
b8e312
+        }
b8e312
+        
b8e312
+	for (i = 0; i < nprocs; i++) {
b8e312
 		int ret = open_relayfs_files(i, relay_filebase, proc_filebase);
b8e312
 		if (ret == 0)
b8e312
 			break;
b8e312
@@ -472,7 +478,8 @@ int init_oldrelayfs(void)
b8e312
 	}
b8e312
 
b8e312
 	ncpus = i;
b8e312
-	dbug(2, "ncpus=%d\n", ncpus);
b8e312
+        /* ncpus could be smaller than nprocs if some cpus are offline */
b8e312
+	dbug(2, "ncpus=%d, nprocs=%d\n", ncpus, nprocs);
b8e312
 
b8e312
 	if (ncpus == 0) {
b8e312
 		err("Couldn't open relayfs files.\n");
b8e312
diff --git a/staprun/stap_merge.c b/staprun/stap_merge.c
b8e312
index 7507f0e3d..87de7d465 100644
b8e312
--- a/staprun/stap_merge.c
b8e312
+++ b/staprun/stap_merge.c
b8e312
@@ -31,15 +31,15 @@ static void usage (char *prog)
b8e312
 }
b8e312
 
b8e312
 #define TIMESTAMP_SIZE (sizeof(int))
b8e312
-#define NR_CPUS 256
b8e312
+#define MAX_NR_CPUS 1024
b8e312
 
b8e312
 int main (int argc, char *argv[])
b8e312
 {
b8e312
 	char *buf, *outfile_name = NULL;
b8e312
 	int c, i, j, rc, dropped=0;
b8e312
-	long count=0, min, num[NR_CPUS] = { 0 };
b8e312
+	long count=0, min, num[MAX_NR_CPUS] = { 0 };
b8e312
 	FILE *ofp = NULL;
b8e312
-	FILE *fp[NR_CPUS] = { 0 };
b8e312
+	FILE *fp[MAX_NR_CPUS] = { 0 };
b8e312
 	int ncpus, len, verbose = 0;
b8e312
 	int bufsize = 65536;
b8e312
 
b8e312
@@ -67,6 +67,10 @@ int main (int argc, char *argv[])
b8e312
 
b8e312
 	i = 0;
b8e312
 	while (optind < argc) {
b8e312
+                if (i >= MAX_NR_CPUS) {
b8e312
+                        fprintf(stderr, "too many files (MAX_NR_CPUS=%d)\n", MAX_NR_CPUS);
b8e312
+			return -1;
b8e312
+		}                  
b8e312
 		fp[i] = fopen(argv[optind++], "r");
b8e312
 		if (!fp[i]) {
b8e312
 			fprintf(stderr, "error opening file %s.\n", argv[optind - 1]);
b8e312
diff --git a/staprun/staprun.h b/staprun/staprun.h
b8e312
index e05dbe5b6..2d68bf527 100644
b8e312
--- a/staprun/staprun.h
b8e312
+++ b/staprun/staprun.h
b8e312
@@ -37,6 +37,7 @@
b8e312
 #include <sys/wait.h>
b8e312
 #include <sys/statfs.h>
b8e312
 #include <syslog.h>
b8e312
+#include <sys/sysinfo.h>
b8e312
 
b8e312
 /* Include config.h to pick up dependency for --prefix usage. */
b8e312
 #include "../config.h"
b8e312
@@ -285,10 +286,10 @@ extern int optopt;
b8e312
 extern int optind;
b8e312
 
b8e312
 /* maximum number of CPUs we can handle */
b8e312
-#define NR_CPUS 256
b8e312
+#define MAX_NR_CPUS 1024
b8e312
 
b8e312
 /* relay*.c uses these */
b8e312
-extern int out_fd[NR_CPUS];
b8e312
+extern int out_fd[MAX_NR_CPUS];
b8e312
 
b8e312
 /* relay_old uses these. Set in ctl.c */
b8e312
 extern unsigned subbuf_size;
b8e312
commit b4b5a29b51586f75de16cacdb44bdf0b3ad0478e
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Mon Dec 14 13:20:34 2020 -0800
b8e312
b8e312
    staprun: use the correct out_fd when bulkmode and fsize_max aren't used
b8e312
    
b8e312
    When bulkmode and fsize_max aren't used, there is only one output fd and
b8e312
    it is stored at out_fd[avail_cpus[0]].
b8e312
b8e312
diff --git a/staprun/relay.c b/staprun/relay.c
b8e312
index 3eb8df34b..d0202e52f 100644
b8e312
--- a/staprun/relay.c
b8e312
+++ b/staprun/relay.c
b8e312
@@ -232,10 +232,17 @@ static void *reader_thread(void *data)
b8e312
 					wbuf += bytes;
b8e312
 					wsize += bytes;
b8e312
 				} else {
b8e312
-	                                rc = write(out_fd[cpu], wbuf, wbytes);
b8e312
+					int fd;
b8e312
+					/* Only bulkmode and fsize_max use per-cpu output files. Otherwise,
b8e312
+					   there's just a single output fd stored at out_fd[avail_cpus[0]]. */
b8e312
+					if (bulkmode || fsize_max)
b8e312
+						fd = out_fd[cpu];
b8e312
+					else
b8e312
+						fd = out_fd[avail_cpus[0]];
b8e312
+	                                rc = write(fd, wbuf, wbytes);
b8e312
 	                                if (rc <= 0) {
b8e312
 						perr("Couldn't write to output %d for cpu %d, exiting.",
b8e312
-	                                             out_fd[cpu], cpu);
b8e312
+	                                             fd, cpu);
b8e312
 	                                        goto error_out;
b8e312
 	                                }
b8e312
 	                                wbytes -= rc;
b8e312
commit b26b4e2c257e0bd65134eed5e51d754227a4ed3f
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 30 14:21:42 2020 -0800
b8e312
b8e312
    task_finder2: fix panics due to broken task work cancellation
b8e312
    
b8e312
    The task_work_cancel() API uses function pointers to uniquely identify
b8e312
    task work structs, so there's no guarantee that a specific task work
b8e312
    struct we want to cancel is the one that will actually get canceled.
b8e312
    This issue would cause task work structs to be freed while they were
b8e312
    still queued up on the task's task-worker list.
b8e312
    
b8e312
    This is an example of one such panic, where the DEBUG_MEM feature
b8e312
    reported that a __stp_tf_task_work struct (56 bytes) wasn't freed,
b8e312
    because that specific task worker got canceled and instead an active
b8e312
    task worker got freed!
b8e312
    
b8e312
    orxray_resty_mem_X_35062: ERROR: Memory ffff8809ed388620 len=56 allocation type: kmalloc. Not freed.
b8e312
    BUG: unable to handle kernel paging request at ffffffffa0570877
b8e312
    IP: [<ffffffffa0570877>] 0xffffffffa0570876
b8e312
    PGD 1abd067 PUD 1abe063 PMD 1028286067 PTE 0
b8e312
    Oops: 0010 [#1] SMP
b8e312
    CPU: 3 PID: 1338 Comm: nginx Tainted: G           OE  ------------   3.10.0-514.10.2.el7.x86_64.debug #1
b8e312
    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
b8e312
    task: ffff880eae2d0000 ti: ffff880eaf2e4000 task.ti: ffff880eaf2e4000
b8e312
    RIP: 0010:[<ffffffffa0570877>]  [<ffffffffa0570877>] 0xffffffffa0570876
b8e312
    RSP: 0018:ffff880eaf2e7d78  EFLAGS: 00010282
b8e312
    RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000
b8e312
    RDX: ffff8809ed388640 RSI: 0000000000000000 RDI: ffff8809ed388640
b8e312
    RBP: ffff880eaf2e7da0 R08: 0000000000000000 R09: 0000000000000000
b8e312
    R10: 0000000000000001 R11: ffffffffff90001c R12: ffffffff8248b1c0
b8e312
    R13: ffff880eae2d0818 R14: ffff880eae2d0000 R15: 00007eff3d2490b0
b8e312
    FS:  00007eff3dcd2740(0000) GS:ffff881037c00000(0000) knlGS:0000000000000000
b8e312
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
b8e312
    CR2: ffffffffa0570877 CR3: 0000000ebce67000 CR4: 00000000003406e0
b8e312
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
b8e312
    DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
b8e312
    Stack:
b8e312
     ffffffff810c6544 ffff880eaf2e7f58 ffff880eaf2e7e70 ffff880eae2d0000
b8e312
     00007eff3dcb3338 ffff880eaf2e7e38 ffffffff810b31ba ffff880eaf2e7dc0
b8e312
     ffffffff8106c279 ffff880eaf2e7e50 ffff880ef8a792c0 ffff880eaf2e7df8
b8e312
    Call Trace:
b8e312
     [<ffffffff810c6544>] ? task_work_run+0xb4/0xe0
b8e312
     [<ffffffff810b31ba>] get_signal_to_deliver+0x85a/0x960
b8e312
     [<ffffffff8106c279>] ? kvm_sched_clock_read+0x9/0x20
b8e312
     [<ffffffff810e7b4d>] ? sched_clock_local+0x1d/0x80
b8e312
     [<ffffffff810e7dd8>] ? sched_clock_cpu+0xb8/0xe0
b8e312
     [<ffffffff810324a7>] do_signal+0x57/0x6e0
b8e312
     [<ffffffff8176dba6>] ? int_very_careful+0x5/0xd
b8e312
     [<ffffffff81032b8f>] do_notify_resume+0x5f/0xb0
b8e312
     [<ffffffff8176dbfd>] int_signal+0x12/0x17
b8e312
    Code:  Bad RIP value.
b8e312
    RIP  [<ffffffffa0570877>] 0xffffffffa0570876
b8e312
     RSP <ffff880eaf2e7d78>
b8e312
    CR2: ffffffffa0570877
b8e312
    ---[ end trace 1cdf8e5b522b246e ]---
b8e312
b8e312
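    A minimal sketch of the hazard (illustrative only; stp_task_work_add is
    assumed to be the runtime's queueing wrapper): two task-finder nodes queued
    on the same task share one worker function, so cancelling by function
    pointer may dequeue either of them.

        static void sketch_cancel_is_ambiguous(struct task_struct *tsk,
                                               struct __stp_tf_task_work *node_a,
                                               struct __stp_tf_task_work *node_b)
        {
                struct task_work *work;

                stp_task_work_add(tsk, &node_a->work);  /* .func == __stp_tf_task_worker_fn */
                stp_task_work_add(tsk, &node_b->work);  /* .func == __stp_tf_task_worker_fn */

                /* May dequeue either node's work; the old code freed the node it
                 * happened to be iterating over, which could free a struct that
                 * was still queued on the task's worker list.  The fix below uses
                 * container_of() on whatever stp_task_work_cancel() returned. */
                work = stp_task_work_cancel(tsk, __stp_tf_task_worker_fn);
                (void) work;
        }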
diff --git a/runtime/linux/task_finder2.c b/runtime/linux/task_finder2.c
b8e312
index 4e0b68f7c..ecf1f77fd 100644
b8e312
--- a/runtime/linux/task_finder2.c
b8e312
+++ b/runtime/linux/task_finder2.c
b8e312
@@ -226,10 +226,22 @@ static void __stp_tf_cancel_all_task_work(void)
b8e312
 	// Cancel all remaining requests.
b8e312
 	stp_spin_lock_irqsave(&__stp_tf_task_work_list_lock, flags);
b8e312
 	list_for_each_entry_safe(node, tmp, &__stp_tf_task_work_list, list) {
b8e312
-		if (stp_task_work_cancel(node->task, node->work.func)) {
b8e312
-			list_del(&node->list);
b8e312
-			_stp_kfree(node);
b8e312
-		}
b8e312
+		struct __stp_tf_task_work *tf_work;
b8e312
+		struct task_work *work;
b8e312
+
b8e312
+		work = stp_task_work_cancel(node->task, node->work.func);
b8e312
+		if (!work)
b8e312
+			continue;
b8e312
+
b8e312
+		/*
b8e312
+		 * There can be multiple queued task workers with the same
b8e312
+		 * worker func, so there's no guarantee that tf_work == node.
b8e312
+		 * Therefore, we can only free what stp_task_work_cancel() just
b8e312
+		 * gave us; freeing 'node' would be unsafe.
b8e312
+		 */
b8e312
+		tf_work = container_of(work, typeof(*tf_work), work);
b8e312
+		list_del(&tf_work->list);
b8e312
+		_stp_kfree(tf_work);
b8e312
 	}
b8e312
 	stp_spin_unlock_irqrestore(&__stp_tf_task_work_list_lock, flags);
b8e312
 }
b8e312
commit 96470399a5a6fba864b90afd15eda43cdc8c8ac4
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 30 15:42:11 2020 -0800
b8e312
b8e312
    task_finder2: fix list corruption in __stp_tf_cancel_all_task_work()
b8e312
    
b8e312
    The previous commit (b26b4e2c2 "task_finder2: fix panics due to broken
b8e312
    task work cancellation") made it possible for the next node in the task
b8e312
    work list to be freed, which would make list_for_each_entry_safe() not so
b8e312
    safe anymore. Using list_for_each_entry_safe() is still the fastest
b8e312
    approach here, so when the next node in the list happens to be freed, we
b8e312
    should just restart iteration on the list.
b8e312
b8e312
diff --git a/runtime/linux/task_finder2.c b/runtime/linux/task_finder2.c
b8e312
index ecf1f77fd..83fc17b5e 100644
b8e312
--- a/runtime/linux/task_finder2.c
b8e312
+++ b/runtime/linux/task_finder2.c
b8e312
@@ -225,6 +225,7 @@ static void __stp_tf_cancel_all_task_work(void)
b8e312
 
b8e312
 	// Cancel all remaining requests.
b8e312
 	stp_spin_lock_irqsave(&__stp_tf_task_work_list_lock, flags);
b8e312
+restart:
b8e312
 	list_for_each_entry_safe(node, tmp, &__stp_tf_task_work_list, list) {
b8e312
 		struct __stp_tf_task_work *tf_work;
b8e312
 		struct task_work *work;
b8e312
@@ -242,6 +243,21 @@ static void __stp_tf_cancel_all_task_work(void)
b8e312
 		tf_work = container_of(work, typeof(*tf_work), work);
b8e312
 		list_del(&tf_work->list);
b8e312
 		_stp_kfree(tf_work);
b8e312
+
b8e312
+		/*
b8e312
+		 * If the tf_work we just freed was the next node in the list,
b8e312
+		 * then we need to restart the list iteration because
b8e312
+		 * list_for_each_entry_safe() can't cope with the next node
b8e312
+		 * being freed. We still need to use list_for_each_entry_safe()
b8e312
+		 * because we need to get through one successful pass through
b8e312
+		 * the entire list, since it's not guaranteed that this list
b8e312
+		 * will be empty when this function exits, as there can still be
b8e312
+		 * active task workers running, which is fine since the
b8e312
+		 * stp_task_work API will wait for all task workers to finish
b8e312
+		 * before allowing the module to unload.
b8e312
+		 */
b8e312
+		if (tf_work == tmp)
b8e312
+			goto restart;
b8e312
 	}
b8e312
 	stp_spin_unlock_irqrestore(&__stp_tf_task_work_list_lock, flags);
b8e312
 }
b8e312
commit 6cb54128e005d1220a7b064ee42b9f72561c28e7
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 30 15:47:58 2020 -0800
b8e312
b8e312
    task_finder2: fix task worker race on module unload
b8e312
    
b8e312
    Unfortunately, __stp_tf_cancel_all_task_work() does not guarantee that
b8e312
    all of the task finder's task workers will be finished executing when it
b8e312
    returns. In this case, we rely on the stp_task_work API to prevent the
b8e312
    module from being unloaded while there are task workers in-flight, which
b8e312
    works, but the stp_task_work API is notified of a task worker finishing
b8e312
    before it actually finishes. Inside __stp_tf_task_worker_fn(), the
b8e312
    call to the task worker's function (tf_work->func) is where the final
b8e312
    refcount in the stp_task_work API could be put, but there will still be
b8e312
    instructions left in the task worker that will be executing for a short
b8e312
    time after that. In that short time, there can be a race where the
b8e312
    module is unloaded before the task worker finishes executing all of its
b8e312
    instructions, especially if the task worker gets preempted during this
b8e312
    time on a PREEMPT kernel.
b8e312
    
b8e312
    To remedy this, we must ensure that the last instruction in
b8e312
    __stp_tf_task_worker_fn() is where the stp_task_work API is notified of
b8e312
    a task worker finishing.
b8e312
b8e312
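    A minimal sketch of the ordering rule (illustrative only; the helper names
    other than stp_task_work_func_done() are hypothetical):

        static void sketch_task_worker(struct task_work *work)
        {
                do_the_probe_work(work);        /* hypothetical body */
                free_worker_bookkeeping(work);  /* hypothetical cleanup */

                /* This drops the reference that keeps the module loaded, so it
                 * must be the very last thing the worker executes; any
                 * instruction after it could run while the module is being
                 * unloaded (especially if preempted on a PREEMPT kernel). */
                stp_task_work_func_done();
        }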
diff --git a/runtime/linux/task_finder2.c b/runtime/linux/task_finder2.c
b8e312
index 83fc17b5e..2bab19295 100644
b8e312
--- a/runtime/linux/task_finder2.c
b8e312
+++ b/runtime/linux/task_finder2.c
b8e312
@@ -150,6 +150,7 @@ __stp_tf_task_worker_fn(struct task_work *work)
b8e312
 	 * workers for this task.
b8e312
 	 */
b8e312
 	__stp_tf_task_work_free(work);
b8e312
+	stp_task_work_func_done();
b8e312
 }
b8e312
 
b8e312
 static void
b8e312
@@ -1066,11 +1067,8 @@ __stp_tf_clone_worker(struct task_work *work)
b8e312
 
b8e312
 	might_sleep();
b8e312
 	if (atomic_read(&__stp_task_finder_state) != __STP_TF_RUNNING
b8e312
-	    || current->flags & PF_EXITING) {
b8e312
-		/* Remember that this task_work_func is finished. */
b8e312
-		stp_task_work_func_done();
b8e312
+	    || current->flags & PF_EXITING)
b8e312
 		return;
b8e312
-	}
b8e312
 
b8e312
 	__stp_tf_handler_start();
b8e312
 
b8e312
@@ -1085,10 +1083,6 @@ __stp_tf_clone_worker(struct task_work *work)
b8e312
 	}
b8e312
 
b8e312
 	__stp_tf_handler_end();
b8e312
-
b8e312
-	/* Remember that this task_work_func is finished. */
b8e312
-	stp_task_work_func_done();
b8e312
-	return;
b8e312
 }
b8e312
 
b8e312
 
b8e312
@@ -1392,11 +1386,8 @@ __stp_tf_quiesce_worker(struct task_work *work)
b8e312
 
b8e312
 	might_sleep();
b8e312
 	if (atomic_read(&__stp_task_finder_state) != __STP_TF_RUNNING
b8e312
-	    || current->flags & PF_EXITING) {
b8e312
-		/* Remember that this task_work_func is finished. */
b8e312
-		stp_task_work_func_done();
b8e312
+	    || current->flags & PF_EXITING)
b8e312
 		return;
b8e312
-	}
b8e312
 
b8e312
         /* If we had a build-id based executable probe (so we have a
b8e312
          * tgt->build_id) set, we could not check it back in
b8e312
@@ -1420,8 +1411,6 @@ __stp_tf_quiesce_worker(struct task_work *work)
b8e312
                           (long) current->tgid, ok);
b8e312
                 if (!ok) {
b8e312
                         // stap_utrace_detach (current, & tgt->ops);
b8e312
-                        /* Remember that this task_work_func is finished. */
b8e312
-                        stp_task_work_func_done();
b8e312
                         return;
b8e312
                 }
b8e312
         } 
b8e312
@@ -1444,10 +1433,6 @@ __stp_tf_quiesce_worker(struct task_work *work)
b8e312
 	__stp_call_callbacks(tgt, current, 1, (current->pid == current->tgid));
b8e312
 
b8e312
 	__stp_tf_handler_end();
b8e312
-
b8e312
-	/* Remember that this task_work_func is finished. */
b8e312
-	stp_task_work_func_done();
b8e312
-	return;
b8e312
 }
b8e312
 
b8e312
 static u32
b8e312
@@ -1614,18 +1599,12 @@ __stp_tf_mmap_worker(struct task_work *work)
b8e312
 
b8e312
 	// See if we can find saved syscall info.
b8e312
 	entry = __stp_tf_get_map_entry(current);
b8e312
-	if (entry == NULL) {
b8e312
-		/* Remember that this task_work_func is finished. */
b8e312
-		stp_task_work_func_done();
b8e312
+	if (entry == NULL)
b8e312
 		return;
b8e312
-	}
b8e312
 
b8e312
 	if (atomic_read(&__stp_task_finder_state) != __STP_TF_RUNNING
b8e312
 	    || current->flags & PF_EXITING) {
b8e312
 		__stp_tf_remove_map_entry(entry);
b8e312
-
b8e312
-		/* Remember that this task_work_func is finished. */
b8e312
-		stp_task_work_func_done();
b8e312
 		return;
b8e312
 	}
b8e312
 
b8e312
@@ -1650,10 +1629,6 @@ __stp_tf_mmap_worker(struct task_work *work)
b8e312
 	__stp_tf_remove_map_entry(entry);
b8e312
 
b8e312
 	__stp_tf_handler_end();
b8e312
-
b8e312
-	/* Remember that this task_work_func is finished. */
b8e312
-	stp_task_work_func_done();
b8e312
-	return;
b8e312
 }
b8e312
 
b8e312
 static u32