Blame SOURCES/rhbz1906662.patch

665f76
commit 374d37118ae1274077a425261ef1428151eb6d7c
665f76
Author: Sultan Alsawaf <sultan@openresty.com>
665f76
Date:   Tue Nov 10 10:03:34 2020 -0800
665f76
665f76
    stp_utrace: disable IRQs when holding the bucket spin lock
665f76
    
665f76
    This lock can be acquired from inside an IRQ, leading to a deadlock:
665f76
    
665f76
    WARNING: inconsistent lock state
665f76
    4.14.35-1902.6.6.el7uek.x86_64.debug #2 Tainted: G           OE
665f76
    --------------------------------
665f76
    inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
665f76
    sh/15779 [HC1[1]:SC0[0]:HE0:SE1] takes:
665f76
     (&(lock)->rlock#3){?.+.}, at: [<ffffffffc0c080b0>] _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
665f76
    {HARDIRQ-ON-W} state was registered at:
665f76
      lock_acquire+0xe0/0x238
665f76
      _raw_spin_lock+0x3d/0x7a
665f76
      utrace_task_alloc+0xa4/0xe3 [orxray_lj_lua_fgraph_XXXXXXX]
665f76
      utrace_attach_task+0x136/0x194 [orxray_lj_lua_fgraph_XXXXXXX]
665f76
      __stp_utrace_attach+0x57/0x216 [orxray_lj_lua_fgraph_XXXXXXX]
665f76
      stap_start_task_finder+0x12e/0x33f [orxray_lj_lua_fgraph_XXXXXXX]
665f76
      systemtap_module_init+0x114d/0x11f0 [orxray_lj_lua_fgraph_XXXXXXX]
665f76
      _stp_handle_start+0xea/0x1c5 [orxray_lj_lua_fgraph_XXXXXXX]
665f76
      _stp_ctl_write_cmd+0x28d/0x2d1 [orxray_lj_lua_fgraph_XXXXXXX]
665f76
      full_proxy_write+0x67/0xbb
665f76
      __vfs_write+0x3a/0x170
665f76
      vfs_write+0xc7/0x1c0
665f76
      SyS_write+0x58/0xbf
665f76
      do_syscall_64+0x7e/0x22c
665f76
      entry_SYSCALL_64_after_hwframe+0x16e/0x0
665f76
    irq event stamp: 9454
665f76
    hardirqs last  enabled at (9453): [<ffffffffa696c960>] _raw_write_unlock_irqrestore+0x40/0x67
665f76
    hardirqs last disabled at (9454): [<ffffffffa6a05417>] apic_timer_interrupt+0x1c7/0x1d1
665f76
    softirqs last  enabled at (9202): [<ffffffffa6c00361>] __do_softirq+0x361/0x4e5
665f76
    softirqs last disabled at (9195): [<ffffffffa60aeb76>] irq_exit+0xf6/0x102
665f76
    
665f76
    other info that might help us debug this:
665f76
     Possible unsafe locking scenario:
665f76
    
665f76
           CPU0
665f76
           ----
665f76
      lock(&(lock)->rlock#3);
665f76
      <interrupt>
665f76
        lock(&(lock)->rlock#3);
665f76
    
665f76
     *** DEADLOCK ***
665f76
    
665f76
    no locks held by sh/15779.
665f76
    
665f76
    stack backtrace:
665f76
    CPU: 16 PID: 15779 Comm: sh Tainted: G           OE   4.14.35-1902.6.6.el7uek.x86_64.debug #2
665f76
    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
665f76
    Call Trace:
665f76
     <irq>
665f76
     dump_stack+0x81/0xb6
665f76
     print_usage_bug+0x1fc/0x20d
665f76
     ? check_usage_backwards+0x130/0x12b
665f76
     mark_lock+0x1f8/0x27b
665f76
     __lock_acquire+0x6e7/0x165a
665f76
     ? sched_clock_local+0x18/0x81
665f76
     ? perf_swevent_hrtimer+0x136/0x151
665f76
     lock_acquire+0xe0/0x238
665f76
     ? _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
665f76
     _raw_spin_lock_irqsave+0x55/0x97
665f76
     ? _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
665f76
     _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
665f76
     _stp_ctl_get_buffer+0x69/0x215 [orxray_lj_lua_fgraph_XXXXXXX]
665f76
     _stp_ctl_send+0x4e/0x169 [orxray_lj_lua_fgraph_XXXXXXX]
665f76
     _stp_vlog+0xac/0x143 [orxray_lj_lua_fgraph_XXXXXXX]
665f76
     ? _stp_utrace_probe_cb+0xa4/0xa4 [orxray_lj_lua_fgraph_XXXXXXX]
665f76
     _stp_warn+0x6a/0x88 [orxray_lj_lua_fgraph_XXXXXXX]
665f76
     function___global_warn__overload_0+0x60/0xac [orxray_lj_lua_fgraph_XXXXXXX]
665f76
     probe_67+0xce/0x10e [orxray_lj_lua_fgraph_XXXXXXX]
665f76
     _stp_hrtimer_notify_function+0x2db/0x55f [orxray_lj_lua_fgraph_XXXXXXX]
665f76
     __hrtimer_run_queues+0x132/0x5c5
665f76
     hrtimer_interrupt+0xb7/0x1ca
665f76
     smp_apic_timer_interrupt+0xa5/0x35a
665f76
     apic_timer_interrupt+0x1cc/0x1d1
665f76
     </irq>
665f76
665f76
diff --git a/runtime/stp_utrace.c b/runtime/stp_utrace.c
665f76
index e2880f1e4..46ba48923 100644
665f76
--- a/runtime/stp_utrace.c
665f76
+++ b/runtime/stp_utrace.c
665f76
@@ -490,9 +490,9 @@ static int utrace_exit(void)
665f76
 		rcu_read_lock();
665f76
 		stap_hlist_for_each_entry_rcu(utrace, node, &bucket->head, hlist) {
665f76
 			utrace->freed = true;
665f76
-			stp_spin_lock(&bucket->lock);
665f76
+			stp_spin_lock_irqsave(&bucket->lock, flags);
665f76
 			hlist_del_rcu(&utrace->hlist);
665f76
-			stp_spin_unlock(&bucket->lock);
665f76
+			stp_spin_unlock_irqrestore(&bucket->lock, flags);
665f76
 
665f76
 			utrace_cleanup(utrace);
665f76
 		}
665f76
@@ -724,6 +724,7 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket,
665f76
 					struct task_struct *task)
665f76
 {
665f76
 	struct utrace *utrace;
665f76
+	unsigned long flags;
665f76
 
665f76
 	utrace = kmem_cache_zalloc(utrace_cachep, STP_ALLOC_FLAGS);
665f76
 	if (unlikely(!utrace))
665f76
@@ -739,9 +740,9 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket,
665f76
 	atomic_set(&utrace->resume_work_added, 0);
665f76
 	atomic_set(&utrace->report_work_added, 0);
665f76
 
665f76
-	stp_spin_lock(&bucket->lock);
665f76
+	stp_spin_lock_irqsave(&bucket->lock, flags);
665f76
 	hlist_add_head_rcu(&utrace->hlist, &bucket->head);
665f76
-	stp_spin_unlock(&bucket->lock);
665f76
+	stp_spin_unlock_irqrestore(&bucket->lock, flags);
665f76
 	return utrace;
665f76
 }
665f76
 
665f76
@@ -768,15 +769,17 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket,
665f76
  */
665f76
 static void utrace_free(struct utrace_bucket *bucket, struct utrace *utrace)
665f76
 {
665f76
+	unsigned long flags;
665f76
+
665f76
 	if (unlikely(!utrace))
665f76
 		return;
665f76
 
665f76
 	/* Remove this utrace from the mapping list of tasks to
665f76
 	 * struct utrace. */
665f76
 	utrace->freed = true;
665f76
-	stp_spin_lock(&bucket->lock);
665f76
+	stp_spin_lock_irqsave(&bucket->lock, flags);
665f76
 	hlist_del_rcu(&utrace->hlist);
665f76
-	stp_spin_unlock(&bucket->lock);
665f76
+	stp_spin_unlock_irqrestore(&bucket->lock, flags);
665f76
 
665f76
 	/* Free the utrace struct. */
665f76
 #ifdef STP_TF_DEBUG
665f76
commit 6a092f5ae824d4ce972c10b8681b9272e2fd67f3
665f76
Author: Sultan Alsawaf <sultan@openresty.com>
665f76
Date:   Tue Nov 17 11:03:53 2020 -0800
665f76
665f76
    task_finder: call _stp_vma_done() upon error to fix memory leak
665f76
    
665f76
    The memory allocated inside stap_initialize_vma_map() is not freed upon
665f76
    error when the task finder is started because a call to _stp_vma_done()
665f76
    in the error path is missing. Add it to fix the leak.
665f76
665f76
diff --git a/task_finder.cxx b/task_finder.cxx
665f76
index d08d44a75..7c45e728b 100644
665f76
--- a/task_finder.cxx
665f76
+++ b/task_finder.cxx
665f76
@@ -66,6 +66,7 @@ task_finder_derived_probe_group::emit_module_init (systemtap_session& s)
665f76
 
665f76
   s.op->newline() << "if (rc) {";
665f76
   s.op->newline(1) << "stap_stop_task_finder();";
665f76
+  s.op->newline() << "_stp_vma_done();";
665f76
   s.op->newline(-1) << "}";
665f76
   s.op->newline(-1) << "}";
665f76
 }
665f76
commit 3c4f82ca024df4f8e213f7c77418493262d4a4d7
665f76
Author: Sultan Alsawaf <sultan@openresty.com>
665f76
Date:   Tue Nov 24 10:50:10 2020 -0800
665f76
665f76
    runtime_context: factor out RCU usage using a rw lock
665f76
    
665f76
    We can factor out the RCU insanity in here by just adding in a rw lock
665f76
    and using that to synchronize _stp_runtime_contexts_free() with any code
665f76
    that has the runtime context held.
665f76
665f76
diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
665f76
index 41fecba81..18566957a 100644
665f76
--- a/runtime/linux/runtime_context.h
665f76
+++ b/runtime/linux/runtime_context.h
665f76
@@ -11,15 +11,14 @@
665f76
 #ifndef _LINUX_RUNTIME_CONTEXT_H_
665f76
 #define _LINUX_RUNTIME_CONTEXT_H_
665f76
 
665f76
-#ifndef __rcu
665f76
-#define __rcu
665f76
-#endif
665f76
-
665f76
-static struct context __rcu *contexts[NR_CPUS] = { NULL };
665f76
+/* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */
665f76
+static DEFINE_RWLOCK(_stp_context_lock);
665f76
+static DEFINE_PER_CPU(struct context *, contexts);
665f76
+static atomic_t _stp_context_stop = ATOMIC_INIT(0);
665f76
 
665f76
 static int _stp_runtime_contexts_alloc(void)
665f76
 {
665f76
-	int cpu;
665f76
+	unsigned int cpu;
665f76
 
665f76
 	for_each_possible_cpu(cpu) {
665f76
 		/* Module init, so in user context, safe to use
665f76
@@ -31,91 +30,67 @@ static int _stp_runtime_contexts_alloc(void)
665f76
 				    (unsigned long) sizeof (struct context));
665f76
 			return -ENOMEM;
665f76
 		}
665f76
-		rcu_assign_pointer(contexts[cpu], c);
665f76
+		per_cpu(contexts, cpu) = c;
665f76
 	}
665f76
 	return 0;
665f76
 }
665f76
 
665f76
 /* We should be free of all probes by this time, but for example the timer for
665f76
  * _stp_ctl_work_callback may still be running and looking for contexts.  We
665f76
- * use RCU-sched synchronization to be sure its safe to free them.  */
665f76
+ * use _stp_context_stop and a write lock to be sure its safe to free them.  */
665f76
 static void _stp_runtime_contexts_free(void)
665f76
 {
665f76
-	// Note that 'free_contexts' is static because it is
665f76
-	// (probably) too big to fit on a kernel function's stack.
665f76
-	static struct context *free_contexts[NR_CPUS] = { NULL };
665f76
-	int cpu;
665f76
+	unsigned long flags;
665f76
+	unsigned int cpu;
665f76
 
665f76
-	/* First, save all the pointers.  */
665f76
-	rcu_read_lock_sched();
665f76
-	for_each_possible_cpu(cpu) {
665f76
-		free_contexts[cpu] = rcu_dereference_sched(contexts[cpu]);
665f76
-	}
665f76
-	rcu_read_unlock_sched();
665f76
+	/* Sync to make sure existing readers are done */
665f76
+	atomic_set(&_stp_context_stop, 1);
665f76
+	write_lock_irqsave(&_stp_context_lock, flags);
665f76
+	write_unlock_irqrestore(&_stp_context_lock, flags);
665f76
 
665f76
-	/* Now clear all pointers to prevent new readers.  */
665f76
-	for_each_possible_cpu(cpu) {
665f76
-		rcu_assign_pointer(contexts[cpu], NULL);
665f76
-	}
665f76
-
665f76
-	/* Sync to make sure existing readers are done.  */
665f76
-	stp_synchronize_sched();
665f76
-
665f76
-	/* Now we can actually free the contexts.  */
665f76
-	for_each_possible_cpu(cpu) {
665f76
-		struct context *c = free_contexts[cpu];
665f76
-		if (c != NULL) {
665f76
-			free_contexts[cpu] = NULL;
665f76
-			_stp_vfree(c);
665f76
-		}
665f76
-	}
665f76
+	/* Now we can actually free the contexts */
665f76
+	for_each_possible_cpu(cpu)
665f76
+		_stp_vfree(per_cpu(contexts, cpu));
665f76
 }
665f76
 
665f76
 static inline struct context * _stp_runtime_get_context(void)
665f76
 {
665f76
-	// RHBZ1788662 rcu operations are rejected in idle-cpu contexts
665f76
-	// in effect: skip probe if it's in rcu-idle state
665f76
-#if defined(STAPCONF_RCU_IS_WATCHING) || LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) // linux commit #5c173eb8
665f76
-        if (! rcu_is_watching())
665f76
-		return 0;
665f76
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0) // linux commit #9b2e4f18
665f76
-        if (! rcu_is_cpu_idle())
665f76
-		return 0;
665f76
-#else
665f76
-	; // XXX older kernels didn't put tracepoints in idle-cpu
665f76
-#endif
665f76
-	return rcu_dereference_sched(contexts[smp_processor_id()]);
665f76
+	if (atomic_read(&_stp_context_stop))
665f76
+		return NULL;
665f76
+
665f76
+	return per_cpu(contexts, smp_processor_id());
665f76
 }
665f76
 
665f76
 static struct context * _stp_runtime_entryfn_get_context(void)
665f76
+	__acquires(&_stp_context_lock)
665f76
 {
665f76
 	struct context* __restrict__ c = NULL;
665f76
-	preempt_disable ();
665f76
+
665f76
+	if (!read_trylock(&_stp_context_lock))
665f76
+		return NULL;
665f76
+
665f76
 	c = _stp_runtime_get_context();
665f76
 	if (c != NULL) {
665f76
-		if (atomic_inc_return(&c->busy) == 1) {
665f76
-			// NB: Notice we're not re-enabling preemption
665f76
+		if (!atomic_cmpxchg(&c->busy, 0, 1)) {
665f76
+			// NB: Notice we're not releasing _stp_context_lock
665f76
 			// here. We exepect the calling code to call
665f76
 			// _stp_runtime_entryfn_get_context() and
665f76
 			// _stp_runtime_entryfn_put_context() as a
665f76
 			// pair.
665f76
 			return c;
665f76
 		}
665f76
-		atomic_dec(&c->busy);
665f76
 	}
665f76
-	preempt_enable_no_resched();
665f76
+	read_unlock(&_stp_context_lock);
665f76
 	return NULL;
665f76
 }
665f76
 
665f76
 static inline void _stp_runtime_entryfn_put_context(struct context *c)
665f76
+	__releases(&_stp_context_lock)
665f76
 {
665f76
 	if (c) {
665f76
-		if (c == _stp_runtime_get_context())
665f76
-			atomic_dec(&c->busy);
665f76
-		/* else, warn about bad state? */
665f76
-		preempt_enable_no_resched();
665f76
+		atomic_set(&c->busy, 0);
665f76
+		read_unlock(&_stp_context_lock);
665f76
 	}
665f76
-	return;
665f76
 }
665f76
 
665f76
 static void _stp_runtime_context_wait(void)
665f76
@@ -130,9 +105,13 @@ static void _stp_runtime_context_wait(void)
665f76
 		int i;
665f76
 
665f76
 		holdon = 0;
665f76
-		rcu_read_lock_sched();
665f76
+		read_lock(&_stp_context_lock);
665f76
+		if (atomic_read(&_stp_context_stop)) {
665f76
+			read_unlock(&_stp_context_lock);
665f76
+			break;
665f76
+		}
665f76
 		for_each_possible_cpu(i) {
665f76
-			struct context *c = rcu_dereference_sched(contexts[i]);
665f76
+			struct context *c = per_cpu(contexts, i);
665f76
 			if (c != NULL
665f76
 			    && atomic_read (& c->busy)) {
665f76
 				holdon = 1;
665f76
@@ -146,7 +125,7 @@ static void _stp_runtime_context_wait(void)
665f76
 				}
665f76
 			}
665f76
 		}
665f76
-		rcu_read_unlock_sched();
665f76
+		read_unlock(&_stp_context_lock);
665f76
 
665f76
 		/*
665f76
 		 * Just in case things are really really stuck, a
665f76
commit bb25d64f7b6c98ef2fc8b711f19bd6271866d727
665f76
Author: Sultan Alsawaf <sultan@openresty.com>
665f76
Date:   Tue Dec 1 09:54:07 2020 -0800
665f76
665f76
    runtime_context: synchronize _stp_context_stop more strictly
665f76
    
665f76
    We're only reading _stp_context_stop while the read lock is held, so we
665f76
    can move the modification of it to inside the write lock to ensure
665f76
    strict memory ordering. As such, it no longer needs to be an atomic_t
665f76
    variable.
665f76
    
665f76
    We also don't need to disable IRQs when holding the write lock because
665f76
    only read_trylock is used from IRQ context, not read_lock, so there's no
665f76
    possibility of a deadlock occurring.
665f76
665f76
diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
665f76
index 18566957a..e716e6d39 100644
665f76
--- a/runtime/linux/runtime_context.h
665f76
+++ b/runtime/linux/runtime_context.h
665f76
@@ -14,7 +14,7 @@
665f76
 /* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */
665f76
 static DEFINE_RWLOCK(_stp_context_lock);
665f76
 static DEFINE_PER_CPU(struct context *, contexts);
665f76
-static atomic_t _stp_context_stop = ATOMIC_INIT(0);
665f76
+static bool _stp_context_stop;
665f76
 
665f76
 static int _stp_runtime_contexts_alloc(void)
665f76
 {
665f76
@@ -40,13 +40,12 @@ static int _stp_runtime_contexts_alloc(void)
665f76
  * use _stp_context_stop and a write lock to be sure its safe to free them.  */
665f76
 static void _stp_runtime_contexts_free(void)
665f76
 {
665f76
-	unsigned long flags;
665f76
 	unsigned int cpu;
665f76
 
665f76
 	/* Sync to make sure existing readers are done */
665f76
-	atomic_set(&_stp_context_stop, 1);
665f76
-	write_lock_irqsave(&_stp_context_lock, flags);
665f76
-	write_unlock_irqrestore(&_stp_context_lock, flags);
665f76
+	write_lock(&_stp_context_lock);
665f76
+	_stp_context_stop = true;
665f76
+	write_unlock(&_stp_context_lock);
665f76
 
665f76
 	/* Now we can actually free the contexts */
665f76
 	for_each_possible_cpu(cpu)
665f76
@@ -55,7 +54,7 @@ static void _stp_runtime_contexts_free(void)
665f76
 
665f76
 static inline struct context * _stp_runtime_get_context(void)
665f76
 {
665f76
-	if (atomic_read(&_stp_context_stop))
665f76
+	if (_stp_context_stop)
665f76
 		return NULL;
665f76
 
665f76
 	return per_cpu(contexts, smp_processor_id());
665f76
@@ -106,7 +105,7 @@ static void _stp_runtime_context_wait(void)
665f76
 
665f76
 		holdon = 0;
665f76
 		read_lock(&_stp_context_lock);
665f76
-		if (atomic_read(&_stp_context_stop)) {
665f76
+		if (_stp_context_stop) {
665f76
 			read_unlock(&_stp_context_lock);
665f76
 			break;
665f76
 		}
665f76
commit 0cc239e6f0fff79cb584fc857d3220402558db37
665f76
Author: Sultan Alsawaf <sultan@openresty.com>
665f76
Date:   Tue Dec 1 18:47:04 2020 -0800
665f76
665f76
    runtime_context: replace _stp_context_lock with an atomic variable
665f76
    
665f76
    We can't use any lock primitives here, such as spin locks or rw locks,
665f76
    because lock_acquire() has tracepoints inside of it. This can cause a
665f76
    deadlock, so we have to roll our own synchronization mechanism using an
665f76
    atomic variable.
665f76
665f76
diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
665f76
index e716e6d39..7dd240e1a 100644
665f76
--- a/runtime/linux/runtime_context.h
665f76
+++ b/runtime/linux/runtime_context.h
665f76
@@ -11,10 +11,9 @@
665f76
 #ifndef _LINUX_RUNTIME_CONTEXT_H_
665f76
 #define _LINUX_RUNTIME_CONTEXT_H_
665f76
 
665f76
-/* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */
665f76
-static DEFINE_RWLOCK(_stp_context_lock);
665f76
+/* Can't use a lock primitive for this because lock_acquire() has tracepoints */
665f76
+static atomic_t _stp_contexts_busy_ctr = ATOMIC_INIT(0);
665f76
 static DEFINE_PER_CPU(struct context *, contexts);
665f76
-static bool _stp_context_stop;
665f76
 
665f76
 static int _stp_runtime_contexts_alloc(void)
665f76
 {
665f76
@@ -37,15 +36,14 @@ static int _stp_runtime_contexts_alloc(void)
665f76
 
665f76
 /* We should be free of all probes by this time, but for example the timer for
665f76
  * _stp_ctl_work_callback may still be running and looking for contexts.  We
665f76
- * use _stp_context_stop and a write lock to be sure its safe to free them.  */
665f76
+ * use _stp_contexts_busy_ctr to be sure its safe to free them.  */
665f76
 static void _stp_runtime_contexts_free(void)
665f76
 {
665f76
 	unsigned int cpu;
665f76
 
665f76
 	/* Sync to make sure existing readers are done */
665f76
-	write_lock(&_stp_context_lock);
665f76
-	_stp_context_stop = true;
665f76
-	write_unlock(&_stp_context_lock);
665f76
+	while (atomic_cmpxchg(&_stp_contexts_busy_ctr, 0, INT_MAX))
665f76
+		cpu_relax();
665f76
 
665f76
 	/* Now we can actually free the contexts */
665f76
 	for_each_possible_cpu(cpu)
665f76
@@ -54,24 +52,20 @@ static void _stp_runtime_contexts_free(void)
665f76
 
665f76
 static inline struct context * _stp_runtime_get_context(void)
665f76
 {
665f76
-	if (_stp_context_stop)
665f76
-		return NULL;
665f76
-
665f76
 	return per_cpu(contexts, smp_processor_id());
665f76
 }
665f76
 
665f76
 static struct context * _stp_runtime_entryfn_get_context(void)
665f76
-	__acquires(&_stp_context_lock)
665f76
 {
665f76
 	struct context* __restrict__ c = NULL;
665f76
 
665f76
-	if (!read_trylock(&_stp_context_lock))
665f76
+	if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
665f76
 		return NULL;
665f76
 
665f76
 	c = _stp_runtime_get_context();
665f76
 	if (c != NULL) {
665f76
 		if (!atomic_cmpxchg(&c->busy, 0, 1)) {
665f76
-			// NB: Notice we're not releasing _stp_context_lock
665f76
+			// NB: Notice we're not releasing _stp_contexts_busy_ctr
665f76
 			// here. We exepect the calling code to call
665f76
 			// _stp_runtime_entryfn_get_context() and
665f76
 			// _stp_runtime_entryfn_put_context() as a
665f76
@@ -79,16 +73,15 @@ static struct context * _stp_runtime_entryfn_get_context(void)
665f76
 			return c;
665f76
 		}
665f76
 	}
665f76
-	read_unlock(&_stp_context_lock);
665f76
+	atomic_dec(&_stp_contexts_busy_ctr);
665f76
 	return NULL;
665f76
 }
665f76
 
665f76
 static inline void _stp_runtime_entryfn_put_context(struct context *c)
665f76
-	__releases(&_stp_context_lock)
665f76
 {
665f76
 	if (c) {
665f76
 		atomic_set(&c->busy, 0);
665f76
-		read_unlock(&_stp_context_lock);
665f76
+		atomic_dec(&_stp_contexts_busy_ctr);
665f76
 	}
665f76
 }
665f76
 
665f76
@@ -104,11 +97,9 @@ static void _stp_runtime_context_wait(void)
665f76
 		int i;
665f76
 
665f76
 		holdon = 0;
665f76
-		read_lock(&_stp_context_lock);
665f76
-		if (_stp_context_stop) {
665f76
-			read_unlock(&_stp_context_lock);
665f76
+		if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
665f76
 			break;
665f76
-		}
665f76
+
665f76
 		for_each_possible_cpu(i) {
665f76
 			struct context *c = per_cpu(contexts, i);
665f76
 			if (c != NULL
665f76
@@ -124,7 +115,7 @@ static void _stp_runtime_context_wait(void)
665f76
 				}
665f76
 			}
665f76
 		}
665f76
-		read_unlock(&_stp_context_lock);
665f76
+		atomic_dec(&_stp_contexts_busy_ctr);
665f76
 
665f76
 		/*
665f76
 		 * Just in case things are really really stuck, a
665f76
commit fbab0ea35e6af0d6599c6de3708b24008bf03ae6
665f76
Author: Sultan Alsawaf <sultan@openresty.com>
665f76
Date:   Wed Dec 2 11:07:11 2020 -0800
665f76
665f76
    runtime_context: disable preempt while holding the context
665f76
    
665f76
    After the context lock was converted to an atomic in the previous
665f76
    commit, the preempt disable logic disappeared. Add it back.
665f76
665f76
diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
665f76
index 7dd240e1a..7a1532e54 100644
665f76
--- a/runtime/linux/runtime_context.h
665f76
+++ b/runtime/linux/runtime_context.h
665f76
@@ -34,6 +34,24 @@ static int _stp_runtime_contexts_alloc(void)
665f76
 	return 0;
665f76
 }
665f76
 
665f76
+static bool _stp_runtime_context_trylock(void)
665f76
+{
665f76
+	bool locked;
665f76
+
665f76
+	preempt_disable();
665f76
+	locked = atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX);
665f76
+	if (!locked)
665f76
+		preempt_enable_no_resched();
665f76
+
665f76
+	return locked;
665f76
+}
665f76
+
665f76
+static void _stp_runtime_context_unlock(void)
665f76
+{
665f76
+	atomic_dec(&_stp_contexts_busy_ctr);
665f76
+	preempt_enable_no_resched();
665f76
+}
665f76
+
665f76
 /* We should be free of all probes by this time, but for example the timer for
665f76
  * _stp_ctl_work_callback may still be running and looking for contexts.  We
665f76
  * use _stp_contexts_busy_ctr to be sure its safe to free them.  */
665f76
@@ -59,7 +77,7 @@ static struct context * _stp_runtime_entryfn_get_context(void)
665f76
 {
665f76
 	struct context* __restrict__ c = NULL;
665f76
 
665f76
-	if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
665f76
+	if (!_stp_runtime_context_trylock())
665f76
 		return NULL;
665f76
 
665f76
 	c = _stp_runtime_get_context();
665f76
@@ -73,7 +91,7 @@ static struct context * _stp_runtime_entryfn_get_context(void)
665f76
 			return c;
665f76
 		}
665f76
 	}
665f76
-	atomic_dec(&_stp_contexts_busy_ctr);
665f76
+	_stp_runtime_context_unlock();
665f76
 	return NULL;
665f76
 }
665f76
 
665f76
@@ -81,7 +99,7 @@ static inline void _stp_runtime_entryfn_put_context(struct context *c)
665f76
 {
665f76
 	if (c) {
665f76
 		atomic_set(&c->busy, 0);
665f76
-		atomic_dec(&_stp_contexts_busy_ctr);
665f76
+		_stp_runtime_context_unlock();
665f76
 	}
665f76
 }
665f76
 
665f76
@@ -97,7 +115,7 @@ static void _stp_runtime_context_wait(void)
665f76
 		int i;
665f76
 
665f76
 		holdon = 0;
665f76
-		if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
665f76
+		if (!_stp_runtime_context_trylock())
665f76
 			break;
665f76
 
665f76
 		for_each_possible_cpu(i) {
665f76
@@ -115,7 +133,7 @@ static void _stp_runtime_context_wait(void)
665f76
 				}
665f76
 			}
665f76
 		}
665f76
-		atomic_dec(&_stp_contexts_busy_ctr);
665f76
+		_stp_runtime_context_unlock();
665f76
 
665f76
 		/*
665f76
 		 * Just in case things are really really stuck, a
665f76
commit aedc044d5d38cb2fa6144d0a3345d06847862f1b
665f76
Author: Sultan Alsawaf <sultan@openresty.com>
665f76
Date:   Wed Dec 2 11:27:47 2020 -0800
665f76
665f76
    task_finder_vma: add kfree_rcu() compat for old kernels
665f76
    
665f76
    Newer RHEL 6 kernels have kfree_rcu(), but older ones do not. Using
665f76
    kfree_rcu() is beneficial because it lets the RCU subsystem know that
665f76
    the queued RCU callback is low-priority, and can be deferred, hence why
665f76
    we don't replace kfree_rcu() with call_rcu() outright. Luckily,
665f76
    kfree_rcu() is a macro so we can just #ifdef with it.
665f76
665f76
diff --git a/runtime/task_finder_vma.c b/runtime/task_finder_vma.c
665f76
index 7f0f6ed56..dc77a80f5 100644
665f76
--- a/runtime/task_finder_vma.c
665f76
+++ b/runtime/task_finder_vma.c
665f76
@@ -87,6 +87,15 @@ __stp_tf_vma_new_entry(void)
665f76
 	return entry;
665f76
 }
665f76
 
665f76
+#ifndef kfree_rcu
665f76
+static void __stp_tf_vma_free_entry(struct rcu_head *rcu)
665f76
+{
665f76
+	struct __stp_tf_vma_entry *entry = container_of(rcu, typeof(*entry), rcu);
665f76
+
665f76
+	kfree(entry);
665f76
+}
665f76
+#endif
665f76
+
665f76
 // __stp_tf_vma_put_entry(): Put a specified number of references on the entry.
665f76
 static void
665f76
 __stp_tf_vma_put_entry(struct __stp_tf_vma_bucket *bucket,
665f76
@@ -106,7 +115,11 @@ __stp_tf_vma_put_entry(struct __stp_tf_vma_bucket *bucket,
665f76
 	hlist_del_rcu(&entry->hlist);
665f76
 	stp_spin_unlock_irqrestore(&bucket->lock, flags);
665f76
 
665f76
+#ifdef kfree_rcu
665f76
 	kfree_rcu(entry, rcu);
665f76
+#else
665f76
+	call_rcu(&entry->rcu, __stp_tf_vma_free_entry);
665f76
+#endif
665f76
 }
665f76
 
665f76
 // stap_initialize_vma_map():  Initialize the free list.  Grabs the
665f76
commit 6a27888b118b7a94650a68aae028957cdd5fb5f5
665f76
Author: Sultan Alsawaf <sultan@openresty.com>
665f76
Date:   Wed Dec 2 18:09:17 2020 -0800
665f76
665f76
    REVERTME: tapset-timers: work around on-the-fly deadlocks caused by mutex_trylock
665f76
    
665f76
    The following deadlock exists due to tracepoints existing inside a lock
665f76
    that is used both inside probe context and outside probe context:
665f76
     #0 [ffff88017f6d7a08] kvm_wait at ffffffff81079f5a
665f76
     #1 [ffff88017f6d7a30] __pv_queued_spin_lock_slowpath at ffffffff8114f51e
665f76
     #2 [ffff88017f6d7a70] queued_spin_lock_slowpath at ffffffff810e842b
665f76
     #3 [ffff88017f6d7a80] mutex_trylock at ffffffff81882b1b
665f76
     #4 [ffff88017f6d7ab8] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
     #5 [ffff88017f6d7ad8] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
     #6 [ffff88017f6d7b10] probe_7879 at ffffffffc0a98c85 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
     #7 [ffff88017f6d7b38] enter_real_tracepoint_probe_1543 at ffffffffc0c3b757 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
     #8 [ffff88017f6d7b70] enter_tracepoint_probe_1543 at ffffffffc09b117e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
     #9 [ffff88017f6d7b80] lock_acquire at ffffffff811460ba
665f76
    #10 [ffff88017f6d7be8] mutex_trylock at ffffffff81882a27
665f76
    #11 [ffff88017f6d7c20] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
    #12 [ffff88017f6d7c40] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
    #13 [ffff88017f6d7c78] _stp_vlog at ffffffffc09b8d32 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
    #14 [ffff88017f6d7cd8] _stp_dbug at ffffffffc09ba43b [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
    #15 [ffff88017f6d7d38] systemtap_module_refresh at ffffffffc09ba51d [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
    #16 [ffff88017f6d7d50] module_refresher at ffffffffc09ba53e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
    #17 [ffff88017f6d7d60] process_one_work at ffffffff810da9cc
665f76
    #18 [ffff88017f6d7de8] worker_thread at ffffffff810dafe6
665f76
    #19 [ffff88017f6d7e48] kthread at ffffffff810e44cf
665f76
    #20 [ffff88017f6d7f50] ret_from_fork_nospec_begin at ffffffff818958dd
665f76
    
665f76
    Note the deadlock due to _stp_transport_trylock_relay_inode recursing
665f76
    onto itself via mutex_trylock.
665f76
    
665f76
    This is a temporary fix for the issue until a proper patch is made to
665f76
    remove the mutex_trylock from __stp_print_flush. This should be reverted
665f76
    when that patch lands (it will have something to do with bulkmode).
665f76
665f76
diff --git a/tapset-timers.cxx b/tapset-timers.cxx
665f76
index 10da17cda..503498c85 100644
665f76
--- a/tapset-timers.cxx
665f76
+++ b/tapset-timers.cxx
665f76
@@ -391,11 +391,11 @@ hrtimer_derived_probe_group::emit_module_refresh (systemtap_session& s)
665f76
   s.op->newline(+1) <<   "struct stap_hrtimer_probe* stp = &stap_hrtimer_probes[i];";
665f76
   // timer disabled, but condition says enabled?
665f76
   s.op->newline( 0) <<   "if (!stp->enabled && stp->probe->cond_enabled) {";
665f76
-  s.op->newline(+1) <<     "dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
665f76
+  s.op->newline(+1) <<     "//dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
665f76
   s.op->newline( 0) <<     "_stp_hrtimer_start(stp);";
665f76
   // timer enabled, but condition says disabled?
665f76
   s.op->newline(-1) <<   "} else if (stp->enabled && !stp->probe->cond_enabled) {";
665f76
-  s.op->newline(+1) <<     "dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
665f76
+  s.op->newline(+1) <<     "//dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
665f76
   s.op->newline( 0) <<     "_stp_hrtimer_cancel(stp);";
665f76
   s.op->newline(-1) <<   "}";
665f76
   s.op->newline( 0) <<   "stp->enabled = stp->probe->cond_enabled;";
665f76
commit 7187dcf39412fcb25c432d318be8e49a6051f055
665f76
Author: Sultan Alsawaf <sultan@openresty.com>
665f76
Date:   Thu Dec 3 12:57:34 2020 -0800
665f76
665f76
    runtime: fix print races in IRQ context and during print cleanup
665f76
    
665f76
    Prints can race when there's a print called from IRQ context or a print
665f76
    called while print cleanup takes place, which can lead to garbled print
665f76
    messages, out-of-bounds memory accesses, and memory use-after-free. This
665f76
    is one example of racy modification of the print buffer len in IRQ
665f76
    context which caused a panic due to an out-of-bounds memory access:
665f76
    
665f76
    BUG: unable to handle kernel paging request at ffffe8ffff621000
665f76
    IP: [<ffffffffc05da0f3>] _stp_vsprint_memory+0x83/0x950 [stap_2c44636dfda18135ca3012a752599da6_13_533]
665f76
    PGD 174b90067 PUD 174b8f067 PMD 174b93067 PTE 0
665f76
    Oops: 0002 [#1] SMP
665f76
    CPU: 12 PID: 3468 Comm: cat Kdump: loaded Tainted: G           OE  ------------   3.10.0-1127.19.1.el7.x86_64.debug #1
665f76
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014
665f76
    task: ffff88001f4f0000 ti: ffff88004ea5c000 task.ti: ffff88004ea5c000
665f76
    RIP: 0010:[<ffffffffc05da0f3>]  [<ffffffffc05da0f3>] _stp_vsprint_memory+0x83/0x950 [stap_2c44636dfda18135ca3012a752599da6_13_533]
665f76
    RSP: 0018:ffff88004ea5f9a8  EFLAGS: 00010082
665f76
    RAX: ffffe8ffff621001 RBX: ffffe8ffff620ff2 RCX: fffffffffffffffe
665f76
    RDX: 000000000000006e RSI: ffffffffffffffff RDI: ffffc90002c23730
665f76
    RBP: ffff88004ea5fa28 R08: 00000000ffffffff R09: 0000000000000073
665f76
    R10: ffffc90002c243d7 R11: 0000000000000001 R12: ffffc90002c2373f
665f76
    R13: ffffe8ffff621004 R14: 0000000000000012 R15: 00000000fffffffe
665f76
    FS:  00007f8a9b1d4740(0000) GS:ffff880179e00000(0000) knlGS:0000000000000000
665f76
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
665f76
    CR2: ffffe8ffff621000 CR3: 00000000b3e3c000 CR4: 0000000000360fe0
665f76
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
665f76
    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
665f76
    Call Trace:
665f76
     [<ffffffff8103eb89>] ? sched_clock+0x9/0x10
665f76
     [<ffffffff8114036f>] ? lock_release_holdtime.part.30+0xf/0x1a0
665f76
     [<ffffffffc05dcb80>] function___global_trace__overload_0+0x5b0/0x1220 [stap_2c44636dfda18135ca3012a752599da6_13_533]
665f76
     [<ffffffffc05d8993>] ? stp_lock_probe+0x53/0xe0 [stap_2c44636dfda18135ca3012a752599da6_13_533]
665f76
     [<ffffffff8188d879>] ? kretprobe_trampoline_holder+0x9/0x9
665f76
     [<ffffffffc05e0662>] probe_7118+0x82/0xe0 [stap_2c44636dfda18135ca3012a752599da6_13_533]
665f76
     [<ffffffffc05de866>] enter_kretprobe_common+0x256/0x490 [stap_2c44636dfda18135ca3012a752599da6_13_533]
665f76
     [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
665f76
     [<ffffffffc05dead0>] enter_kretprobe_probe+0x10/0x20 [stap_2c44636dfda18135ca3012a752599da6_13_533]
665f76
     [<ffffffff8188e1d8>] trampoline_handler+0x148/0x220
665f76
     [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
665f76
     [<ffffffff8188d89e>] kretprobe_trampoline+0x25/0x57
665f76
     [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
665f76
     [<ffffffff8188d879>] kretprobe_trampoline_holder+0x9/0x9
665f76
     [<ffffffff81384702>] ? security_inode_permission+0x22/0x30
665f76
     [<ffffffff813489a0>] ? sysctl_head_finish+0x50/0x50
665f76
     [<ffffffff812ac11d>] vfs_open+0x5d/0xb0
665f76
     [<ffffffff812bb74a>] ? may_open+0x5a/0x120
665f76
     [<ffffffff812c0af5>] do_last+0x285/0x15b0
665f76
     [<ffffffff812bf18e>] ? link_path_walk+0x27e/0x8c0
665f76
     [<ffffffff812c1ef0>] path_openat+0xd0/0x5d0
665f76
     [<ffffffff8107a7f3>] ? kvm_clock_read+0x33/0x40
665f76
     [<ffffffff812c38ad>] do_filp_open+0x4d/0xb0
665f76
     [<ffffffff81889497>] ? _raw_spin_unlock+0x27/0x40
665f76
     [<ffffffff812d5a9b>] ? __alloc_fd+0xfb/0x270
665f76
     [<ffffffff812ad784>] do_sys_open+0x124/0x220
665f76
     [<ffffffff812ad89e>] SyS_open+0x1e/0x20
665f76
     [<ffffffff8188d879>] kretprobe_trampoline_holder+0x9/0x9
665f76
    
665f76
    This patch resolves the IRQ print races by disabling IRQs on the local
665f76
    CPU when accessing said CPU's print buffer, and resolves the cleanup
665f76
    races with a lock. We also protect against data corruption and panics
665f76
    from prints inside NMIs now by checking if the current CPU was accessing
665f76
    the log buffer when an NMI fired; in this case, the NMI's prints will be
665f76
    dropped, as there is no way to safely service them without creating a
665f76
    dedicated log buffer for them. This is achieved by forbidding reentrancy
665f76
    with respect to _stp_print_trylock_irqsave() when the runtime context
665f76
    isn't held. Reentrancy is otherwise allowed when the runtime context is
665f76
    held because the runtime context provides reentrancy protection.
665f76
665f76
diff --git a/runtime/linux/io.c b/runtime/linux/io.c
665f76
index 74a032c52..122708e2a 100644
665f76
--- a/runtime/linux/io.c
665f76
+++ b/runtime/linux/io.c
665f76
@@ -20,9 +20,6 @@
665f76
 
665f76
 #define WARN_STRING "WARNING: "
665f76
 #define ERR_STRING "ERROR: "
665f76
-#if (STP_LOG_BUF_LEN < 10) /* sizeof(WARN_STRING) */
665f76
-#error "STP_LOG_BUF_LEN is too short"
665f76
-#endif
665f76
 
665f76
 enum code { INFO=0, WARN, ERROR, DBUG };
665f76
 
665f76
@@ -31,25 +28,37 @@ static void _stp_vlog (enum code type, const char *func, int line, const char *f
665f76
 
665f76
 static void _stp_vlog (enum code type, const char *func, int line, const char *fmt, va_list args)
665f76
 {
665f76
+	struct _stp_log *log;
665f76
+	unsigned long flags;
665f76
+	size_t bytes_avail;
665f76
 	int num;
665f76
-	char *buf = per_cpu_ptr(Stp_lbuf, get_cpu());
665f76
+	char *buf;
665f76
 	int start = 0;
665f76
 
665f76
+	if (!_stp_print_trylock_irqsave(&flags))
665f76
+		return;
665f76
+
665f76
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
+	bytes_avail = STP_BUFFER_SIZE - log->len;
665f76
+	if (unlikely(!bytes_avail))
665f76
+		goto err_unlock;
665f76
+
665f76
+	buf = &log->buf[log->len];
665f76
 	if (type == DBUG) {
665f76
-		start = _stp_snprintf(buf, STP_LOG_BUF_LEN, "%s:%d: ", func, line);
665f76
+		start = _stp_snprintf(buf, bytes_avail, "%s:%d: ", func, line);
665f76
 	} else if (type == WARN) {
665f76
-		/* This strcpy() is OK, since we know STP_LOG_BUF_LEN
665f76
-		 * is > sizeof(WARN_STRING). */
665f76
-		strcpy (buf, WARN_STRING);
665f76
-		start = sizeof(WARN_STRING) - 1;
665f76
+		strncpy(buf, WARN_STRING, bytes_avail);
665f76
+		start = min(bytes_avail, sizeof(WARN_STRING) - 1);
665f76
 	} else if (type == ERROR) {
665f76
-		/* This strcpy() is OK, since we know STP_LOG_BUF_LEN
665f76
-		 * is > sizeof(ERR_STRING) (which is < sizeof(WARN_STRING). */
665f76
-		strcpy (buf, ERR_STRING);
665f76
-		start = sizeof(ERR_STRING) - 1;
665f76
+		strncpy(buf, ERR_STRING, bytes_avail);
665f76
+		start = min(bytes_avail, sizeof(ERR_STRING) - 1);
665f76
 	}
665f76
 
665f76
-	num = vscnprintf (buf + start, STP_LOG_BUF_LEN - start - 1, fmt, args);
665f76
+	bytes_avail -= start;
665f76
+	if (unlikely(!bytes_avail))
665f76
+		goto err_unlock;
665f76
+
665f76
+	num = vscnprintf(buf + start, bytes_avail - 1, fmt, args);
665f76
 	if (num + start) {
665f76
 		if (buf[num + start - 1] != '\n') {
665f76
 			buf[num + start] = '\n';
665f76
@@ -66,12 +75,13 @@ static void _stp_vlog (enum code type, const char *func, int line, const char *f
665f76
 		if (type != DBUG) {
665f76
 			_stp_ctl_send(STP_OOB_DATA, buf, start + num + 1);
665f76
 		} else {
665f76
-			_stp_print(buf);
665f76
-			_stp_print_flush();
665f76
+			log->len += start + num;
665f76
+			__stp_print_flush(log);
665f76
 		}
665f76
 #endif
665f76
 	}
665f76
-	put_cpu();
665f76
+err_unlock:
665f76
+	_stp_print_unlock_irqrestore(&flags);
665f76
 }
665f76
 
665f76
 /** Prints warning.
665f76
diff --git a/runtime/linux/print.c b/runtime/linux/print.c
665f76
index 777bca8b0..2aa2f1c8d 100644
665f76
--- a/runtime/linux/print.c
665f76
+++ b/runtime/linux/print.c
665f76
@@ -35,84 +35,179 @@
665f76
  * @{
665f76
  */
665f76
 
665f76
-typedef struct __stp_pbuf {
665f76
-	uint32_t len;			/* bytes used in the buffer */
665f76
+struct _stp_log {
665f76
+	unsigned int len; /* Bytes used in the buffer */
665f76
 	char buf[STP_BUFFER_SIZE];
665f76
-} _stp_pbuf;
665f76
+	atomic_t reentrancy_lock;
665f76
+};
665f76
+#include "print_flush.c"
665f76
 
665f76
-static void *Stp_pbuf = NULL;
665f76
+static struct _stp_log *_stp_log_pcpu;
665f76
+
665f76
+/*
665f76
+ * An atomic counter is used to synchronize every possible print buffer usage
665f76
+ * with the _stp_print_cleanup() function. The cleanup function sets the counter
665f76
+ * to INT_MAX after waiting for everything using the print buffer to finish. We
665f76
+ * cannot use a lock primitive to implement this because lock_acquire() contains
665f76
+ * tracepoints and print statements are used both inside and outside of probes.
665f76
+ * If the lock were only used inside probes, the runtime context would protect
665f76
+ * us from recursing into the lock_acquire() tracepoints and deadlocking. We
665f76
+ * instead use _stp_print_ctr as if it were a read-write lock.
665f76
+ */
665f76
+static atomic_t _stp_print_ctr = ATOMIC_INIT(0);
665f76
 
665f76
-/** private buffer for _stp_vlog() */
665f76
-#ifndef STP_LOG_BUF_LEN
665f76
-#define STP_LOG_BUF_LEN 256
665f76
-#endif
665f76
+/*
665f76
+ * This disables IRQs to make per-CPU print buffer accesses atomic. There is a
665f76
+ * reentrancy protection mechanism specifically for NMIs, since they can violate
665f76
+ * our atomic guarantee. Reentrancy is otherwise allowed within code sections
665f76
+ * that have the runtime context held (via _stp_runtime_entryfn_get_context()).
665f76
+ */
665f76
+static bool _stp_print_trylock_irqsave(unsigned long *flags)
665f76
+{
665f76
+	bool context_held = false;
665f76
+	struct _stp_log *log;
665f76
+
665f76
+	local_irq_save(*flags);
665f76
+	if (!atomic_add_unless(&_stp_print_ctr, 1, INT_MAX))
665f76
+		goto irq_restore;
665f76
+
665f76
+	/*
665f76
+	 * Check the per-CPU reentrancy lock for contention, unless the runtime
665f76
+	 * context is already held, in which case we already have reentrancy
665f76
+	 * protection. Otherwise, if the reentrancy lock is contended, that
665f76
+	 * means we're either inside an NMI that fired while the current CPU was
665f76
+	 * accessing the log buffer, or something is trying to nest calls to
665f76
+	 * _stp_print_trylock_irqsave(). Our only choice is to reject the log
665f76
+	 * access attempt in this case because log buffer corruption and panics
665f76
+	 * could ensue if we're inside an NMI.
665f76
+	 */
665f76
+	if (_stp_runtime_context_trylock()) {
665f76
+		struct context *c = _stp_runtime_get_context();
665f76
+		context_held = c && atomic_read(&c->busy);
665f76
+		_stp_runtime_context_unlock();
665f76
+	}
665f76
 
665f76
-typedef char _stp_lbuf[STP_LOG_BUF_LEN];
665f76
-static void *Stp_lbuf = NULL;
665f76
+	/* Fall back onto the reentrancy lock if the context isn't held */
665f76
+	if (!context_held) {
665f76
+		log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
+		if (atomic_cmpxchg(&log->reentrancy_lock, 0, 1))
665f76
+			goto print_unlock;
665f76
+	}
665f76
+
665f76
+	return true;
665f76
+
665f76
+print_unlock:
665f76
+	atomic_dec(&_stp_print_ctr);
665f76
+irq_restore:
665f76
+	local_irq_restore(*flags);
665f76
+	return false;
665f76
+}
665f76
+
665f76
+static void _stp_print_unlock_irqrestore(unsigned long *flags)
665f76
+{
665f76
+	bool context_held = false;
665f76
+	struct _stp_log *log;
665f76
+
665f76
+	if (_stp_runtime_context_trylock()) {
665f76
+		struct context *c = _stp_runtime_get_context();
665f76
+		context_held = c && atomic_read(&c->busy);
665f76
+		_stp_runtime_context_unlock();
665f76
+	}
665f76
+
665f76
+	if (!context_held) {
665f76
+		log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
+		atomic_set(&log->reentrancy_lock, 0);
665f76
+	}
665f76
+
665f76
+	atomic_dec(&_stp_print_ctr);
665f76
+	local_irq_restore(*flags);
665f76
+}
665f76
 
665f76
 /* create percpu print and io buffers */
665f76
 static int _stp_print_init (void)
665f76
 {
665f76
-	Stp_pbuf = _stp_alloc_percpu(sizeof(_stp_pbuf));
665f76
-	if (unlikely(Stp_pbuf == 0))
665f76
-		return -1;
665f76
-
665f76
-	/* now initialize IO buffer used in io.c */
665f76
-	Stp_lbuf = _stp_alloc_percpu(sizeof(_stp_lbuf));
665f76
-	if (unlikely(Stp_lbuf == 0)) {
665f76
-		_stp_free_percpu(Stp_pbuf);
665f76
-		return -1;
665f76
+	unsigned int cpu;
665f76
+
665f76
+	_stp_log_pcpu = _stp_alloc_percpu(sizeof(*_stp_log_pcpu));
665f76
+	if (!_stp_log_pcpu)
665f76
+		return -ENOMEM;
665f76
+
665f76
+	for_each_possible_cpu(cpu) {
665f76
+		struct _stp_log *log = per_cpu_ptr(_stp_log_pcpu, cpu);
665f76
+
665f76
+		log->reentrancy_lock = (atomic_t)ATOMIC_INIT(0);
665f76
 	}
665f76
 	return 0;
665f76
 }
665f76
 
665f76
 static void _stp_print_cleanup (void)
665f76
 {
665f76
-	if (Stp_pbuf)
665f76
-		_stp_free_percpu(Stp_pbuf);
665f76
-	if (Stp_lbuf)
665f76
-		_stp_free_percpu(Stp_lbuf);
665f76
-}
665f76
+	unsigned int cpu;
665f76
 
665f76
-#include "print_flush.c"
665f76
+	/* Wait for the loggers to finish modifying the print buffers */
665f76
+	while (atomic_cmpxchg(&_stp_print_ctr, 0, INT_MAX))
665f76
+		cpu_relax();
665f76
+
665f76
+	for_each_possible_cpu(cpu) {
665f76
+		struct _stp_log *log = per_cpu_ptr(_stp_log_pcpu, cpu);
665f76
+
665f76
+		/*
665f76
+		 * Flush anything that could be left in the print buffer. It is
665f76
+		 * safe to do this without any kind of synchronization mechanism
665f76
+		 * because nothing is using this print buffer anymore.
665f76
+		 */
665f76
+		__stp_print_flush(log);
665f76
+	}
665f76
+
665f76
+	_stp_free_percpu(_stp_log_pcpu);
665f76
+}
665f76
 
665f76
 static inline void _stp_print_flush(void)
665f76
 {
665f76
-	stp_print_flush(per_cpu_ptr(Stp_pbuf, smp_processor_id()));
665f76
+	struct _stp_log *log;
665f76
+	unsigned long flags;
665f76
+
665f76
+	if (!_stp_print_trylock_irqsave(&flags))
665f76
+		return;
665f76
+
665f76
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
+	__stp_print_flush(log);
665f76
+	_stp_print_unlock_irqrestore(&flags);
665f76
 }
665f76
+
665f76
 #ifndef STP_MAXBINARYARGS
665f76
 #define STP_MAXBINARYARGS 127
665f76
 #endif
665f76
 
665f76
 
665f76
-/** Reserves space in the output buffer for direct I/O.
665f76
+/** Reserves space in the output buffer for direct I/O. Must be called with
665f76
+ * _stp_print_trylock_irqsave() held.
665f76
  */
665f76
 static void * _stp_reserve_bytes (int numbytes)
665f76
 {
665f76
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
665f76
-	int size = STP_BUFFER_SIZE - pb->len;
665f76
-	void * ret;
665f76
+	struct _stp_log *log;
665f76
+	char *ret;
665f76
 
665f76
 	if (unlikely(numbytes == 0 || numbytes > STP_BUFFER_SIZE))
665f76
 		return NULL;
665f76
 
665f76
-	if (unlikely(numbytes > size))
665f76
-		_stp_print_flush();
665f76
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
+	if (unlikely(numbytes > (STP_BUFFER_SIZE - log->len)))
665f76
+		__stp_print_flush(log);
665f76
 
665f76
-	ret = pb->buf + pb->len;
665f76
-	pb->len += numbytes;
665f76
+	ret = &log->buf[log->len];
665f76
+	log->len += numbytes;
665f76
 	return ret;
665f76
 }
665f76
 
665f76
 
665f76
 static void _stp_unreserve_bytes (int numbytes)
665f76
 {
665f76
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
665f76
-
665f76
-	if (unlikely(numbytes == 0 || numbytes > pb->len))
665f76
-		return;
665f76
+	struct _stp_log *log;
665f76
 
665f76
-	pb->len -= numbytes;
665f76
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
+	if (numbytes <= log->len)
665f76
+		log->len -= numbytes;
665f76
 }
665f76
 
665f76
 /** Write 64-bit args directly into the output stream.
665f76
@@ -123,22 +218,25 @@ static void _stp_unreserve_bytes (int numbytes)
665f76
  */
665f76
 static void _stp_print_binary (int num, ...)
665f76
 {
665f76
+	unsigned long flags;
665f76
 	va_list vargs;
665f76
 	int i;
665f76
 	int64_t *args;
665f76
-	
665f76
+
665f76
 	if (unlikely(num > STP_MAXBINARYARGS))
665f76
 		num = STP_MAXBINARYARGS;
665f76
 
665f76
-	args = _stp_reserve_bytes(num * sizeof(int64_t));
665f76
+	if (!_stp_print_trylock_irqsave(&flags))
665f76
+		return;
665f76
 
665f76
-	if (likely(args != NULL)) {
665f76
+	args = _stp_reserve_bytes(num * sizeof(int64_t));
665f76
+	if (args) {
665f76
 		va_start(vargs, num);
665f76
-		for (i = 0; i < num; i++) {
665f76
+		for (i = 0; i < num; i++)
665f76
 			args[i] = va_arg(vargs, int64_t);
665f76
-		}
665f76
 		va_end(vargs);
665f76
 	}
665f76
+	_stp_print_unlock_irqrestore(&flags);
665f76
 }
665f76
 
665f76
 /** Print into the print buffer.
665f76
@@ -149,6 +247,7 @@ static void _stp_print_binary (int num, ...)
665f76
 static void _stp_printf (const char *fmt, ...)
665f76
 {
665f76
 	va_list args;
665f76
+
665f76
 	va_start(args, fmt);
665f76
 	_stp_vsnprintf(NULL, 0, fmt, args);
665f76
 	va_end(args);
665f76
@@ -160,37 +259,36 @@ static void _stp_printf (const char *fmt, ...)
665f76
 
665f76
 static void _stp_print (const char *str)
665f76
 {
665f76
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
665f76
-	char *end = pb->buf + STP_BUFFER_SIZE;
665f76
-	char *ptr = pb->buf + pb->len;
665f76
-	char *instr = (char *)str;
665f76
-
665f76
-	while (ptr < end && *instr)
665f76
-		*ptr++ = *instr++;
665f76
-
665f76
-	/* Did loop terminate due to lack of buffer space? */
665f76
-	if (unlikely(*instr)) {
665f76
-		/* Don't break strings across subbufs. */
665f76
-		/* Restart after flushing. */
665f76
-		_stp_print_flush();
665f76
-		end = pb->buf + STP_BUFFER_SIZE;
665f76
-		ptr = pb->buf + pb->len;
665f76
-		instr = (char *)str;
665f76
-		while (ptr < end && *instr)
665f76
-			*ptr++ = *instr++;
665f76
+	struct _stp_log *log;
665f76
+	unsigned long flags;
665f76
+
665f76
+	if (!_stp_print_trylock_irqsave(&flags))
665f76
+		return;
665f76
+
665f76
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
+	while (1) {
665f76
+		while (log->len < STP_BUFFER_SIZE && *str)
665f76
+			log->buf[log->len++] = *str++;
665f76
+		if (likely(!*str))
665f76
+			break;
665f76
+		__stp_print_flush(log);
665f76
 	}
665f76
-	pb->len = ptr - pb->buf;
665f76
+	_stp_print_unlock_irqrestore(&flags);
665f76
 }
665f76
 
665f76
 static void _stp_print_char (const char c)
665f76
 {
665f76
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
665f76
-	int size = STP_BUFFER_SIZE - pb->len;
665f76
-	if (unlikely(1 >= size))
665f76
-		_stp_print_flush();
665f76
-	
665f76
-	pb->buf[pb->len] = c;
665f76
-	pb->len ++;
665f76
+	struct _stp_log *log;
665f76
+	unsigned long flags;
665f76
+
665f76
+	if (!_stp_print_trylock_irqsave(&flags))
665f76
+		return;
665f76
+
665f76
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
+	if (unlikely(log->len == STP_BUFFER_SIZE))
665f76
+		__stp_print_flush(log);
665f76
+	log->buf[log->len++] = c;
665f76
+	_stp_print_unlock_irqrestore(&flags);
665f76
 }
665f76
 
665f76
 static void _stp_print_kernel_info(char *sname, char *vstr, int ctx, int num_probes)
665f76
diff --git a/runtime/print.h b/runtime/print.h
665f76
index ede71f033..ffdea594d 100644
665f76
--- a/runtime/print.h
665f76
+++ b/runtime/print.h
665f76
@@ -10,6 +10,9 @@
665f76
 #ifndef _STP_PRINT_H_
665f76
 #define _STP_PRINT_H_
665f76
 
665f76
+/* The lock must be held with IRQs disabled to do any printing */
665f76
+static bool _stp_print_trylock_irqsave(unsigned long *flags);
665f76
+static void _stp_print_unlock_irqrestore(unsigned long *flags);
665f76
 static int _stp_print_init(void);
665f76
 static void _stp_print_cleanup(void);
665f76
 static void *_stp_reserve_bytes(int numbytes);
665f76
diff --git a/runtime/print_flush.c b/runtime/print_flush.c
665f76
index cf40a2645..acd6a32d9 100644
665f76
--- a/runtime/print_flush.c
665f76
+++ b/runtime/print_flush.c
665f76
@@ -13,40 +13,31 @@
665f76
  * is filled, or this is called. This MUST be called before returning
665f76
  * from a probe or accumulated output in the print buffer will be lost.
665f76
  *
665f76
- * @note Preemption must be disabled to use this.
665f76
+ * @note Interrupts must be disabled to use this.
665f76
  */
665f76
 
665f76
-static STP_DEFINE_SPINLOCK(_stp_print_lock);
665f76
-
665f76
-void stp_print_flush(_stp_pbuf *pb)
665f76
+static void __stp_print_flush(struct _stp_log *log)
665f76
 {
665f76
-	size_t len = pb->len;
665f76
+	size_t len = log->len;
665f76
 	void *entry = NULL;
665f76
 
665f76
 	/* check to see if there is anything in the buffer */
665f76
 	if (likely(len == 0))
665f76
 		return;
665f76
 
665f76
-	pb->len = 0;
665f76
-
665f76
-	if (unlikely(_stp_transport_get_state() != STP_TRANSPORT_RUNNING))
665f76
-		return;
665f76
+	log->len = 0;
665f76
 
665f76
 	dbug_trans(1, "len = %zu\n", len);
665f76
 
665f76
 #ifdef STP_BULKMODE
665f76
 #ifdef NO_PERCPU_HEADERS
665f76
 	{
665f76
-		struct context* __restrict__ c = NULL;
665f76
-		char *bufp = pb->buf;
665f76
+		char *bufp = log->buf;
665f76
 		int inode_locked;
665f76
 
665f76
-		c = _stp_runtime_entryfn_get_context();
665f76
-
665f76
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
665f76
 			atomic_inc (&_stp_transport_failures);
665f76
 #ifndef STP_TRANSPORT_RISKY
665f76
-			_stp_runtime_entryfn_put_context(c);
665f76
 			return;
665f76
 #endif
665f76
 		}
665f76
@@ -70,26 +61,20 @@ void stp_print_flush(_stp_pbuf *pb)
665f76
 
665f76
 		if (inode_locked)
665f76
 			_stp_transport_unlock_relay_inode();
665f76
-
665f76
-		_stp_runtime_entryfn_put_context(c);
665f76
 	}
665f76
 
665f76
 #else  /* !NO_PERCPU_HEADERS */
665f76
 
665f76
 	{
665f76
-		struct context* __restrict__ c = NULL;
665f76
-		char *bufp = pb->buf;
665f76
+		char *bufp = log->buf;
665f76
 		struct _stp_trace t = {	.sequence = _stp_seq_inc(),
665f76
 					.pdu_len = len};
665f76
 		size_t bytes_reserved;
665f76
 		int inode_locked;
665f76
 
665f76
-		c = _stp_runtime_entryfn_get_context();
665f76
-
665f76
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
665f76
 			atomic_inc (&_stp_transport_failures);
665f76
 #ifndef STP_TRANSPORT_RISKY
665f76
-			_stp_runtime_entryfn_put_context(c);
665f76
 			return;
665f76
 #endif
665f76
 		}
665f76
@@ -124,48 +109,24 @@ void stp_print_flush(_stp_pbuf *pb)
665f76
 
665f76
 		if (inode_locked)
665f76
 			_stp_transport_unlock_relay_inode();
665f76
-
665f76
-		_stp_runtime_entryfn_put_context(c);
665f76
 	}
665f76
 #endif /* !NO_PERCPU_HEADERS */
665f76
 
665f76
 #else  /* !STP_BULKMODE */
665f76
 
665f76
 	{
665f76
-		unsigned long flags;
665f76
-		struct context* __restrict__ c = NULL;
665f76
-		char *bufp = pb->buf;
665f76
+		char *bufp = log->buf;
665f76
 		int inode_locked;
665f76
 
665f76
-		/* Prevent probe reentrancy on _stp_print_lock.
665f76
-		 *
665f76
-		 * Since stp_print_flush may be called from probe context, we
665f76
-		 * have to make sure that its lock, _stp_print_lock, can't
665f76
-		 * possibly be held outside probe context too.  We ensure this
665f76
-		 * by grabbing the context here, so any probe triggered by this
665f76
-		 * region will appear reentrant and be skipped rather than
665f76
-		 * deadlock.  Failure to get_context just means we're already
665f76
-		 * in a probe, which is fine.
665f76
-		 *
665f76
-		 * (see also _stp_ctl_send for a similar situation)
665f76
-                 *
665f76
-                 * A better solution would be to replace this
665f76
-                 * concurrency-control-laden effort with a lockless
665f76
-                 * algorithm.
665f76
-		 */
665f76
-		c = _stp_runtime_entryfn_get_context();
665f76
-
665f76
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
665f76
 			atomic_inc (&_stp_transport_failures);
665f76
 #ifndef STP_TRANSPORT_RISKY
665f76
 			dbug_trans(0, "discarding %zu bytes of data\n", len);
665f76
-			_stp_runtime_entryfn_put_context(c);
665f76
 			return;
665f76
 #endif
665f76
 		}
665f76
 
665f76
 		dbug_trans(1, "calling _stp_data_write...\n");
665f76
-		stp_spin_lock_irqsave(&_stp_print_lock, flags);
665f76
 		while (len > 0) {
665f76
 			size_t bytes_reserved;
665f76
 
665f76
@@ -182,12 +143,9 @@ void stp_print_flush(_stp_pbuf *pb)
665f76
 			    break;
665f76
 			}
665f76
 		}
665f76
-		stp_spin_unlock_irqrestore(&_stp_print_lock, flags);
665f76
 
665f76
 		if (inode_locked)
665f76
 			_stp_transport_unlock_relay_inode();
665f76
-
665f76
-		_stp_runtime_entryfn_put_context(c);
665f76
 	}
665f76
 #endif /* !STP_BULKMODE */
665f76
 }
665f76
diff --git a/runtime/stack.c b/runtime/stack.c
665f76
index 241ccf793..da23d4395 100644
665f76
--- a/runtime/stack.c
665f76
+++ b/runtime/stack.c
665f76
@@ -690,13 +690,20 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c,
665f76
 	 * then call _stp_stack_print,
665f76
 	 * then copy the result into the output string
665f76
 	 * and clear the print buffer. */
665f76
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
665f76
-	_stp_print_flush();
665f76
+	struct _stp_log *log;
665f76
+	unsigned long flags;
665f76
 
665f76
-	_stp_stack_kernel_print(c, sym_flags);
665f76
+	if (!_stp_print_trylock_irqsave(&flags)) {
665f76
+		*str = '\0';
665f76
+		return;
665f76
+	}
665f76
 
665f76
-	strlcpy(str, pb->buf, size < (int)pb->len ? size : (int)pb->len);
665f76
-	pb->len = 0;
665f76
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
+	__stp_print_flush(log);
665f76
+	_stp_stack_kernel_print(c, sym_flags);
665f76
+	strlcpy(str, log->buf, min_t(int, size, log->len));
665f76
+	log->len = 0;
665f76
+	_stp_print_unlock_irqrestore(&flags);
665f76
 }
665f76
 
665f76
 static void _stp_stack_user_sprint(char *str, int size, struct context* c,
665f76
@@ -707,13 +714,20 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c,
665f76
 	 * then call _stp_stack_print,
665f76
 	 * then copy the result into the output string
665f76
 	 * and clear the print buffer. */
665f76
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
665f76
-	_stp_print_flush();
665f76
+	struct _stp_log *log;
665f76
+	unsigned long flags;
665f76
 
665f76
-	_stp_stack_user_print(c, sym_flags);
665f76
+	if (!_stp_print_trylock_irqsave(&flags)) {
665f76
+		*str = '\0';
665f76
+		return;
665f76
+	}
665f76
 
665f76
-	strlcpy(str, pb->buf, size < (int)pb->len ? size : (int)pb->len);
665f76
-	pb->len = 0;
665f76
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
+	__stp_print_flush(log);
665f76
+	_stp_stack_user_print(c, sym_flags);
665f76
+	strlcpy(str, log->buf, min_t(int, size, log->len));
665f76
+	log->len = 0;
665f76
+	_stp_print_unlock_irqrestore(&flags);
665f76
 }
665f76
 
665f76
 #endif /* _STACK_C_ */
665f76
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
665f76
index 57955334b..44e69b68c 100644
665f76
--- a/runtime/transport/transport.c
665f76
+++ b/runtime/transport/transport.c
665f76
@@ -540,8 +540,8 @@ static void _stp_transport_close(void)
665f76
 		   current->pid);
665f76
 	_stp_cleanup_and_exit(0);
665f76
 	_stp_unregister_ctl_channel();
665f76
+	_stp_print_cleanup(); /* Requires the transport, so free this first */
665f76
 	_stp_transport_fs_close();
665f76
-	_stp_print_cleanup();	/* free print buffers */
665f76
 	_stp_mem_debug_done();
665f76
 
665f76
 	dbug_trans(1, "---- CLOSED ----\n");
665f76
diff --git a/runtime/vsprintf.c b/runtime/vsprintf.c
665f76
index 28fd18f16..417d9f7f3 100644
665f76
--- a/runtime/vsprintf.c
665f76
+++ b/runtime/vsprintf.c
665f76
@@ -542,6 +542,8 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
665f76
 				   number of chars for from string */
665f76
 	int qualifier;		/* 'h', 'l', or 'L' for integer fields */
665f76
 	int num_bytes = 0;
665f76
+	unsigned long irqflags = 0;
665f76
+	bool got_print_lock = false;
665f76
 
665f76
 	/* Reject out-of-range values early */
665f76
 	if (unlikely((int) size < 0))
665f76
@@ -724,11 +726,14 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
665f76
 	    num_bytes = STP_BUFFER_SIZE;
665f76
 	  }
665f76
 
665f76
+	  if (!_stp_print_trylock_irqsave(&irqflags))
665f76
+	    return 0;
665f76
 	  str = (char*)_stp_reserve_bytes(num_bytes);
665f76
 	  if (str == NULL) {
665f76
 	    _stp_error("Couldn't reserve any print buffer space\n");
665f76
-	    return 0;
665f76
+	    goto err_unlock;
665f76
 	  }
665f76
+	  got_print_lock = true;
665f76
 	  size = num_bytes;
665f76
 	  end = str + size - 1;
665f76
 
665f76
@@ -820,8 +825,10 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
665f76
 					field_width, precision,
665f76
 					*fmt, flags);
665f76
 			if (unlikely(str == NULL)) {
665f76
-				if (num_bytes > 0)
665f76
+				if (num_bytes > 0) {
665f76
 					_stp_unreserve_bytes(num_bytes);
665f76
+					goto err_unlock;
665f76
+				}
665f76
 				return 0;
665f76
 			}
665f76
 			continue;
665f76
@@ -923,7 +930,14 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
665f76
                   /* don't write out a null byte if the buf size is zero */
665f76
                   *end = '\0';
665f76
 	}
665f76
+
665f76
+	if (got_print_lock)
665f76
+		_stp_print_unlock_irqrestore(&irqflags);
665f76
 	return str-buf;
665f76
+
665f76
+err_unlock:
665f76
+	_stp_print_unlock_irqrestore(&irqflags);
665f76
+	return 0;
665f76
 }
665f76
 
665f76
 #endif /* _VSPRINTF_C_ */
665f76
diff --git a/translate.cxx b/translate.cxx
665f76
index 53f1d0725..f0195486c 100644
665f76
--- a/translate.cxx
665f76
+++ b/translate.cxx
665f76
@@ -1354,6 +1354,9 @@ c_unparser::emit_compiled_printfs ()
665f76
       o->newline() << "unsigned long ptr_value;";
665f76
       o->newline() << "int num_bytes;";
665f76
 
665f76
+      if (print_to_stream)
665f76
+	  o->newline() << "unsigned long irqflags;";
665f76
+
665f76
       o->newline() << "(void) width;";
665f76
       o->newline() << "(void) precision;";
665f76
       o->newline() << "(void) ptr_value;";
665f76
@@ -1452,7 +1455,9 @@ c_unparser::emit_compiled_printfs ()
665f76
 	    }
665f76
 
665f76
 	  o->newline() << "num_bytes = clamp(num_bytes, 0, STP_BUFFER_SIZE);";
665f76
-	  o->newline() << "str = (char*)_stp_reserve_bytes(num_bytes);";
665f76
+	  o->newline() << "if (!_stp_print_trylock_irqsave(&irqflags))";
665f76
+	  o->newline(1) << "return;";
665f76
+	  o->newline(-1) << "str = (char*)_stp_reserve_bytes(num_bytes);";
665f76
 	  o->newline() << "end = str ? str + num_bytes - 1 : 0;";
665f76
         }
665f76
       else // !print_to_stream
665f76
@@ -1547,8 +1552,14 @@ c_unparser::emit_compiled_printfs ()
665f76
 	      o->newline() << "if (unlikely(str == NULL)) {";
665f76
 	      o->indent(1);
665f76
 	      if (print_to_stream)
665f76
+                {
665f76
 		  o->newline() << "_stp_unreserve_bytes(num_bytes);";
665f76
-	      o->newline() << "return;";
665f76
+	          o->newline() << "goto err_unlock;";
665f76
+                }
665f76
+              else
665f76
+                {
665f76
+	          o->newline() << "return;";
665f76
+                }
665f76
 	      o->newline(-1) << "}";
665f76
 	      break;
665f76
 
665f76
@@ -1575,6 +1586,11 @@ c_unparser::emit_compiled_printfs ()
665f76
 
665f76
       o->newline(-1) << "}";
665f76
 
665f76
+      if (print_to_stream)
665f76
+        {
665f76
+          o->newline(-1) << "err_unlock:";
665f76
+          o->newline(1) << "_stp_print_unlock_irqrestore(&irqflags);";
665f76
+        }
665f76
       o->newline(-1) << "}";
665f76
     }
665f76
   o->newline() << "#endif // STP_LEGACY_PRINT";
665f76
commit e8c7a2067ec7fc6315ee9bc34a010ec5f0369c5c
665f76
Author: Frank Ch. Eigler <fche@redhat.com>
665f76
Date:   Fri Dec 4 19:33:22 2020 -0500
665f76
665f76
    testsuite pr14536.stp: toughen
665f76
    
665f76
    This test case stresses nesting of heavy duty processing (backtrace
665f76
    printing) within kernel interrupt processing paths.  It seems to
665f76
    sometimes trigger problems - so let's make the test harder to make
665f76
    latent problems show up more likely.  Instead of quitting after the
665f76
    first irq_* function hit, stick around for 10 seconds.
665f76
665f76
diff --git a/testsuite/systemtap.base/pr14546.stp b/testsuite/systemtap.base/pr14546.stp
665f76
index 3e59a6f3a..e4c205a8f 100644
665f76
--- a/testsuite/systemtap.base/pr14546.stp
665f76
+++ b/testsuite/systemtap.base/pr14546.stp
665f76
@@ -2,5 +2,6 @@ probe kernel.function("irq_*").call {
665f76
       x = 10; y = 10; z = 10; w = 10
665f76
       $1
665f76
       assert(!(x != 10 || y != 10 || z != 10 || w != 10), "memory got corrupted by " . @1)
665f76
-      exit()
665f76
 }
665f76
+
665f76
+probe timer.s(10) { exit () }
665f76
commit cd6399e621646856824ea96b11605a0f52011272
665f76
Author: Frank Ch. Eigler <fche@redhat.com>
665f76
Date:   Fri Dec 4 21:33:21 2020 -0500
665f76
665f76
    dyninst transport: add _stp_print_*lock_irq* stubs
665f76
    
665f76
    Recent code on the transport/linux side needs a few new (stub)
665f76
    functions and type decls.
665f76
665f76
diff --git a/runtime/dyninst/print.c b/runtime/dyninst/print.c
665f76
index 9d91224ba..c78def272 100644
665f76
--- a/runtime/dyninst/print.c
665f76
+++ b/runtime/dyninst/print.c
665f76
@@ -107,4 +107,18 @@ static void _stp_print_char (const char c)
665f76
 	}
665f76
 }
665f76
 
665f76
+
665f76
+/* no-op stub synchronization */
665f76
+static bool _stp_print_trylock_irqsave(unsigned long *flags)
665f76
+{
665f76
+        (void) flags;
665f76
+        return true;
665f76
+}
665f76
+
665f76
+static void _stp_print_unlock_irqrestore(unsigned long *flags)
665f76
+{
665f76
+        (void) flags;
665f76
+}
665f76
+
665f76
+
665f76
 #endif /* _STAPDYN_PRINT_C_ */
665f76
diff --git a/runtime/dyninst/runtime_defines.h b/runtime/dyninst/runtime_defines.h
665f76
index 5c3dec519..d00c76a21 100644
665f76
--- a/runtime/dyninst/runtime_defines.h
665f76
+++ b/runtime/dyninst/runtime_defines.h
665f76
@@ -7,3 +7,6 @@
665f76
 #define STAPCONF_PAGEFAULT_DISABLE  1
665f76
 #define pagefault_disable()
665f76
 #define pagefault_enable()
665f76
+
665f76
+typedef int bool;
665f76
+enum { false=0, true=1 };
665f76
commit fd93cf71df80f7bb5aae707ea5a5875727a85770
665f76
Author: Sultan Alsawaf <sultan@openresty.com>
665f76
Date:   Wed Dec 9 12:55:10 2020 -0800
665f76
665f76
    PR26844: fix off-by-one error when copying printed backtraces
665f76
    
665f76
    Since log->buf isn't null-terminated, log->len represents the total
665f76
    number of bytes present in the log buffer for copying. The use of
665f76
    strlcpy() here with log->len as its size results in log->len - 1 bytes
665f76
    being copied, with the log->len'nth byte of the output buffer being set
665f76
    to zero to terminate the string. Use memcpy() instead to remedy this,
665f76
    while ensuring that the output buffer has space for null termination,
665f76
    since the output buffer needs to be terminated.
665f76
665f76
diff --git a/runtime/stack.c b/runtime/stack.c
665f76
index da23d4395..85883d6c4 100644
665f76
--- a/runtime/stack.c
665f76
+++ b/runtime/stack.c
665f76
@@ -692,6 +692,7 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c,
665f76
 	 * and clear the print buffer. */
665f76
 	struct _stp_log *log;
665f76
 	unsigned long flags;
665f76
+	int bytes;
665f76
 
665f76
 	if (!_stp_print_trylock_irqsave(&flags)) {
665f76
 		*str = '\0';
665f76
@@ -701,7 +702,9 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c,
665f76
 	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
 	__stp_print_flush(log);
665f76
 	_stp_stack_kernel_print(c, sym_flags);
665f76
-	strlcpy(str, log->buf, min_t(int, size, log->len));
665f76
+	bytes = min_t(int, size - 1, log->len);
665f76
+	memcpy(str, log->buf, bytes);
665f76
+	str[bytes] = '\0';
665f76
 	log->len = 0;
665f76
 	_stp_print_unlock_irqrestore(&flags);
665f76
 }
665f76
@@ -716,6 +719,7 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c,
665f76
 	 * and clear the print buffer. */
665f76
 	struct _stp_log *log;
665f76
 	unsigned long flags;
665f76
+	int bytes;
665f76
 
665f76
 	if (!_stp_print_trylock_irqsave(&flags)) {
665f76
 		*str = '\0';
665f76
@@ -725,7 +729,9 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c,
665f76
 	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
665f76
 	__stp_print_flush(log);
665f76
 	_stp_stack_user_print(c, sym_flags);
665f76
-	strlcpy(str, log->buf, min_t(int, size, log->len));
665f76
+	bytes = min_t(int, size - 1, log->len);
665f76
+	memcpy(str, log->buf, bytes);
665f76
+	str[bytes] = '\0';
665f76
 	log->len = 0;
665f76
 	_stp_print_unlock_irqrestore(&flags);
665f76
 }
665f76
commit 8819e2a04596deb2fe427d261bebcaf3d2620dfb
665f76
Author: Sultan Alsawaf <sultan@openresty.com>
665f76
Date:   Wed Dec 9 17:22:20 2020 -0800
665f76
665f76
    always use per-cpu bulkmode relayfs files to communicate with userspace
665f76
    
665f76
    Using a mutex_trylock() in __stp_print_flush() leads to a lot of havoc,
665f76
    for numerous. Firstly, since __stp_print_flush() can be called from IRQ
665f76
    context, holding the inode mutex from here would make the mutex owner
665f76
    become nonsense, since mutex locks can only be held in contexts backed
665f76
    by the scheduler. Secondly, the mutex_trylock implementation has a
665f76
    spin_lock() inside of it that leads to two issues: IRQs aren't disabled
665f76
    when acquiring this spin_lock(), so using it from IRQ context can lead
665f76
    to a deadlock, and since spin locks can have tracepoints via
665f76
    lock_acquire(), the spin_lock() can recurse on itself inside a stap
665f76
    probe and deadlock, like so:
665f76
    
665f76
     #0 [ffff88017f6d7a08] kvm_wait at ffffffff81079f5a
665f76
     #1 [ffff88017f6d7a30] __pv_queued_spin_lock_slowpath at ffffffff8114f51e
665f76
     #2 [ffff88017f6d7a70] queued_spin_lock_slowpath at ffffffff810e842b
665f76
     #3 [ffff88017f6d7a80] mutex_trylock at ffffffff81882b1b
665f76
     #4 [ffff88017f6d7ab8] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
     #5 [ffff88017f6d7ad8] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
     #6 [ffff88017f6d7b10] probe_7879 at ffffffffc0a98c85 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
     #7 [ffff88017f6d7b38] enter_real_tracepoint_probe_1543 at ffffffffc0c3b757 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
     #8 [ffff88017f6d7b70] enter_tracepoint_probe_1543 at ffffffffc09b117e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
665f76
     #9 [ffff88017f6d7b80] lock_acquire at ffffffff811460ba
665f76
    
665f76
    The reason the mutex_trylock() was needed in the first place was because
665f76
    staprun doesn't properly use the relayfs API when reading buffers in
665f76
    non-bulk mode. It tries to read all CPUs' buffers from a single thread,
665f76
    when it should be reading each CPU's buffer from a thread running on
665f76
    said CPU in order to utilize relayfs' synchronization guarantees, which
665f76
    are made by disabling IRQs on the local CPU when a buffer is modified.
665f76
    
665f76
    This change makes staprun always use per-CPU threads to read print
665f76
    buffers so that we don't need the mutex_trylock() in the print flush
665f76
    routine, which resolves a wide variety of serious bugs.
665f76
    
665f76
    We also need to adjust the transport sub-buffer count to accommodate for
665f76
    frequent print flushing. The sub-buffer size is now reduced to match the
665f76
    log buffer size, which is 8192 by default, and the number of sub-buffers
665f76
    is increased to 256. This uses exactly the same amount of memory as
665f76
    before.
665f76
665f76
diff --git a/runtime/print_flush.c b/runtime/print_flush.c
665f76
index acd6a32d9..f4d72d30f 100644
665f76
--- a/runtime/print_flush.c
665f76
+++ b/runtime/print_flush.c
665f76
@@ -18,6 +18,7 @@
665f76
 
665f76
 static void __stp_print_flush(struct _stp_log *log)
665f76
 {
665f76
+	char *bufp = log->buf;
665f76
 	size_t len = log->len;
665f76
 	void *entry = NULL;
665f76
 
665f76
@@ -26,126 +27,20 @@ static void __stp_print_flush(struct _stp_log *log)
665f76
 		return;
665f76
 
665f76
 	log->len = 0;
665f76
-
665f76
 	dbug_trans(1, "len = %zu\n", len);
665f76
-
665f76
-#ifdef STP_BULKMODE
665f76
-#ifdef NO_PERCPU_HEADERS
665f76
-	{
665f76
-		char *bufp = log->buf;
665f76
-		int inode_locked;
665f76
-
665f76
-		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
665f76
-			atomic_inc (&_stp_transport_failures);
665f76
-#ifndef STP_TRANSPORT_RISKY
665f76
-			return;
665f76
-#endif
665f76
-		}
665f76
-
665f76
-		while (len > 0) {
665f76
-			size_t bytes_reserved;
665f76
-
665f76
-			bytes_reserved = _stp_data_write_reserve(len, &entry);
665f76
-			if (likely(entry && bytes_reserved > 0)) {
665f76
-				memcpy(_stp_data_entry_data(entry), bufp,
665f76
-				       bytes_reserved);
665f76
-				_stp_data_write_commit(entry);
665f76
-				bufp += bytes_reserved;
665f76
-				len -= bytes_reserved;
665f76
-			}
665f76
-			else {
665f76
-				atomic_inc(&_stp_transport_failures);
665f76
-				break;
665f76
-			}
665f76
-		}
665f76
-
665f76
-		if (inode_locked)
665f76
-			_stp_transport_unlock_relay_inode();
665f76
-	}
665f76
-
665f76
-#else  /* !NO_PERCPU_HEADERS */
665f76
-
665f76
-	{
665f76
-		char *bufp = log->buf;
665f76
-		struct _stp_trace t = {	.sequence = _stp_seq_inc(),
665f76
-					.pdu_len = len};
665f76
+	do {
665f76
 		size_t bytes_reserved;
665f76
-		int inode_locked;
665f76
 
665f76
-		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
665f76
-			atomic_inc (&_stp_transport_failures);
665f76
-#ifndef STP_TRANSPORT_RISKY
665f76
-			return;
665f76
-#endif
665f76
-		}
665f76
-
665f76
-		bytes_reserved = _stp_data_write_reserve(sizeof(struct _stp_trace), &entry);
665f76
-		if (likely(entry && bytes_reserved > 0)) {
665f76
-			/* prevent unaligned access by using memcpy() */
665f76
-			memcpy(_stp_data_entry_data(entry), &t, sizeof(t));
665f76
+		bytes_reserved = _stp_data_write_reserve(len, &entry);
665f76
+		if (likely(entry && bytes_reserved)) {
665f76
+			memcpy(_stp_data_entry_data(entry), bufp,
665f76
+			       bytes_reserved);
665f76
 			_stp_data_write_commit(entry);
665f76
-		}
665f76
-		else {
665f76
+			bufp += bytes_reserved;
665f76
+			len -= bytes_reserved;
665f76
+		} else {
665f76
 			atomic_inc(&_stp_transport_failures);
665f76
-			goto done;
665f76
+			break;
665f76
 		}
665f76
-
665f76
-		while (len > 0) {
665f76
-			bytes_reserved = _stp_data_write_reserve(len, &entry);
665f76
-			if (likely(entry && bytes_reserved > 0)) {
665f76
-				memcpy(_stp_data_entry_data(entry), bufp,
665f76
-				       bytes_reserved);
665f76
-				_stp_data_write_commit(entry);
665f76
-				bufp += bytes_reserved;
665f76
-				len -= bytes_reserved;
665f76
-			}
665f76
-			else {
665f76
-				atomic_inc(&_stp_transport_failures);
665f76
-				break;
665f76
-			}
665f76
-		}
665f76
-
665f76
-	done:
665f76
-
665f76
-		if (inode_locked)
665f76
-			_stp_transport_unlock_relay_inode();
665f76
-	}
665f76
-#endif /* !NO_PERCPU_HEADERS */
665f76
-
665f76
-#else  /* !STP_BULKMODE */
665f76
-
665f76
-	{
665f76
-		char *bufp = log->buf;
665f76
-		int inode_locked;
665f76
-
665f76
-		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
665f76
-			atomic_inc (&_stp_transport_failures);
665f76
-#ifndef STP_TRANSPORT_RISKY
665f76
-			dbug_trans(0, "discarding %zu bytes of data\n", len);
665f76
-			return;
665f76
-#endif
665f76
-		}
665f76
-
665f76
-		dbug_trans(1, "calling _stp_data_write...\n");
665f76
-		while (len > 0) {
665f76
-			size_t bytes_reserved;
665f76
-
665f76
-			bytes_reserved = _stp_data_write_reserve(len, &entry);
665f76
-			if (likely(entry && bytes_reserved > 0)) {
665f76
-				memcpy(_stp_data_entry_data(entry), bufp,
665f76
-				       bytes_reserved);
665f76
-				_stp_data_write_commit(entry);
665f76
-				bufp += bytes_reserved;
665f76
-				len -= bytes_reserved;
665f76
-			}
665f76
-			else {
665f76
-			    atomic_inc(&_stp_transport_failures);
665f76
-			    break;
665f76
-			}
665f76
-		}
665f76
-
665f76
-		if (inode_locked)
665f76
-			_stp_transport_unlock_relay_inode();
665f76
-	}
665f76
-#endif /* !STP_BULKMODE */
665f76
+	} while (len > 0);
665f76
 }
665f76
diff --git a/runtime/transport/relay_v2.c b/runtime/transport/relay_v2.c
665f76
index ff621f71d..2ba5eea7d 100644
665f76
--- a/runtime/transport/relay_v2.c
665f76
+++ b/runtime/transport/relay_v2.c
665f76
@@ -67,7 +67,7 @@ static size_t __stp_relay_switch_subbuf(struct rchan_buf *buf, size_t length)
665f76
 		return 0;
665f76
 
665f76
 	if (unlikely(length > buf->chan->subbuf_size))
665f76
-		goto toobig;
665f76
+		length = buf->chan->subbuf_size;
665f76
 
665f76
 	if (buf->offset != buf->chan->subbuf_size + 1) {
665f76
 		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
665f76
@@ -98,14 +98,7 @@ static size_t __stp_relay_switch_subbuf(struct rchan_buf *buf, size_t length)
665f76
 	buf->data = new;
665f76
 	buf->padding[new_subbuf] = 0;
665f76
 
665f76
-	if (unlikely(length + buf->offset > buf->chan->subbuf_size))
665f76
-		goto toobig;
665f76
-
665f76
 	return length;
665f76
-
665f76
-toobig:
665f76
-	buf->chan->last_toobig = length;
665f76
-	return 0;
665f76
 }
665f76
 
665f76
 static void __stp_relay_wakeup_readers(struct rchan_buf *buf)
665f76
@@ -117,24 +110,17 @@ static void __stp_relay_wakeup_readers(struct rchan_buf *buf)
665f76
 
665f76
 static void __stp_relay_wakeup_timer(stp_timer_callback_parameter_t unused)
665f76
 {
665f76
-#ifdef STP_BULKMODE
665f76
 	int i;
665f76
-#endif
665f76
 
665f76
 	if (atomic_read(&_stp_relay_data.wakeup)) {
665f76
 		struct rchan_buf *buf;
665f76
 		
665f76
 		atomic_set(&_stp_relay_data.wakeup, 0);
665f76
-#ifdef STP_BULKMODE
665f76
 		for_each_possible_cpu(i) {
665f76
 			buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
665f76
 						    i);
665f76
 			__stp_relay_wakeup_readers(buf);
665f76
 		}
665f76
-#else
665f76
-		buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
665f76
-		__stp_relay_wakeup_readers(buf);
665f76
-#endif
665f76
 	}
665f76
 
665f76
 	if (atomic_read(&_stp_relay_data.transport_state) == STP_TRANSPORT_RUNNING)
665f76
@@ -235,55 +221,8 @@ static void _stp_transport_data_fs_stop(void)
665f76
 		atomic_set (&_stp_relay_data.transport_state, STP_TRANSPORT_STOPPED);
665f76
 		del_timer_sync(&_stp_relay_data.timer);
665f76
 		dbug_trans(0, "flushing...\n");
665f76
-		if (_stp_relay_data.rchan) {
665f76
-			struct rchan_buf *buf;
665f76
-
665f76
-			/* NB we cannot call relay_flush() directly here since
665f76
-			 * we need to do inode locking ourselves.
665f76
-			 */
665f76
-
665f76
-#ifdef STP_BULKMODE
665f76
-			unsigned int i;
665f76
-			struct rchan *rchan = _stp_relay_data.rchan;
665f76
-
665f76
-			for_each_possible_cpu(i) {
665f76
-				buf = _stp_get_rchan_subbuf(rchan->buf, i);
665f76
-				if (buf) {
665f76
-					struct inode *inode = buf->dentry->d_inode;
665f76
-
665f76
-					/* NB we are in the syscall context which
665f76
-					 * allows sleeping. The following inode
665f76
-					 * locking might sleep. See PR26131. */
665f76
-					_stp_lock_inode(inode);
665f76
-
665f76
-					/* NB we intentionally avoids calling
665f76
-					 * our own __stp_relay_switch_subbuf()
665f76
-					 * since here we can sleep. */
665f76
-					relay_switch_subbuf(buf, 0);
665f76
-
665f76
-					_stp_unlock_inode(inode);
665f76
-				}
665f76
-			}
665f76
-#else  /* !STP_BULKMODE */
665f76
-			buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
665f76
-
665f76
-			if (buf != NULL) {
665f76
-				struct inode *inode = buf->dentry->d_inode;
665f76
-
665f76
-				/* NB we are in the syscall context which allows
665f76
-				 * sleeping. The following inode locking might
665f76
-				 * sleep. See PR26131. */
665f76
-				_stp_lock_inode(inode);
665f76
-
665f76
-				/* NB we intentionally avoids calling
665f76
-				 * our own __stp_relay_switch_subbuf()
665f76
-				 * since here we can sleep. */
665f76
-				relay_switch_subbuf(buf, 0);
665f76
-
665f76
-				_stp_unlock_inode(inode);
665f76
-			}
665f76
-#endif
665f76
-		}
665f76
+		if (_stp_relay_data.rchan)
665f76
+			relay_flush(_stp_relay_data.rchan);
665f76
 	}
665f76
 }
665f76
 
665f76
@@ -308,9 +247,7 @@ static int _stp_transport_data_fs_init(void)
665f76
 
665f76
 	/* Create "trace" file. */
665f76
 	npages = _stp_subbuf_size * _stp_nsubbufs;
665f76
-#ifdef STP_BULKMODE
665f76
 	npages *= num_online_cpus();
665f76
-#endif
665f76
 	npages >>= PAGE_SHIFT;
665f76
 	si_meminfo(&si);
665f76
 #define MB(i) (unsigned long)((i) >> (20 - PAGE_SHIFT))
665f76
@@ -347,9 +284,7 @@ static int _stp_transport_data_fs_init(void)
665f76
         {
665f76
                 u64 relay_mem;
665f76
                 relay_mem = _stp_subbuf_size * _stp_nsubbufs;
665f76
-#ifdef STP_BULKMODE
665f76
                 relay_mem *= num_online_cpus();
665f76
-#endif
665f76
                 _stp_allocated_net_memory += relay_mem;
665f76
                 _stp_allocated_memory += relay_mem;
665f76
         }
665f76
@@ -386,12 +321,7 @@ _stp_data_write_reserve(size_t size_request, void **entry)
665f76
 		return -EINVAL;
665f76
 
665f76
 	buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
665f76
-#ifdef STP_BULKMODE
665f76
-				    smp_processor_id()
665f76
-#else
665f76
-				    0
665f76
-#endif
665f76
-				    );
665f76
+				    smp_processor_id());
665f76
 	if (unlikely(buf->offset + size_request > buf->chan->subbuf_size)) {
665f76
 		size_request = __stp_relay_switch_subbuf(buf, size_request);
665f76
 		if (!size_request)
665f76
@@ -411,65 +341,10 @@ static unsigned char *_stp_data_entry_data(void *entry)
665f76
 
665f76
 static int _stp_data_write_commit(void *entry)
665f76
 {
665f76
-	/* Nothing to do here. */
665f76
-	return 0;
665f76
-}
665f76
-
665f76
-static noinline int _stp_transport_trylock_relay_inode(void)
665f76
-{
665f76
-	unsigned i;
665f76
 	struct rchan_buf *buf;
665f76
-	struct inode *inode;
665f76
-#ifdef DEBUG_TRANS
665f76
-	cycles_t begin;
665f76
-#endif
665f76
 
665f76
 	buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
665f76
-#ifdef STP_BULKMODE
665f76
-				    smp_processor_id()
665f76
-#else
665f76
-				    0
665f76
-#endif