commit 374d37118ae1274077a425261ef1428151eb6d7c
Author: Sultan Alsawaf <sultan@openresty.com>
Date:   Tue Nov 10 10:03:34 2020 -0800

    stp_utrace: disable IRQs when holding the bucket spin lock

    This lock can be acquired from inside an IRQ, leading to a deadlock:

    WARNING: inconsistent lock state
    4.14.35-1902.6.6.el7uek.x86_64.debug #2 Tainted: G           OE
    --------------------------------
    inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
    sh/15779 [HC1[1]:SC0[0]:HE0:SE1] takes:
     (&(lock)->rlock#3){?.+.}, at: [<ffffffffc0c080b0>] _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
    {HARDIRQ-ON-W} state was registered at:
      lock_acquire+0xe0/0x238
      _raw_spin_lock+0x3d/0x7a
      utrace_task_alloc+0xa4/0xe3 [orxray_lj_lua_fgraph_XXXXXXX]
      utrace_attach_task+0x136/0x194 [orxray_lj_lua_fgraph_XXXXXXX]
      __stp_utrace_attach+0x57/0x216 [orxray_lj_lua_fgraph_XXXXXXX]
      stap_start_task_finder+0x12e/0x33f [orxray_lj_lua_fgraph_XXXXXXX]
      systemtap_module_init+0x114d/0x11f0 [orxray_lj_lua_fgraph_XXXXXXX]
      _stp_handle_start+0xea/0x1c5 [orxray_lj_lua_fgraph_XXXXXXX]
      _stp_ctl_write_cmd+0x28d/0x2d1 [orxray_lj_lua_fgraph_XXXXXXX]
      full_proxy_write+0x67/0xbb
      __vfs_write+0x3a/0x170
      vfs_write+0xc7/0x1c0
      SyS_write+0x58/0xbf
      do_syscall_64+0x7e/0x22c
      entry_SYSCALL_64_after_hwframe+0x16e/0x0
    irq event stamp: 9454
    hardirqs last  enabled at (9453): [<ffffffffa696c960>] _raw_write_unlock_irqrestore+0x40/0x67
    hardirqs last disabled at (9454): [<ffffffffa6a05417>] apic_timer_interrupt+0x1c7/0x1d1
    softirqs last  enabled at (9202): [<ffffffffa6c00361>] __do_softirq+0x361/0x4e5
    softirqs last disabled at (9195): [<ffffffffa60aeb76>] irq_exit+0xf6/0x102

    other info that might help us debug this:
     Possible unsafe locking scenario:

           CPU0
           ----
      lock(&(lock)->rlock#3);
      <Interrupt>
        lock(&(lock)->rlock#3);

     *** DEADLOCK ***

    no locks held by sh/15779.

    stack backtrace:
    CPU: 16 PID: 15779 Comm: sh Tainted: G           OE   4.14.35-1902.6.6.el7uek.x86_64.debug #2
    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
    Call Trace:
     <IRQ>
     dump_stack+0x81/0xb6
     print_usage_bug+0x1fc/0x20d
     ? check_usage_backwards+0x130/0x12b
     mark_lock+0x1f8/0x27b
     __lock_acquire+0x6e7/0x165a
     ? sched_clock_local+0x18/0x81
     ? perf_swevent_hrtimer+0x136/0x151
     lock_acquire+0xe0/0x238
     ? _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
     _raw_spin_lock_irqsave+0x55/0x97
     ? _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_ctl_get_buffer+0x69/0x215 [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_ctl_send+0x4e/0x169 [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_vlog+0xac/0x143 [orxray_lj_lua_fgraph_XXXXXXX]
     ? _stp_utrace_probe_cb+0xa4/0xa4 [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_warn+0x6a/0x88 [orxray_lj_lua_fgraph_XXXXXXX]
     function___global_warn__overload_0+0x60/0xac [orxray_lj_lua_fgraph_XXXXXXX]
     probe_67+0xce/0x10e [orxray_lj_lua_fgraph_XXXXXXX]
     _stp_hrtimer_notify_function+0x2db/0x55f [orxray_lj_lua_fgraph_XXXXXXX]
     __hrtimer_run_queues+0x132/0x5c5
     hrtimer_interrupt+0xb7/0x1ca
     smp_apic_timer_interrupt+0xa5/0x35a
     apic_timer_interrupt+0x1cc/0x1d1
     </IRQ>

diff --git a/runtime/stp_utrace.c b/runtime/stp_utrace.c
index e2880f1e4..46ba48923 100644
--- a/runtime/stp_utrace.c
+++ b/runtime/stp_utrace.c
@@ -490,9 +490,9 @@ static int utrace_exit(void)
 		rcu_read_lock();
 		stap_hlist_for_each_entry_rcu(utrace, node, &bucket->head, hlist) {
 			utrace->freed = true;
-			stp_spin_lock(&bucket->lock);
+			stp_spin_lock_irqsave(&bucket->lock, flags);
 			hlist_del_rcu(&utrace->hlist);
-			stp_spin_unlock(&bucket->lock);
+			stp_spin_unlock_irqrestore(&bucket->lock, flags);
 
 			utrace_cleanup(utrace);
 		}
@@ -724,6 +724,7 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket,
 					struct task_struct *task)
 {
 	struct utrace *utrace;
+	unsigned long flags;
 
 	utrace = kmem_cache_zalloc(utrace_cachep, STP_ALLOC_FLAGS);
 	if (unlikely(!utrace))
@@ -739,9 +740,9 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket,
 	atomic_set(&utrace->resume_work_added, 0);
 	atomic_set(&utrace->report_work_added, 0);
 
-	stp_spin_lock(&bucket->lock);
+	stp_spin_lock_irqsave(&bucket->lock, flags);
 	hlist_add_head_rcu(&utrace->hlist, &bucket->head);
-	stp_spin_unlock(&bucket->lock);
+	stp_spin_unlock_irqrestore(&bucket->lock, flags);
 	return utrace;
 }
 
@@ -768,15 +769,17 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket,
  */
 static void utrace_free(struct utrace_bucket *bucket, struct utrace *utrace)
 {
+	unsigned long flags;
+
 	if (unlikely(!utrace))
 		return;
 
 	/* Remove this utrace from the mapping list of tasks to
 	 * struct utrace. */
 	utrace->freed = true;
-	stp_spin_lock(&bucket->lock);
+	stp_spin_lock_irqsave(&bucket->lock, flags);
 	hlist_del_rcu(&utrace->hlist);
-	stp_spin_unlock(&bucket->lock);
+	stp_spin_unlock_irqrestore(&bucket->lock, flags);
 
 	/* Free the utrace struct. */
 #ifdef STP_TF_DEBUG
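The change above follows the usual kernel idiom for a lock that is also taken from interrupt context. As a minimal, stand-alone sketch of that idiom (illustrative only: generic spinlock API and made-up names, not the stp_spin_* wrappers or SystemTap code used in the diff):

    #include <linux/spinlock.h>
    #include <linux/rculist.h>

    static DEFINE_SPINLOCK(bucket_lock);

    /* Called from both task context and hard-IRQ context, so the lock must be
     * taken with the _irqsave variant; a plain spin_lock() here would let an
     * interrupt on the same CPU re-enter and spin on the held lock forever. */
    static void bucket_add(struct hlist_head *head, struct hlist_node *node)
    {
    	unsigned long flags;

    	spin_lock_irqsave(&bucket_lock, flags);
    	hlist_add_head_rcu(node, head);
    	spin_unlock_irqrestore(&bucket_lock, flags);
    }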
commit 6a092f5ae824d4ce972c10b8681b9272e2fd67f3
Author: Sultan Alsawaf <sultan@openresty.com>
Date:   Tue Nov 17 11:03:53 2020 -0800

    task_finder: call _stp_vma_done() upon error to fix memory leak

    The memory allocated inside stap_initialize_vma_map() is not freed upon
    error when the task finder is started because a call to _stp_vma_done()
    in the error path is missing. Add it to fix the leak.

diff --git a/task_finder.cxx b/task_finder.cxx
index d08d44a75..7c45e728b 100644
--- a/task_finder.cxx
+++ b/task_finder.cxx
@@ -66,6 +66,7 @@ task_finder_derived_probe_group::emit_module_init (systemtap_session& s)
 
   s.op->newline() << "if (rc) {";
   s.op->newline(1) << "stap_stop_task_finder();";
+  s.op->newline() << "_stp_vma_done();";
   s.op->newline(-1) << "}";
   s.op->newline(-1) << "}";
 }
commit 3c4f82ca024df4f8e213f7c77418493262d4a4d7
Author: Sultan Alsawaf <sultan@openresty.com>
Date:   Tue Nov 24 10:50:10 2020 -0800

    runtime_context: factor out RCU usage using a rw lock

    We can factor out the RCU insanity in here by just adding in a rw lock
    and using that to synchronize _stp_runtime_contexts_free() with any code
    that has the runtime context held.

diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
index 41fecba81..18566957a 100644
--- a/runtime/linux/runtime_context.h
+++ b/runtime/linux/runtime_context.h
@@ -11,15 +11,14 @@
 #ifndef _LINUX_RUNTIME_CONTEXT_H_
 #define _LINUX_RUNTIME_CONTEXT_H_
 
-#ifndef __rcu
-#define __rcu
-#endif
-
-static struct context __rcu *contexts[NR_CPUS] = { NULL };
+/* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */
+static DEFINE_RWLOCK(_stp_context_lock);
+static DEFINE_PER_CPU(struct context *, contexts);
+static atomic_t _stp_context_stop = ATOMIC_INIT(0);
 
 static int _stp_runtime_contexts_alloc(void)
 {
-	int cpu;
+	unsigned int cpu;
 
 	for_each_possible_cpu(cpu) {
 		/* Module init, so in user context, safe to use
@@ -31,91 +30,67 @@ static int _stp_runtime_contexts_alloc(void)
 				    (unsigned long) sizeof (struct context));
 			return -ENOMEM;
 		}
-		rcu_assign_pointer(contexts[cpu], c);
+		per_cpu(contexts, cpu) = c;
 	}
 	return 0;
 }
 
 /* We should be free of all probes by this time, but for example the timer for
  * _stp_ctl_work_callback may still be running and looking for contexts.  We
- * use RCU-sched synchronization to be sure its safe to free them.  */
+ * use _stp_context_stop and a write lock to be sure its safe to free them.  */
 static void _stp_runtime_contexts_free(void)
 {
-	// Note that 'free_contexts' is static because it is
-	// (probably) too big to fit on a kernel function's stack.
-	static struct context *free_contexts[NR_CPUS] = { NULL };
-	int cpu;
+	unsigned long flags;
+	unsigned int cpu;
 
-	/* First, save all the pointers.  */
-	rcu_read_lock_sched();
-	for_each_possible_cpu(cpu) {
-		free_contexts[cpu] = rcu_dereference_sched(contexts[cpu]);
-	}
-	rcu_read_unlock_sched();
+	/* Sync to make sure existing readers are done */
+	atomic_set(&_stp_context_stop, 1);
+	write_lock_irqsave(&_stp_context_lock, flags);
+	write_unlock_irqrestore(&_stp_context_lock, flags);
 
-	/* Now clear all pointers to prevent new readers.  */
-	for_each_possible_cpu(cpu) {
-		rcu_assign_pointer(contexts[cpu], NULL);
-	}
-
-	/* Sync to make sure existing readers are done.  */
-	stp_synchronize_sched();
-
-	/* Now we can actually free the contexts.  */
-	for_each_possible_cpu(cpu) {
-		struct context *c = free_contexts[cpu];
-		if (c != NULL) {
-			free_contexts[cpu] = NULL;
-			_stp_vfree(c);
-		}
-	}
+	/* Now we can actually free the contexts */
+	for_each_possible_cpu(cpu)
+		_stp_vfree(per_cpu(contexts, cpu));
 }
 
 static inline struct context * _stp_runtime_get_context(void)
 {
-	// RHBZ1788662 rcu operations are rejected in idle-cpu contexts
-	// in effect: skip probe if it's in rcu-idle state
-#if defined(STAPCONF_RCU_IS_WATCHING) || LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) // linux commit #5c173eb8
-        if (! rcu_is_watching())
-		return 0;
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0) // linux commit #9b2e4f18
-        if (! rcu_is_cpu_idle())
-		return 0;
-#else
-	; // XXX older kernels didn't put tracepoints in idle-cpu
-#endif
-	return rcu_dereference_sched(contexts[smp_processor_id()]);
+	if (atomic_read(&_stp_context_stop))
+		return NULL;
+
+	return per_cpu(contexts, smp_processor_id());
 }
 
 static struct context * _stp_runtime_entryfn_get_context(void)
+	__acquires(&_stp_context_lock)
 {
 	struct context* __restrict__ c = NULL;
-	preempt_disable ();
+
+	if (!read_trylock(&_stp_context_lock))
+		return NULL;
+
 	c = _stp_runtime_get_context();
 	if (c != NULL) {
-		if (atomic_inc_return(&c->busy) == 1) {
-			// NB: Notice we're not re-enabling preemption
+		if (!atomic_cmpxchg(&c->busy, 0, 1)) {
+			// NB: Notice we're not releasing _stp_context_lock
 			// here. We exepect the calling code to call
 			// _stp_runtime_entryfn_get_context() and
 			// _stp_runtime_entryfn_put_context() as a
 			// pair.
 			return c;
 		}
-		atomic_dec(&c->busy);
 	}
-	preempt_enable_no_resched();
+	read_unlock(&_stp_context_lock);
 	return NULL;
 }
 
 static inline void _stp_runtime_entryfn_put_context(struct context *c)
+	__releases(&_stp_context_lock)
 {
 	if (c) {
-		if (c == _stp_runtime_get_context())
-			atomic_dec(&c->busy);
-		/* else, warn about bad state? */
-		preempt_enable_no_resched();
+		atomic_set(&c->busy, 0);
+		read_unlock(&_stp_context_lock);
 	}
-	return;
 }
 
 static void _stp_runtime_context_wait(void)
@@ -130,9 +105,13 @@ static void _stp_runtime_context_wait(void)
 		int i;
 
 		holdon = 0;
-		rcu_read_lock_sched();
+		read_lock(&_stp_context_lock);
+		if (atomic_read(&_stp_context_stop)) {
+			read_unlock(&_stp_context_lock);
+			break;
+		}
 		for_each_possible_cpu(i) {
-			struct context *c = rcu_dereference_sched(contexts[i]);
+			struct context *c = per_cpu(contexts, i);
 			if (c != NULL
 			    && atomic_read (& c->busy)) {
 				holdon = 1;
@@ -146,7 +125,7 @@ static void _stp_runtime_context_wait(void)
 				}
 			}
 		}
-		rcu_read_unlock_sched();
+		read_unlock(&_stp_context_lock);
 
 		/*
 		 * Just in case things are really really stuck, a
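A condensed sketch of the scheme this commit adopts (illustrative only; generic names, the real code is the runtime_context.h diff above): lookups hold the read lock for as long as the context is in use, and teardown flips a stop flag and cycles the write lock so it cannot free a context that a reader still holds.

    static DEFINE_RWLOCK(ctx_lock);
    static DEFINE_PER_CPU(void *, ctx_ptr);        /* vmalloc'd at init (omitted) */
    static atomic_t ctx_stop = ATOMIC_INIT(0);

    static void *ctx_get(void)                     /* caller pairs with read_unlock() */
    {
    	void *c = NULL;

    	if (read_trylock(&ctx_lock)) {             /* trylock: safe from IRQ context */
    		if (!atomic_read(&ctx_stop))
    			c = per_cpu(ctx_ptr, smp_processor_id());
    		if (!c)
    			read_unlock(&ctx_lock);
    	}
    	return c;
    }

    static void ctx_free_all(void)
    {
    	unsigned int cpu;

    	atomic_set(&ctx_stop, 1);                  /* new lookups now fail ...       */
    	write_lock(&ctx_lock);                     /* ... and in-flight readers drain */
    	write_unlock(&ctx_lock);
    	for_each_possible_cpu(cpu)
    		vfree(per_cpu(ctx_ptr, cpu));
    }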
commit bb25d64f7b6c98ef2fc8b711f19bd6271866d727
Author: Sultan Alsawaf <sultan@openresty.com>
Date:   Tue Dec 1 09:54:07 2020 -0800

    runtime_context: synchronize _stp_context_stop more strictly

    We're only reading _stp_context_stop while the read lock is held, so we
    can move the modification of it to inside the write lock to ensure
    strict memory ordering. As such, it no longer needs to be an atomic_t
    variable.

    We also don't need to disable IRQs when holding the write lock because
    only read_trylock is used from IRQ context, not read_lock, so there's no
    possibility of a deadlock occurring.

diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
index 18566957a..e716e6d39 100644
--- a/runtime/linux/runtime_context.h
+++ b/runtime/linux/runtime_context.h
@@ -14,7 +14,7 @@
 /* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */
 static DEFINE_RWLOCK(_stp_context_lock);
 static DEFINE_PER_CPU(struct context *, contexts);
-static atomic_t _stp_context_stop = ATOMIC_INIT(0);
+static bool _stp_context_stop;
 
 static int _stp_runtime_contexts_alloc(void)
 {
@@ -40,13 +40,12 @@ static int _stp_runtime_contexts_alloc(void)
  * use _stp_context_stop and a write lock to be sure its safe to free them.  */
 static void _stp_runtime_contexts_free(void)
 {
-	unsigned long flags;
 	unsigned int cpu;
 
 	/* Sync to make sure existing readers are done */
-	atomic_set(&_stp_context_stop, 1);
-	write_lock_irqsave(&_stp_context_lock, flags);
-	write_unlock_irqrestore(&_stp_context_lock, flags);
+	write_lock(&_stp_context_lock);
+	_stp_context_stop = true;
+	write_unlock(&_stp_context_lock);
 
 	/* Now we can actually free the contexts */
 	for_each_possible_cpu(cpu)
@@ -55,7 +54,7 @@ static void _stp_runtime_contexts_free(void)
 
 static inline struct context * _stp_runtime_get_context(void)
 {
-	if (atomic_read(&_stp_context_stop))
+	if (_stp_context_stop)
 		return NULL;
 
 	return per_cpu(contexts, smp_processor_id());
@@ -106,7 +105,7 @@ static void _stp_runtime_context_wait(void)
 
 		holdon = 0;
 		read_lock(&_stp_context_lock);
-		if (atomic_read(&_stp_context_stop)) {
+		if (_stp_context_stop) {
 			read_unlock(&_stp_context_lock);
 			break;
 		}
commit 0cc239e6f0fff79cb584fc857d3220402558db37
Author: Sultan Alsawaf <sultan@openresty.com>
Date:   Tue Dec 1 18:47:04 2020 -0800

    runtime_context: replace _stp_context_lock with an atomic variable

    We can't use any lock primitives here, such as spin locks or rw locks,
    because lock_acquire() has tracepoints inside of it. This can cause a
    deadlock, so we have to roll our own synchronization mechanism using an
    atomic variable.

diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
index e716e6d39..7dd240e1a 100644
--- a/runtime/linux/runtime_context.h
+++ b/runtime/linux/runtime_context.h
@@ -11,10 +11,9 @@
 #ifndef _LINUX_RUNTIME_CONTEXT_H_
 #define _LINUX_RUNTIME_CONTEXT_H_
 
-/* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */
-static DEFINE_RWLOCK(_stp_context_lock);
+/* Can't use a lock primitive for this because lock_acquire() has tracepoints */
+static atomic_t _stp_contexts_busy_ctr = ATOMIC_INIT(0);
 static DEFINE_PER_CPU(struct context *, contexts);
-static bool _stp_context_stop;
 
 static int _stp_runtime_contexts_alloc(void)
 {
@@ -37,15 +36,14 @@ static int _stp_runtime_contexts_alloc(void)
 
 /* We should be free of all probes by this time, but for example the timer for
  * _stp_ctl_work_callback may still be running and looking for contexts.  We
- * use _stp_context_stop and a write lock to be sure its safe to free them.  */
+ * use _stp_contexts_busy_ctr to be sure its safe to free them.  */
 static void _stp_runtime_contexts_free(void)
 {
 	unsigned int cpu;
 
 	/* Sync to make sure existing readers are done */
-	write_lock(&_stp_context_lock);
-	_stp_context_stop = true;
-	write_unlock(&_stp_context_lock);
+	while (atomic_cmpxchg(&_stp_contexts_busy_ctr, 0, INT_MAX))
+		cpu_relax();
 
 	/* Now we can actually free the contexts */
 	for_each_possible_cpu(cpu)
@@ -54,24 +52,20 @@ static void _stp_runtime_contexts_free(void)
 
 static inline struct context * _stp_runtime_get_context(void)
 {
-	if (_stp_context_stop)
-		return NULL;
-
 	return per_cpu(contexts, smp_processor_id());
 }
 
 static struct context * _stp_runtime_entryfn_get_context(void)
-	__acquires(&_stp_context_lock)
 {
 	struct context* __restrict__ c = NULL;
 
-	if (!read_trylock(&_stp_context_lock))
+	if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
 		return NULL;
 
 	c = _stp_runtime_get_context();
 	if (c != NULL) {
 		if (!atomic_cmpxchg(&c->busy, 0, 1)) {
-			// NB: Notice we're not releasing _stp_context_lock
+			// NB: Notice we're not releasing _stp_contexts_busy_ctr
 			// here. We exepect the calling code to call
 			// _stp_runtime_entryfn_get_context() and
 			// _stp_runtime_entryfn_put_context() as a
@@ -79,16 +73,15 @@ static struct context * _stp_runtime_entryfn_get_context(void)
 			return c;
 		}
 	}
-	read_unlock(&_stp_context_lock);
+	atomic_dec(&_stp_contexts_busy_ctr);
 	return NULL;
 }
 
 static inline void _stp_runtime_entryfn_put_context(struct context *c)
-	__releases(&_stp_context_lock)
 {
 	if (c) {
 		atomic_set(&c->busy, 0);
-		read_unlock(&_stp_context_lock);
+		atomic_dec(&_stp_contexts_busy_ctr);
 	}
 }
 
@@ -104,11 +97,9 @@ static void _stp_runtime_context_wait(void)
 		int i;
 
 		holdon = 0;
-		read_lock(&_stp_context_lock);
-		if (_stp_context_stop) {
-			read_unlock(&_stp_context_lock);
+		if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
 			break;
-		}
+
 		for_each_possible_cpu(i) {
 			struct context *c = per_cpu(contexts, i);
 			if (c != NULL
@@ -124,7 +115,7 @@ static void _stp_runtime_context_wait(void)
 				}
 			}
 		}
-		read_unlock(&_stp_context_lock);
+		atomic_dec(&_stp_contexts_busy_ctr);
 
 		/*
 		 * Just in case things are really really stuck, a
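The resulting primitive is essentially a hand-rolled reader/writer gate built from a single atomic counter, which avoids lock_acquire() and its tracepoints entirely. Sketched on its own it looks roughly like this (illustrative only, generic names):

    static atomic_t busy_ctr = ATOMIC_INIT(0);

    static bool gate_enter(void)               /* the "read lock"; never blocks */
    {
    	/* Fails only once teardown has latched the counter at INT_MAX. */
    	return atomic_add_unless(&busy_ctr, 1, INT_MAX);
    }

    static void gate_exit(void)
    {
    	atomic_dec(&busy_ctr);
    }

    static void gate_shut_down(void)           /* the "write lock"; teardown only */
    {
    	/* Succeeds only when no readers remain, then latches INT_MAX so
    	 * every later gate_enter() fails. */
    	while (atomic_cmpxchg(&busy_ctr, 0, INT_MAX))
    		cpu_relax();
    }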
commit fbab0ea35e6af0d6599c6de3708b24008bf03ae6
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 2 11:07:11 2020 -0800
b8e312
b8e312
    runtime_context: disable preempt while holding the context
b8e312
    
b8e312
    After the context lock was converted to an atomic in the previous
b8e312
    commit, the preempt disable logic disappeared. Add it back.
b8e312
b8e312
diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
b8e312
index 7dd240e1a..7a1532e54 100644
b8e312
--- a/runtime/linux/runtime_context.h
b8e312
+++ b/runtime/linux/runtime_context.h
b8e312
@@ -34,6 +34,24 @@ static int _stp_runtime_contexts_alloc(void)
b8e312
 	return 0;
b8e312
 }
b8e312
 
b8e312
+static bool _stp_runtime_context_trylock(void)
b8e312
+{
b8e312
+	bool locked;
b8e312
+
b8e312
+	preempt_disable();
b8e312
+	locked = atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX);
b8e312
+	if (!locked)
b8e312
+		preempt_enable_no_resched();
b8e312
+
b8e312
+	return locked;
b8e312
+}
b8e312
+
b8e312
+static void _stp_runtime_context_unlock(void)
b8e312
+{
b8e312
+	atomic_dec(&_stp_contexts_busy_ctr);
b8e312
+	preempt_enable_no_resched();
b8e312
+}
b8e312
+
b8e312
 /* We should be free of all probes by this time, but for example the timer for
b8e312
  * _stp_ctl_work_callback may still be running and looking for contexts.  We
b8e312
  * use _stp_contexts_busy_ctr to be sure its safe to free them.  */
b8e312
@@ -59,7 +77,7 @@ static struct context * _stp_runtime_entryfn_get_context(void)
b8e312
 {
b8e312
 	struct context* __restrict__ c = NULL;
b8e312
 
b8e312
-	if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
b8e312
+	if (!_stp_runtime_context_trylock())
b8e312
 		return NULL;
b8e312
 
b8e312
 	c = _stp_runtime_get_context();
b8e312
@@ -73,7 +91,7 @@ static struct context * _stp_runtime_entryfn_get_context(void)
b8e312
 			return c;
b8e312
 		}
b8e312
 	}
b8e312
-	atomic_dec(&_stp_contexts_busy_ctr);
b8e312
+	_stp_runtime_context_unlock();
b8e312
 	return NULL;
b8e312
 }
b8e312
 
b8e312
@@ -81,7 +99,7 @@ static inline void _stp_runtime_entryfn_put_context(struct context *c)
b8e312
 {
b8e312
 	if (c) {
b8e312
 		atomic_set(&c->busy, 0);
b8e312
-		atomic_dec(&_stp_contexts_busy_ctr);
b8e312
+		_stp_runtime_context_unlock();
b8e312
 	}
b8e312
 }
b8e312
 
b8e312
@@ -97,7 +115,7 @@ static void _stp_runtime_context_wait(void)
b8e312
 		int i;
b8e312
 
b8e312
 		holdon = 0;
b8e312
-		if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX))
b8e312
+		if (!_stp_runtime_context_trylock())
b8e312
 			break;
b8e312
 
b8e312
 		for_each_possible_cpu(i) {
b8e312
@@ -115,7 +133,7 @@ static void _stp_runtime_context_wait(void)
b8e312
 				}
b8e312
 			}
b8e312
 		}
b8e312
-		atomic_dec(&_stp_contexts_busy_ctr);
b8e312
+		_stp_runtime_context_unlock();
b8e312
 
b8e312
 		/*
b8e312
 		 * Just in case things are really really stuck, a
b8e312
commit aedc044d5d38cb2fa6144d0a3345d06847862f1b
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 2 11:27:47 2020 -0800
b8e312
b8e312
    task_finder_vma: add kfree_rcu() compat for old kernels
b8e312
    
b8e312
    Newer RHEL 6 kernels have kfree_rcu(), but older ones do not. Using
b8e312
    kfree_rcu() is beneficial because it lets the RCU subsystem know that
b8e312
    the queued RCU callback is low-priority, and can be deferred, hence why
b8e312
    we don't replace kfree_rcu() with call_rcu() outright. Luckily,
b8e312
    kfree_rcu() is a macro so we can just #ifdef with it.
b8e312
b8e312
diff --git a/runtime/task_finder_vma.c b/runtime/task_finder_vma.c
b8e312
index 7f0f6ed56..dc77a80f5 100644
b8e312
--- a/runtime/task_finder_vma.c
b8e312
+++ b/runtime/task_finder_vma.c
b8e312
@@ -87,6 +87,15 @@ __stp_tf_vma_new_entry(void)
b8e312
 	return entry;
b8e312
 }
b8e312
 
b8e312
+#ifndef kfree_rcu
b8e312
+static void __stp_tf_vma_free_entry(struct rcu_head *rcu)
b8e312
+{
b8e312
+	struct __stp_tf_vma_entry *entry = container_of(rcu, typeof(*entry), rcu);
b8e312
+
b8e312
+	kfree(entry);
b8e312
+}
b8e312
+#endif
b8e312
+
b8e312
 // __stp_tf_vma_put_entry(): Put a specified number of references on the entry.
b8e312
 static void
b8e312
 __stp_tf_vma_put_entry(struct __stp_tf_vma_bucket *bucket,
b8e312
@@ -106,7 +115,11 @@ __stp_tf_vma_put_entry(struct __stp_tf_vma_bucket *bucket,
b8e312
 	hlist_del_rcu(&entry->hlist);
b8e312
 	stp_spin_unlock_irqrestore(&bucket->lock, flags);
b8e312
 
b8e312
+#ifdef kfree_rcu
b8e312
 	kfree_rcu(entry, rcu);
b8e312
+#else
b8e312
+	call_rcu(&entry->rcu, __stp_tf_vma_free_entry);
b8e312
+#endif
b8e312
 }
b8e312
 
b8e312
 // stap_initialize_vma_map():  Initialize the free list.  Grabs the
b8e312
commit 6a27888b118b7a94650a68aae028957cdd5fb5f5
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 2 18:09:17 2020 -0800
b8e312
b8e312
    REVERTME: tapset-timers: work around on-the-fly deadlocks caused by mutex_trylock
b8e312
    
b8e312
    The following deadlock exists due to tracepoints existing inside a lock
b8e312
    that is used both inside probe context and outside probe context:
b8e312
     #0 [ffff88017f6d7a08] kvm_wait at ffffffff81079f5a
b8e312
     #1 [ffff88017f6d7a30] __pv_queued_spin_lock_slowpath at ffffffff8114f51e
b8e312
     #2 [ffff88017f6d7a70] queued_spin_lock_slowpath at ffffffff810e842b
b8e312
     #3 [ffff88017f6d7a80] mutex_trylock at ffffffff81882b1b
b8e312
     #4 [ffff88017f6d7ab8] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #5 [ffff88017f6d7ad8] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #6 [ffff88017f6d7b10] probe_7879 at ffffffffc0a98c85 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #7 [ffff88017f6d7b38] enter_real_tracepoint_probe_1543 at ffffffffc0c3b757 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #8 [ffff88017f6d7b70] enter_tracepoint_probe_1543 at ffffffffc09b117e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #9 [ffff88017f6d7b80] lock_acquire at ffffffff811460ba
b8e312
    #10 [ffff88017f6d7be8] mutex_trylock at ffffffff81882a27
b8e312
    #11 [ffff88017f6d7c20] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #12 [ffff88017f6d7c40] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #13 [ffff88017f6d7c78] _stp_vlog at ffffffffc09b8d32 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #14 [ffff88017f6d7cd8] _stp_dbug at ffffffffc09ba43b [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #15 [ffff88017f6d7d38] systemtap_module_refresh at ffffffffc09ba51d [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #16 [ffff88017f6d7d50] module_refresher at ffffffffc09ba53e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
    #17 [ffff88017f6d7d60] process_one_work at ffffffff810da9cc
b8e312
    #18 [ffff88017f6d7de8] worker_thread at ffffffff810dafe6
b8e312
    #19 [ffff88017f6d7e48] kthread at ffffffff810e44cf
b8e312
    #20 [ffff88017f6d7f50] ret_from_fork_nospec_begin at ffffffff818958dd
b8e312
    
b8e312
    Note the deadlock due to _stp_transport_trylock_relay_inode recursing
b8e312
    onto itself via mutex_trylock.
b8e312
    
b8e312
    This is a temporary fix for the issue until a proper patch is made to
b8e312
    remove the mutex_trylock from __stp_print_flush. This should be reverted
b8e312
    when that patch lands (it will have something to do with bulkmode).
b8e312
b8e312
diff --git a/tapset-timers.cxx b/tapset-timers.cxx
b8e312
index 10da17cda..503498c85 100644
b8e312
--- a/tapset-timers.cxx
b8e312
+++ b/tapset-timers.cxx
b8e312
@@ -391,11 +391,11 @@ hrtimer_derived_probe_group::emit_module_refresh (systemtap_session& s)
b8e312
   s.op->newline(+1) <<   "struct stap_hrtimer_probe* stp = &stap_hrtimer_probes[i];";
b8e312
   // timer disabled, but condition says enabled?
b8e312
   s.op->newline( 0) <<   "if (!stp->enabled && stp->probe->cond_enabled) {";
b8e312
-  s.op->newline(+1) <<     "dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
+  s.op->newline(+1) <<     "//dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
   s.op->newline( 0) <<     "_stp_hrtimer_start(stp);";
b8e312
   // timer enabled, but condition says disabled?
b8e312
   s.op->newline(-1) <<   "} else if (stp->enabled && !stp->probe->cond_enabled) {";
b8e312
-  s.op->newline(+1) <<     "dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
+  s.op->newline(+1) <<     "//dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
   s.op->newline( 0) <<     "_stp_hrtimer_cancel(stp);";
b8e312
   s.op->newline(-1) <<   "}";
b8e312
   s.op->newline( 0) <<   "stp->enabled = stp->probe->cond_enabled;";
b8e312
commit 7187dcf39412fcb25c432d318be8e49a6051f055
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Thu Dec 3 12:57:34 2020 -0800
b8e312
b8e312
    runtime: fix print races in IRQ context and during print cleanup
b8e312
    
b8e312
    Prints can race when there's a print called from IRQ context or a print
b8e312
    called while print cleanup takes place, which can lead to garbled print
b8e312
    messages, out-of-bounds memory accesses, and memory use-after-free. This
b8e312
    is one example of racy modification of the print buffer len in IRQ
b8e312
    context which caused a panic due to an out-of-bounds memory access:
b8e312
    
b8e312
    BUG: unable to handle kernel paging request at ffffe8ffff621000
b8e312
    IP: [<ffffffffc05da0f3>] _stp_vsprint_memory+0x83/0x950 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
    PGD 174b90067 PUD 174b8f067 PMD 174b93067 PTE 0
b8e312
    Oops: 0002 [#1] SMP
b8e312
    CPU: 12 PID: 3468 Comm: cat Kdump: loaded Tainted: G           OE  ------------   3.10.0-1127.19.1.el7.x86_64.debug #1
b8e312
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014
b8e312
    task: ffff88001f4f0000 ti: ffff88004ea5c000 task.ti: ffff88004ea5c000
b8e312
    RIP: 0010:[<ffffffffc05da0f3>]  [<ffffffffc05da0f3>] _stp_vsprint_memory+0x83/0x950 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
    RSP: 0018:ffff88004ea5f9a8  EFLAGS: 00010082
b8e312
    RAX: ffffe8ffff621001 RBX: ffffe8ffff620ff2 RCX: fffffffffffffffe
b8e312
    RDX: 000000000000006e RSI: ffffffffffffffff RDI: ffffc90002c23730
b8e312
    RBP: ffff88004ea5fa28 R08: 00000000ffffffff R09: 0000000000000073
b8e312
    R10: ffffc90002c243d7 R11: 0000000000000001 R12: ffffc90002c2373f
b8e312
    R13: ffffe8ffff621004 R14: 0000000000000012 R15: 00000000fffffffe
b8e312
    FS:  00007f8a9b1d4740(0000) GS:ffff880179e00000(0000) knlGS:0000000000000000
b8e312
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
b8e312
    CR2: ffffe8ffff621000 CR3: 00000000b3e3c000 CR4: 0000000000360fe0
b8e312
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
b8e312
    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
b8e312
    Call Trace:
b8e312
     [<ffffffff8103eb89>] ? sched_clock+0x9/0x10
b8e312
     [<ffffffff8114036f>] ? lock_release_holdtime.part.30+0xf/0x1a0
b8e312
     [<ffffffffc05dcb80>] function___global_trace__overload_0+0x5b0/0x1220 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
     [<ffffffffc05d8993>] ? stp_lock_probe+0x53/0xe0 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
     [<ffffffff8188d879>] ? kretprobe_trampoline_holder+0x9/0x9
b8e312
     [<ffffffffc05e0662>] probe_7118+0x82/0xe0 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
     [<ffffffffc05de866>] enter_kretprobe_common+0x256/0x490 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
     [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
b8e312
     [<ffffffffc05dead0>] enter_kretprobe_probe+0x10/0x20 [stap_2c44636dfda18135ca3012a752599da6_13_533]
b8e312
     [<ffffffff8188e1d8>] trampoline_handler+0x148/0x220
b8e312
     [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
b8e312
     [<ffffffff8188d89e>] kretprobe_trampoline+0x25/0x57
b8e312
     [<ffffffff813489f1>] ? proc_sys_open+0x51/0x60
b8e312
     [<ffffffff8188d879>] kretprobe_trampoline_holder+0x9/0x9
b8e312
     [<ffffffff81384702>] ? security_inode_permission+0x22/0x30
b8e312
     [<ffffffff813489a0>] ? sysctl_head_finish+0x50/0x50
b8e312
     [<ffffffff812ac11d>] vfs_open+0x5d/0xb0
b8e312
     [<ffffffff812bb74a>] ? may_open+0x5a/0x120
b8e312
     [<ffffffff812c0af5>] do_last+0x285/0x15b0
b8e312
     [<ffffffff812bf18e>] ? link_path_walk+0x27e/0x8c0
b8e312
     [<ffffffff812c1ef0>] path_openat+0xd0/0x5d0
b8e312
     [<ffffffff8107a7f3>] ? kvm_clock_read+0x33/0x40
b8e312
     [<ffffffff812c38ad>] do_filp_open+0x4d/0xb0
b8e312
     [<ffffffff81889497>] ? _raw_spin_unlock+0x27/0x40
b8e312
     [<ffffffff812d5a9b>] ? __alloc_fd+0xfb/0x270
b8e312
     [<ffffffff812ad784>] do_sys_open+0x124/0x220
b8e312
     [<ffffffff812ad89e>] SyS_open+0x1e/0x20
b8e312
     [<ffffffff8188d879>] kretprobe_trampoline_holder+0x9/0x9
b8e312
    
b8e312
    This patch resolves the IRQ print races by disabling IRQs on the local
b8e312
    CPU when accessing said CPU's print buffer, and resolves the cleanup
b8e312
    races with a lock. We also protect against data corruption and panics
b8e312
    from prints inside NMIs now by checking if the current CPU was accessing
b8e312
    the log buffer when an NMI fired; in this case, the NMI's prints will be
b8e312
    dropped, as there is no way to safely service them without creating a
b8e312
    dedicated log buffer for them. This is achieved by forbidding reentrancy
b8e312
    with respect to _stp_print_trylock_irqsave() when the runtime context
b8e312
    isn't held. Reentrancy is otherwise allowed when the runtime context is
b8e312
    held because the runtime context provides reentrancy protection.
b8e312
b8e312
diff --git a/runtime/linux/io.c b/runtime/linux/io.c
b8e312
index 74a032c52..122708e2a 100644
b8e312
--- a/runtime/linux/io.c
b8e312
+++ b/runtime/linux/io.c
b8e312
@@ -20,9 +20,6 @@
b8e312
 
b8e312
 #define WARN_STRING "WARNING: "
b8e312
 #define ERR_STRING "ERROR: "
b8e312
-#if (STP_LOG_BUF_LEN < 10) /* sizeof(WARN_STRING) */
b8e312
-#error "STP_LOG_BUF_LEN is too short"
b8e312
-#endif
b8e312
 
b8e312
 enum code { INFO=0, WARN, ERROR, DBUG };
b8e312
 
b8e312
@@ -31,25 +28,37 @@ static void _stp_vlog (enum code type, const char *func, int line, const char *f
b8e312
 
b8e312
 static void _stp_vlog (enum code type, const char *func, int line, const char *fmt, va_list args)
b8e312
 {
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
+	size_t bytes_avail;
b8e312
 	int num;
b8e312
-	char *buf = per_cpu_ptr(Stp_lbuf, get_cpu());
b8e312
+	char *buf;
b8e312
 	int start = 0;
b8e312
 
b8e312
+	if (!_stp_print_trylock_irqsave(&flags))
b8e312
+		return;
b8e312
+
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	bytes_avail = STP_BUFFER_SIZE - log->len;
b8e312
+	if (unlikely(!bytes_avail))
b8e312
+		goto err_unlock;
b8e312
+
b8e312
+	buf = &log->buf[log->len];
b8e312
 	if (type == DBUG) {
b8e312
-		start = _stp_snprintf(buf, STP_LOG_BUF_LEN, "%s:%d: ", func, line);
b8e312
+		start = _stp_snprintf(buf, bytes_avail, "%s:%d: ", func, line);
b8e312
 	} else if (type == WARN) {
b8e312
-		/* This strcpy() is OK, since we know STP_LOG_BUF_LEN
b8e312
-		 * is > sizeof(WARN_STRING). */
b8e312
-		strcpy (buf, WARN_STRING);
b8e312
-		start = sizeof(WARN_STRING) - 1;
b8e312
+		strncpy(buf, WARN_STRING, bytes_avail);
b8e312
+		start = min(bytes_avail, sizeof(WARN_STRING) - 1);
b8e312
 	} else if (type == ERROR) {
b8e312
-		/* This strcpy() is OK, since we know STP_LOG_BUF_LEN
b8e312
-		 * is > sizeof(ERR_STRING) (which is < sizeof(WARN_STRING). */
b8e312
-		strcpy (buf, ERR_STRING);
b8e312
-		start = sizeof(ERR_STRING) - 1;
b8e312
+		strncpy(buf, ERR_STRING, bytes_avail);
b8e312
+		start = min(bytes_avail, sizeof(ERR_STRING) - 1);
b8e312
 	}
b8e312
 
b8e312
-	num = vscnprintf (buf + start, STP_LOG_BUF_LEN - start - 1, fmt, args);
b8e312
+	bytes_avail -= start;
b8e312
+	if (unlikely(!bytes_avail))
b8e312
+		goto err_unlock;
b8e312
+
b8e312
+	num = vscnprintf(buf + start, bytes_avail - 1, fmt, args);
b8e312
 	if (num + start) {
b8e312
 		if (buf[num + start - 1] != '\n') {
b8e312
 			buf[num + start] = '\n';
b8e312
@@ -66,12 +75,13 @@ static void _stp_vlog (enum code type, const char *func, int line, const char *f
b8e312
 		if (type != DBUG) {
b8e312
 			_stp_ctl_send(STP_OOB_DATA, buf, start + num + 1);
b8e312
 		} else {
b8e312
-			_stp_print(buf);
b8e312
-			_stp_print_flush();
b8e312
+			log->len += start + num;
b8e312
+			__stp_print_flush(log);
b8e312
 		}
b8e312
 #endif
b8e312
 	}
b8e312
-	put_cpu();
b8e312
+err_unlock:
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 /** Prints warning.
b8e312
diff --git a/runtime/linux/print.c b/runtime/linux/print.c
b8e312
index 777bca8b0..2aa2f1c8d 100644
b8e312
--- a/runtime/linux/print.c
b8e312
+++ b/runtime/linux/print.c
b8e312
@@ -35,84 +35,179 @@
b8e312
  * @{
b8e312
  */
b8e312
 
b8e312
-typedef struct __stp_pbuf {
b8e312
-	uint32_t len;			/* bytes used in the buffer */
b8e312
+struct _stp_log {
b8e312
+	unsigned int len; /* Bytes used in the buffer */
b8e312
 	char buf[STP_BUFFER_SIZE];
b8e312
-} _stp_pbuf;
b8e312
+	atomic_t reentrancy_lock;
b8e312
+};
b8e312
+#include "print_flush.c"
b8e312
 
b8e312
-static void *Stp_pbuf = NULL;
b8e312
+static struct _stp_log *_stp_log_pcpu;
b8e312
+
b8e312
+/*
b8e312
+ * An atomic counter is used to synchronize every possible print buffer usage
b8e312
+ * with the _stp_print_cleanup() function. The cleanup function sets the counter
b8e312
+ * to INT_MAX after waiting for everything using the print buffer to finish. We
b8e312
+ * cannot use a lock primitive to implement this because lock_acquire() contains
b8e312
+ * tracepoints and print statements are used both inside and outside of probes.
b8e312
+ * If the lock were only used inside probes, the runtime context would protect
b8e312
+ * us from recursing into the lock_acquire() tracepoints and deadlocking. We
b8e312
+ * instead use _stp_print_ctr as if it were a read-write lock.
b8e312
+ */
b8e312
+static atomic_t _stp_print_ctr = ATOMIC_INIT(0);
b8e312
 
b8e312
-/** private buffer for _stp_vlog() */
b8e312
-#ifndef STP_LOG_BUF_LEN
b8e312
-#define STP_LOG_BUF_LEN 256
b8e312
-#endif
b8e312
+/*
b8e312
+ * This disables IRQs to make per-CPU print buffer accesses atomic. There is a
b8e312
+ * reentrancy protection mechanism specifically for NMIs, since they can violate
b8e312
+ * our atomic guarantee. Reentrancy is otherwise allowed within code sections
b8e312
+ * that have the runtime context held (via _stp_runtime_entryfn_get_context()).
b8e312
+ */
b8e312
+static bool _stp_print_trylock_irqsave(unsigned long *flags)
b8e312
+{
b8e312
+	bool context_held = false;
b8e312
+	struct _stp_log *log;
b8e312
+
b8e312
+	local_irq_save(*flags);
b8e312
+	if (!atomic_add_unless(&_stp_print_ctr, 1, INT_MAX))
b8e312
+		goto irq_restore;
b8e312
+
b8e312
+	/*
b8e312
+	 * Check the per-CPU reentrancy lock for contention, unless the runtime
b8e312
+	 * context is already held, in which case we already have reentrancy
b8e312
+	 * protection. Otherwise, if the reentrancy lock is contented, that
b8e312
+	 * means we're either inside an NMI that fired while the current CPU was
b8e312
+	 * accessing the log buffer, or something is trying to nest calls to
b8e312
+	 * _stp_print_trylock_irqsave(). Our only choice is to reject the log
b8e312
+	 * access attempt in this case because log buffer corruption and panics
b8e312
+	 * could ensue if we're inside an NMI.
b8e312
+	 */
b8e312
+	if (_stp_runtime_context_trylock()) {
b8e312
+		struct context *c = _stp_runtime_get_context();
b8e312
+		context_held = c && atomic_read(&c->busy);
b8e312
+		_stp_runtime_context_unlock();
b8e312
+	}
b8e312
 
b8e312
-typedef char _stp_lbuf[STP_LOG_BUF_LEN];
b8e312
-static void *Stp_lbuf = NULL;
b8e312
+	/* Fall back onto the reentrancy lock if the context isn't held */
b8e312
+	if (!context_held) {
b8e312
+		log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+		if (atomic_cmpxchg(&log->reentrancy_lock, 0, 1))
b8e312
+			goto print_unlock;
b8e312
+	}
b8e312
+
b8e312
+	return true;
b8e312
+
b8e312
+print_unlock:
b8e312
+	atomic_dec(&_stp_print_ctr);
b8e312
+irq_restore:
b8e312
+	local_irq_restore(*flags);
b8e312
+	return false;
b8e312
+}
b8e312
+
b8e312
+static void _stp_print_unlock_irqrestore(unsigned long *flags)
b8e312
+{
b8e312
+	bool context_held = false;
b8e312
+	struct _stp_log *log;
b8e312
+
b8e312
+	if (_stp_runtime_context_trylock()) {
b8e312
+		struct context *c = _stp_runtime_get_context();
b8e312
+		context_held = c && atomic_read(&c->busy);
b8e312
+		_stp_runtime_context_unlock();
b8e312
+	}
b8e312
+
b8e312
+	if (!context_held) {
b8e312
+		log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+		atomic_set(&log->reentrancy_lock, 0);
b8e312
+	}
b8e312
+
b8e312
+	atomic_dec(&_stp_print_ctr);
b8e312
+	local_irq_restore(*flags);
b8e312
+}
b8e312
 
b8e312
 /* create percpu print and io buffers */
b8e312
 static int _stp_print_init (void)
b8e312
 {
b8e312
-	Stp_pbuf = _stp_alloc_percpu(sizeof(_stp_pbuf));
b8e312
-	if (unlikely(Stp_pbuf == 0))
b8e312
-		return -1;
b8e312
-
b8e312
-	/* now initialize IO buffer used in io.c */
b8e312
-	Stp_lbuf = _stp_alloc_percpu(sizeof(_stp_lbuf));
b8e312
-	if (unlikely(Stp_lbuf == 0)) {
b8e312
-		_stp_free_percpu(Stp_pbuf);
b8e312
-		return -1;
b8e312
+	unsigned int cpu;
b8e312
+
b8e312
+	_stp_log_pcpu = _stp_alloc_percpu(sizeof(*_stp_log_pcpu));
b8e312
+	if (!_stp_log_pcpu)
b8e312
+		return -ENOMEM;
b8e312
+
b8e312
+	for_each_possible_cpu(cpu) {
b8e312
+		struct _stp_log *log = per_cpu_ptr(_stp_log_pcpu, cpu);
b8e312
+
b8e312
+		log->reentrancy_lock = (atomic_t)ATOMIC_INIT(0);
b8e312
 	}
b8e312
 	return 0;
b8e312
 }
b8e312
 
b8e312
 static void _stp_print_cleanup (void)
b8e312
 {
b8e312
-	if (Stp_pbuf)
b8e312
-		_stp_free_percpu(Stp_pbuf);
b8e312
-	if (Stp_lbuf)
b8e312
-		_stp_free_percpu(Stp_lbuf);
b8e312
-}
b8e312
+	unsigned int cpu;
b8e312
 
b8e312
-#include "print_flush.c"
b8e312
+	/* Wait for the loggers to finish modifying the print buffers */
b8e312
+	while (atomic_cmpxchg(&_stp_print_ctr, 0, INT_MAX))
b8e312
+		cpu_relax();
b8e312
+
b8e312
+	for_each_possible_cpu(cpu) {
b8e312
+		struct _stp_log *log = per_cpu_ptr(_stp_log_pcpu, cpu);
b8e312
+
b8e312
+		/*
b8e312
+		 * Flush anything that could be left in the print buffer. It is
b8e312
+		 * safe to do this without any kind of synchronization mechanism
b8e312
+		 * because nothing is using this print buffer anymore.
b8e312
+		 */
b8e312
+		__stp_print_flush(log);
b8e312
+	}
b8e312
+
b8e312
+	_stp_free_percpu(_stp_log_pcpu);
b8e312
+}
b8e312
 
b8e312
 static inline void _stp_print_flush(void)
b8e312
 {
b8e312
-	stp_print_flush(per_cpu_ptr(Stp_pbuf, smp_processor_id()));
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
+
b8e312
+	if (!_stp_print_trylock_irqsave(&flags))
b8e312
+		return;
b8e312
+
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	__stp_print_flush(log);
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
+
b8e312
 #ifndef STP_MAXBINARYARGS
b8e312
 #define STP_MAXBINARYARGS 127
b8e312
 #endif
b8e312
 
b8e312
 
b8e312
-/** Reserves space in the output buffer for direct I/O.
b8e312
+/** Reserves space in the output buffer for direct I/O. Must be called with
b8e312
+ * _stp_print_trylock_irqsave() held.
b8e312
  */
b8e312
 static void * _stp_reserve_bytes (int numbytes)
b8e312
 {
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-	int size = STP_BUFFER_SIZE - pb->len;
b8e312
-	void * ret;
b8e312
+	struct _stp_log *log;
b8e312
+	char *ret;
b8e312
 
b8e312
 	if (unlikely(numbytes == 0 || numbytes > STP_BUFFER_SIZE))
b8e312
 		return NULL;
b8e312
 
b8e312
-	if (unlikely(numbytes > size))
b8e312
-		_stp_print_flush();
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	if (unlikely(numbytes > (STP_BUFFER_SIZE - log->len)))
b8e312
+		__stp_print_flush(log);
b8e312
 
b8e312
-	ret = pb->buf + pb->len;
b8e312
-	pb->len += numbytes;
b8e312
+	ret = &log->buf[log->len];
b8e312
+	log->len += numbytes;
b8e312
 	return ret;
b8e312
 }
b8e312
 
b8e312
 
b8e312
 static void _stp_unreserve_bytes (int numbytes)
b8e312
 {
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-
b8e312
-	if (unlikely(numbytes == 0 || numbytes > pb->len))
b8e312
-		return;
b8e312
+	struct _stp_log *log;
b8e312
 
b8e312
-	pb->len -= numbytes;
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	if (numbytes <= log->len)
b8e312
+		log->len -= numbytes;
b8e312
 }
b8e312
 
b8e312
 /** Write 64-bit args directly into the output stream.
b8e312
@@ -123,22 +218,25 @@ static void _stp_unreserve_bytes (int numbytes)
b8e312
  */
b8e312
 static void _stp_print_binary (int num, ...)
b8e312
 {
b8e312
+	unsigned long flags;
b8e312
 	va_list vargs;
b8e312
 	int i;
b8e312
 	int64_t *args;
b8e312
-	
b8e312
+
b8e312
 	if (unlikely(num > STP_MAXBINARYARGS))
b8e312
 		num = STP_MAXBINARYARGS;
b8e312
 
b8e312
-	args = _stp_reserve_bytes(num * sizeof(int64_t));
b8e312
+	if (!_stp_print_trylock_irqsave(&flags))
b8e312
+		return;
b8e312
 
b8e312
-	if (likely(args != NULL)) {
b8e312
+	args = _stp_reserve_bytes(num * sizeof(int64_t));
b8e312
+	if (args) {
b8e312
 		va_start(vargs, num);
b8e312
-		for (i = 0; i < num; i++) {
b8e312
+		for (i = 0; i < num; i++)
b8e312
 			args[i] = va_arg(vargs, int64_t);
b8e312
-		}
b8e312
 		va_end(vargs);
b8e312
 	}
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 /** Print into the print buffer.
b8e312
@@ -149,6 +247,7 @@ static void _stp_print_binary (int num, ...)
b8e312
 static void _stp_printf (const char *fmt, ...)
b8e312
 {
b8e312
 	va_list args;
b8e312
+
b8e312
 	va_start(args, fmt);
b8e312
 	_stp_vsnprintf(NULL, 0, fmt, args);
b8e312
 	va_end(args);
b8e312
@@ -160,37 +259,36 @@ static void _stp_printf (const char *fmt, ...)
b8e312
 
b8e312
 static void _stp_print (const char *str)
b8e312
 {
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-	char *end = pb->buf + STP_BUFFER_SIZE;
b8e312
-	char *ptr = pb->buf + pb->len;
b8e312
-	char *instr = (char *)str;
b8e312
-
b8e312
-	while (ptr < end && *instr)
b8e312
-		*ptr++ = *instr++;
b8e312
-
b8e312
-	/* Did loop terminate due to lack of buffer space? */
b8e312
-	if (unlikely(*instr)) {
b8e312
-		/* Don't break strings across subbufs. */
b8e312
-		/* Restart after flushing. */
b8e312
-		_stp_print_flush();
b8e312
-		end = pb->buf + STP_BUFFER_SIZE;
b8e312
-		ptr = pb->buf + pb->len;
b8e312
-		instr = (char *)str;
b8e312
-		while (ptr < end && *instr)
b8e312
-			*ptr++ = *instr++;
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
+
b8e312
+	if (!_stp_print_trylock_irqsave(&flags))
b8e312
+		return;
b8e312
+
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	while (1) {
b8e312
+		while (log->len < STP_BUFFER_SIZE && *str)
b8e312
+			log->buf[log->len++] = *str++;
b8e312
+		if (likely(!*str))
b8e312
+			break;
b8e312
+		__stp_print_flush(log);
b8e312
 	}
b8e312
-	pb->len = ptr - pb->buf;
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 static void _stp_print_char (const char c)
b8e312
 {
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-	int size = STP_BUFFER_SIZE - pb->len;
b8e312
-	if (unlikely(1 >= size))
b8e312
-		_stp_print_flush();
b8e312
-	
b8e312
-	pb->buf[pb->len] = c;
b8e312
-	pb->len ++;
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
+
b8e312
+	if (!_stp_print_trylock_irqsave(&flags))
b8e312
+		return;
b8e312
+
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	if (unlikely(log->len == STP_BUFFER_SIZE))
b8e312
+		__stp_print_flush(log);
b8e312
+	log->buf[log->len++] = c;
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 static void _stp_print_kernel_info(char *sname, char *vstr, int ctx, int num_probes)
b8e312
diff --git a/runtime/print.h b/runtime/print.h
b8e312
index ede71f033..ffdea594d 100644
b8e312
--- a/runtime/print.h
b8e312
+++ b/runtime/print.h
b8e312
@@ -10,6 +10,9 @@
b8e312
 #ifndef _STP_PRINT_H_
b8e312
 #define _STP_PRINT_H_
b8e312
 
b8e312
+/* The lock must be held with IRQs disabled to do any printing */
b8e312
+static bool _stp_print_trylock_irqsave(unsigned long *flags);
b8e312
+static void _stp_print_unlock_irqrestore(unsigned long *flags);
b8e312
 static int _stp_print_init(void);
b8e312
 static void _stp_print_cleanup(void);
b8e312
 static void *_stp_reserve_bytes(int numbytes);
b8e312
diff --git a/runtime/print_flush.c b/runtime/print_flush.c
b8e312
index cf40a2645..acd6a32d9 100644
b8e312
--- a/runtime/print_flush.c
b8e312
+++ b/runtime/print_flush.c
b8e312
@@ -13,40 +13,31 @@
b8e312
  * is filled, or this is called. This MUST be called before returning
b8e312
  * from a probe or accumulated output in the print buffer will be lost.
b8e312
  *
b8e312
- * @note Preemption must be disabled to use this.
b8e312
+ * @note Interrupts must be disabled to use this.
b8e312
  */
b8e312
 
b8e312
-static STP_DEFINE_SPINLOCK(_stp_print_lock);
b8e312
-
b8e312
-void stp_print_flush(_stp_pbuf *pb)
b8e312
+static void __stp_print_flush(struct _stp_log *log)
b8e312
 {
b8e312
-	size_t len = pb->len;
b8e312
+	size_t len = log->len;
b8e312
 	void *entry = NULL;
b8e312
 
b8e312
 	/* check to see if there is anything in the buffer */
b8e312
 	if (likely(len == 0))
b8e312
 		return;
b8e312
 
b8e312
-	pb->len = 0;
b8e312
-
b8e312
-	if (unlikely(_stp_transport_get_state() != STP_TRANSPORT_RUNNING))
b8e312
-		return;
b8e312
+	log->len = 0;
b8e312
 
b8e312
 	dbug_trans(1, "len = %zu\n", len);
b8e312
 
b8e312
 #ifdef STP_BULKMODE
b8e312
 #ifdef NO_PERCPU_HEADERS
b8e312
 	{
b8e312
-		struct context* __restrict__ c = NULL;
b8e312
-		char *bufp = pb->buf;
b8e312
+		char *bufp = log->buf;
b8e312
 		int inode_locked;
b8e312
 
b8e312
-		c = _stp_runtime_entryfn_get_context();
b8e312
-
b8e312
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
 			atomic_inc (&_stp_transport_failures);
b8e312
 #ifndef STP_TRANSPORT_RISKY
b8e312
-			_stp_runtime_entryfn_put_context(c);
b8e312
 			return;
b8e312
 #endif
b8e312
 		}
b8e312
@@ -70,26 +61,20 @@ void stp_print_flush(_stp_pbuf *pb)
b8e312
 
b8e312
 		if (inode_locked)
b8e312
 			_stp_transport_unlock_relay_inode();
b8e312
-
b8e312
-		_stp_runtime_entryfn_put_context(c);
b8e312
 	}
b8e312
 
b8e312
 #else  /* !NO_PERCPU_HEADERS */
b8e312
 
b8e312
 	{
b8e312
-		struct context* __restrict__ c = NULL;
b8e312
-		char *bufp = pb->buf;
b8e312
+		char *bufp = log->buf;
b8e312
 		struct _stp_trace t = {	.sequence = _stp_seq_inc(),
b8e312
 					.pdu_len = len};
b8e312
 		size_t bytes_reserved;
b8e312
 		int inode_locked;
b8e312
 
b8e312
-		c = _stp_runtime_entryfn_get_context();
b8e312
-
b8e312
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
 			atomic_inc (&_stp_transport_failures);
b8e312
 #ifndef STP_TRANSPORT_RISKY
b8e312
-			_stp_runtime_entryfn_put_context(c);
b8e312
 			return;
b8e312
 #endif
b8e312
 		}
b8e312
@@ -124,48 +109,24 @@ void stp_print_flush(_stp_pbuf *pb)
b8e312
 
b8e312
 		if (inode_locked)
b8e312
 			_stp_transport_unlock_relay_inode();
b8e312
-
b8e312
-		_stp_runtime_entryfn_put_context(c);
b8e312
 	}
b8e312
 #endif /* !NO_PERCPU_HEADERS */
b8e312
 
b8e312
 #else  /* !STP_BULKMODE */
b8e312
 
b8e312
 	{
b8e312
-		unsigned long flags;
b8e312
-		struct context* __restrict__ c = NULL;
b8e312
-		char *bufp = pb->buf;
b8e312
+		char *bufp = log->buf;
b8e312
 		int inode_locked;
b8e312
 
b8e312
-		/* Prevent probe reentrancy on _stp_print_lock.
b8e312
-		 *
b8e312
-		 * Since stp_print_flush may be called from probe context, we
b8e312
-		 * have to make sure that its lock, _stp_print_lock, can't
b8e312
-		 * possibly be held outside probe context too.  We ensure this
b8e312
-		 * by grabbing the context here, so any probe triggered by this
b8e312
-		 * region will appear reentrant and be skipped rather than
b8e312
-		 * deadlock.  Failure to get_context just means we're already
b8e312
-		 * in a probe, which is fine.
b8e312
-		 *
b8e312
-		 * (see also _stp_ctl_send for a similar situation)
b8e312
-                 *
b8e312
-                 * A better solution would be to replace this
b8e312
-                 * concurrency-control-laden effort with a lockless
b8e312
-                 * algorithm.
b8e312
-		 */
b8e312
-		c = _stp_runtime_entryfn_get_context();
b8e312
-
b8e312
 		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
 			atomic_inc (&_stp_transport_failures);
b8e312
 #ifndef STP_TRANSPORT_RISKY
b8e312
 			dbug_trans(0, "discarding %zu bytes of data\n", len);
b8e312
-			_stp_runtime_entryfn_put_context(c);
b8e312
 			return;
b8e312
 #endif
b8e312
 		}
b8e312
 
b8e312
 		dbug_trans(1, "calling _stp_data_write...\n");
b8e312
-		stp_spin_lock_irqsave(&_stp_print_lock, flags);
b8e312
 		while (len > 0) {
b8e312
 			size_t bytes_reserved;
b8e312
 
b8e312
@@ -182,12 +143,9 @@ void stp_print_flush(_stp_pbuf *pb)
b8e312
 			    break;
b8e312
 			}
b8e312
 		}
b8e312
-		stp_spin_unlock_irqrestore(&_stp_print_lock, flags);
b8e312
 
b8e312
 		if (inode_locked)
b8e312
 			_stp_transport_unlock_relay_inode();
b8e312
-
b8e312
-		_stp_runtime_entryfn_put_context(c);
b8e312
 	}
b8e312
 #endif /* !STP_BULKMODE */
b8e312
 }
b8e312
diff --git a/runtime/stack.c b/runtime/stack.c
b8e312
index 241ccf793..da23d4395 100644
b8e312
--- a/runtime/stack.c
b8e312
+++ b/runtime/stack.c
b8e312
@@ -690,13 +690,20 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c,
b8e312
 	 * then call _stp_stack_print,
b8e312
 	 * then copy the result into the output string
b8e312
 	 * and clear the print buffer. */
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-	_stp_print_flush();
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
 
b8e312
-	_stp_stack_kernel_print(c, sym_flags);
b8e312
+	if (!_stp_print_trylock_irqsave(&flags)) {
b8e312
+		*str = '\0';
b8e312
+		return;
b8e312
+	}
b8e312
 
b8e312
-	strlcpy(str, pb->buf, size < (int)pb->len ? size : (int)pb->len);
b8e312
-	pb->len = 0;
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	__stp_print_flush(log);
b8e312
+	_stp_stack_kernel_print(c, sym_flags);
b8e312
+	strlcpy(str, log->buf, min_t(int, size, log->len));
b8e312
+	log->len = 0;
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 static void _stp_stack_user_sprint(char *str, int size, struct context* c,
b8e312
@@ -707,13 +714,20 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c,
b8e312
 	 * then call _stp_stack_print,
b8e312
 	 * then copy the result into the output string
b8e312
 	 * and clear the print buffer. */
b8e312
-	_stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id());
b8e312
-	_stp_print_flush();
b8e312
+	struct _stp_log *log;
b8e312
+	unsigned long flags;
b8e312
 
b8e312
-	_stp_stack_user_print(c, sym_flags);
b8e312
+	if (!_stp_print_trylock_irqsave(&flags)) {
b8e312
+		*str = '\0';
b8e312
+		return;
b8e312
+	}
b8e312
 
b8e312
-	strlcpy(str, pb->buf, size < (int)pb->len ? size : (int)pb->len);
b8e312
-	pb->len = 0;
b8e312
+	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
+	__stp_print_flush(log);
b8e312
+	_stp_stack_user_print(c, sym_flags);
b8e312
+	strlcpy(str, log->buf, min_t(int, size, log->len));
b8e312
+	log->len = 0;
b8e312
+	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
 
b8e312
 #endif /* _STACK_C_ */
b8e312
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
b8e312
index 57955334b..44e69b68c 100644
b8e312
--- a/runtime/transport/transport.c
b8e312
+++ b/runtime/transport/transport.c
b8e312
@@ -540,8 +540,8 @@ static void _stp_transport_close(void)
b8e312
 		   current->pid);
b8e312
 	_stp_cleanup_and_exit(0);
b8e312
 	_stp_unregister_ctl_channel();
b8e312
+	_stp_print_cleanup(); /* Requires the transport, so free this first */
b8e312
 	_stp_transport_fs_close();
b8e312
-	_stp_print_cleanup();	/* free print buffers */
b8e312
 	_stp_mem_debug_done();
b8e312
 
b8e312
 	dbug_trans(1, "---- CLOSED ----\n");
b8e312
diff --git a/runtime/vsprintf.c b/runtime/vsprintf.c
b8e312
index 28fd18f16..417d9f7f3 100644
b8e312
--- a/runtime/vsprintf.c
b8e312
+++ b/runtime/vsprintf.c
b8e312
@@ -542,6 +542,8 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
b8e312
 				   number of chars for from string */
b8e312
 	int qualifier;		/* 'h', 'l', or 'L' for integer fields */
b8e312
 	int num_bytes = 0;
b8e312
+	unsigned long irqflags = 0;
b8e312
+	bool got_print_lock = false;
b8e312
 
b8e312
 	/* Reject out-of-range values early */
b8e312
 	if (unlikely((int) size < 0))
b8e312
@@ -724,11 +726,14 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
b8e312
 	    num_bytes = STP_BUFFER_SIZE;
b8e312
 	  }
b8e312
 
b8e312
+	  if (!_stp_print_trylock_irqsave(&irqflags))
b8e312
+	    return 0;
b8e312
 	  str = (char*)_stp_reserve_bytes(num_bytes);
b8e312
 	  if (str == NULL) {
b8e312
 	    _stp_error("Couldn't reserve any print buffer space\n");
b8e312
-	    return 0;
b8e312
+	    goto err_unlock;
b8e312
 	  }
b8e312
+	  got_print_lock = true;
b8e312
 	  size = num_bytes;
b8e312
 	  end = str + size - 1;
b8e312
 
b8e312
@@ -820,8 +825,10 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
b8e312
 					field_width, precision,
b8e312
 					*fmt, flags);
b8e312
 			if (unlikely(str == NULL)) {
b8e312
-				if (num_bytes > 0)
b8e312
+				if (num_bytes > 0) {
b8e312
 					_stp_unreserve_bytes(num_bytes);
b8e312
+					goto err_unlock;
b8e312
+				}
b8e312
 				return 0;
b8e312
 			}
b8e312
 			continue;
b8e312
@@ -923,7 +930,14 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
b8e312
                   /* don't write out a null byte if the buf size is zero */
b8e312
                   *end = '\0';
b8e312
 	}
b8e312
+
b8e312
+	if (got_print_lock)
b8e312
+		_stp_print_unlock_irqrestore(&irqflags);
b8e312
 	return str-buf;
b8e312
+
b8e312
+err_unlock:
b8e312
+	_stp_print_unlock_irqrestore(&irqflags);
b8e312
+	return 0;
b8e312
 }
b8e312
 
b8e312
 #endif /* _VSPRINTF_C_ */
b8e312
diff --git a/translate.cxx b/translate.cxx
b8e312
index 53f1d0725..f0195486c 100644
b8e312
--- a/translate.cxx
b8e312
+++ b/translate.cxx
b8e312
@@ -1354,6 +1354,9 @@ c_unparser::emit_compiled_printfs ()
b8e312
       o->newline() << "unsigned long ptr_value;";
b8e312
       o->newline() << "int num_bytes;";
b8e312
 
b8e312
+      if (print_to_stream)
b8e312
+	  o->newline() << "unsigned long irqflags;";
b8e312
+
b8e312
       o->newline() << "(void) width;";
b8e312
       o->newline() << "(void) precision;";
b8e312
       o->newline() << "(void) ptr_value;";
b8e312
@@ -1452,7 +1455,9 @@ c_unparser::emit_compiled_printfs ()
b8e312
 	    }
b8e312
 
b8e312
 	  o->newline() << "num_bytes = clamp(num_bytes, 0, STP_BUFFER_SIZE);";
b8e312
-	  o->newline() << "str = (char*)_stp_reserve_bytes(num_bytes);";
b8e312
+	  o->newline() << "if (!_stp_print_trylock_irqsave(&irqflags))";
b8e312
+	  o->newline(1) << "return;";
b8e312
+	  o->newline(-1) << "str = (char*)_stp_reserve_bytes(num_bytes);";
b8e312
 	  o->newline() << "end = str ? str + num_bytes - 1 : 0;";
b8e312
         }
b8e312
       else // !print_to_stream
b8e312
@@ -1547,8 +1552,14 @@ c_unparser::emit_compiled_printfs ()
b8e312
 	      o->newline() << "if (unlikely(str == NULL)) {";
b8e312
 	      o->indent(1);
b8e312
 	      if (print_to_stream)
b8e312
+                {
b8e312
 		  o->newline() << "_stp_unreserve_bytes(num_bytes);";
b8e312
-	      o->newline() << "return;";
b8e312
+	          o->newline() << "goto err_unlock;";
b8e312
+                }
b8e312
+              else
b8e312
+                {
b8e312
+	          o->newline() << "return;";
b8e312
+                }
b8e312
 	      o->newline(-1) << "}";
b8e312
 	      break;
b8e312
 
b8e312
@@ -1575,6 +1586,11 @@ c_unparser::emit_compiled_printfs ()
b8e312
 
b8e312
       o->newline(-1) << "}";
b8e312
 
b8e312
+      if (print_to_stream)
b8e312
+        {
b8e312
+          o->newline(-1) << "err_unlock:";
b8e312
+          o->newline(1) << "_stp_print_unlock_irqrestore(&irqflags);";
b8e312
+        }
b8e312
       o->newline(-1) << "}";
b8e312
     }
b8e312
   o->newline() << "#endif // STP_LEGACY_PRINT";
b8e312
commit e8c7a2067ec7fc6315ee9bc34a010ec5f0369c5c
b8e312
Author: Frank Ch. Eigler <fche@redhat.com>
b8e312
Date:   Fri Dec 4 19:33:22 2020 -0500
b8e312
b8e312
    testsuite pr14536.stp: toughen
b8e312
    
b8e312
    This test case stresses nesting of heavy duty processing (backtrace
b8e312
    printing) within kernel interrupt processing paths.  It seems to
b8e312
    sometimes trigger problems - so let's make the test harder, so that
b8e312
    latent problems are more likely to show up.  Instead of quitting after the
b8e312
    first irq_* function hit, stick around for 10 seconds.
b8e312
b8e312
diff --git a/testsuite/systemtap.base/pr14546.stp b/testsuite/systemtap.base/pr14546.stp
b8e312
index 3e59a6f3a..e4c205a8f 100644
b8e312
--- a/testsuite/systemtap.base/pr14546.stp
b8e312
+++ b/testsuite/systemtap.base/pr14546.stp
b8e312
@@ -2,5 +2,6 @@ probe kernel.function("irq_*").call {
b8e312
       x = 10; y = 10; z = 10; w = 10
b8e312
       $1
b8e312
       assert(!(x != 10 || y != 10 || z != 10 || w != 10), "memory got corrupted by " . @1)
b8e312
-      exit()
b8e312
 }
b8e312
+
b8e312
+probe timer.s(10) { exit () }
b8e312
commit cd6399e621646856824ea96b11605a0f52011272
b8e312
Author: Frank Ch. Eigler <fche@redhat.com>
b8e312
Date:   Fri Dec 4 21:33:21 2020 -0500
b8e312
b8e312
    dyninst transport: add _stp_print_*lock_irq* stubs
b8e312
    
b8e312
    Recent code on the transport/linux side needs a few new (stub)
b8e312
    functions and type decls.
b8e312
b8e312
diff --git a/runtime/dyninst/print.c b/runtime/dyninst/print.c
b8e312
index 9d91224ba..c78def272 100644
b8e312
--- a/runtime/dyninst/print.c
b8e312
+++ b/runtime/dyninst/print.c
b8e312
@@ -107,4 +107,18 @@ static void _stp_print_char (const char c)
b8e312
 	}
b8e312
 }
b8e312
 
b8e312
+
b8e312
+/* no-op stub synchronization */
b8e312
+static bool _stp_print_trylock_irqsave(unsigned long *flags)
b8e312
+{
b8e312
+        (void) flags;
b8e312
+        return true;
b8e312
+}
b8e312
+
b8e312
+static void _stp_print_unlock_irqrestore(unsigned long *flags)
b8e312
+{
b8e312
+        (void) flags;
b8e312
+}
b8e312
+
b8e312
+
b8e312
 #endif /* _STAPDYN_PRINT_C_ */
b8e312
diff --git a/runtime/dyninst/runtime_defines.h b/runtime/dyninst/runtime_defines.h
b8e312
index 5c3dec519..d00c76a21 100644
b8e312
--- a/runtime/dyninst/runtime_defines.h
b8e312
+++ b/runtime/dyninst/runtime_defines.h
b8e312
@@ -7,3 +7,6 @@
b8e312
 #define STAPCONF_PAGEFAULT_DISABLE  1
b8e312
 #define pagefault_disable()
b8e312
 #define pagefault_enable()
b8e312
+
b8e312
+typedef int bool;
b8e312
+enum { false=0, true=1 };
b8e312
commit fd93cf71df80f7bb5aae707ea5a5875727a85770
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 9 12:55:10 2020 -0800
b8e312
b8e312
    PR26844: fix off-by-one error when copying printed backtraces
b8e312
    
b8e312
    Since log->buf isn't null-terminated, log->len represents the total
b8e312
    number of bytes present in the log buffer for copying. The use of
b8e312
    strlcpy() here with log->len as its size results in log->len - 1 bytes
b8e312
    being copied, with the log->len'th byte of the output buffer being set
b8e312
    to zero to terminate the string. Use memcpy() instead to remedy this,
b8e312
    while ensuring that the output buffer still has room for the
b8e312
    terminating NUL byte.
b8e312
b8e312
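    A minimal sketch of the difference (illustrative only; assumes log->buf
    holds the four bytes "abcd" with log->len == 4):

        char str[16];
        int bytes;

        /* strlcpy() bounded by log->len copies only log->len - 1 bytes
         * ("abc") and writes the NUL at str[3], silently dropping 'd'. */
        strlcpy(str, log->buf, 4);

        /* memcpy() plus explicit termination, as the fix below does,
         * copies all log->len bytes and still leaves room for the NUL. */
        bytes = min_t(int, sizeof(str) - 1, 4);
        memcpy(str, log->buf, bytes);
        str[bytes] = '\0';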
diff --git a/runtime/stack.c b/runtime/stack.c
b8e312
index da23d4395..85883d6c4 100644
b8e312
--- a/runtime/stack.c
b8e312
+++ b/runtime/stack.c
b8e312
@@ -692,6 +692,7 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c,
b8e312
 	 * and clear the print buffer. */
b8e312
 	struct _stp_log *log;
b8e312
 	unsigned long flags;
b8e312
+	int bytes;
b8e312
 
b8e312
 	if (!_stp_print_trylock_irqsave(&flags)) {
b8e312
 		*str = '\0';
b8e312
@@ -701,7 +702,9 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c,
b8e312
 	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
 	__stp_print_flush(log);
b8e312
 	_stp_stack_kernel_print(c, sym_flags);
b8e312
-	strlcpy(str, log->buf, min_t(int, size, log->len));
b8e312
+	bytes = min_t(int, size - 1, log->len);
b8e312
+	memcpy(str, log->buf, bytes);
b8e312
+	str[bytes] = '\0';
b8e312
 	log->len = 0;
b8e312
 	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
@@ -716,6 +719,7 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c,
b8e312
 	 * and clear the print buffer. */
b8e312
 	struct _stp_log *log;
b8e312
 	unsigned long flags;
b8e312
+	int bytes;
b8e312
 
b8e312
 	if (!_stp_print_trylock_irqsave(&flags)) {
b8e312
 		*str = '\0';
b8e312
@@ -725,7 +729,9 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c,
b8e312
 	log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id());
b8e312
 	__stp_print_flush(log);
b8e312
 	_stp_stack_user_print(c, sym_flags);
b8e312
-	strlcpy(str, log->buf, min_t(int, size, log->len));
b8e312
+	bytes = min_t(int, size - 1, log->len);
b8e312
+	memcpy(str, log->buf, bytes);
b8e312
+	str[bytes] = '\0';
b8e312
 	log->len = 0;
b8e312
 	_stp_print_unlock_irqrestore(&flags);
b8e312
 }
b8e312
commit 8819e2a04596deb2fe427d261bebcaf3d2620dfb
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 9 17:22:20 2020 -0800
b8e312
b8e312
    always use per-cpu bulkmode relayfs files to communicate with userspace
b8e312
    
b8e312
    Using a mutex_trylock() in __stp_print_flush() leads to a lot of havoc,
b8e312
    for numerous reasons. Firstly, since __stp_print_flush() can be called from IRQ
b8e312
    context, holding the inode mutex from here would make the mutex owner
b8e312
    become nonsense, since mutex locks can only be held in contexts backed
b8e312
    by the scheduler. Secondly, the mutex_trylock implementation has a
b8e312
    spin_lock() inside of it that leads to two issues: IRQs aren't disabled
b8e312
    when acquiring this spin_lock(), so using it from IRQ context can lead
b8e312
    to a deadlock, and since spin locks can have tracepoints via
b8e312
    lock_acquire(), the spin_lock() can recurse on itself inside a stap
b8e312
    probe and deadlock, like so:
b8e312
    
b8e312
     #0 [ffff88017f6d7a08] kvm_wait at ffffffff81079f5a
b8e312
     #1 [ffff88017f6d7a30] __pv_queued_spin_lock_slowpath at ffffffff8114f51e
b8e312
     #2 [ffff88017f6d7a70] queued_spin_lock_slowpath at ffffffff810e842b
b8e312
     #3 [ffff88017f6d7a80] mutex_trylock at ffffffff81882b1b
b8e312
     #4 [ffff88017f6d7ab8] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #5 [ffff88017f6d7ad8] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #6 [ffff88017f6d7b10] probe_7879 at ffffffffc0a98c85 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #7 [ffff88017f6d7b38] enter_real_tracepoint_probe_1543 at ffffffffc0c3b757 [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #8 [ffff88017f6d7b70] enter_tracepoint_probe_1543 at ffffffffc09b117e [stap_47650d3377d05db0ab7cbbaa25765809__11657]
b8e312
     #9 [ffff88017f6d7b80] lock_acquire at ffffffff811460ba
b8e312
    
b8e312
    The reason the mutex_trylock() was needed in the first place was that
b8e312
    staprun doesn't properly use the relayfs API when reading buffers in
b8e312
    non-bulk mode. It tries to read all CPUs' buffers from a single thread,
b8e312
    when it should be reading each CPU's buffer from a thread running on
b8e312
    said CPU in order to utilize relayfs' synchronization guarantees, which
b8e312
    are made by disabling IRQs on the local CPU when a buffer is modified.
b8e312
    
b8e312
    This change makes staprun always use per-CPU threads to read print
b8e312
    buffers so that we don't need the mutex_trylock() in the print flush
b8e312
    routine, which resolves a wide variety of serious bugs.
b8e312
    
b8e312
    We also need to adjust the transport sub-buffer count to accommodate
b8e312
    frequent print flushing. The sub-buffer size is now reduced to match the
b8e312
    log buffer size, which is 8192 by default, and the number of sub-buffers
b8e312
    is increased to 256. This uses exactly the same amount of memory as
b8e312
    before.
b8e312
b8e312
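    For reference, the arithmetic behind "exactly the same amount of memory"
    (assuming STP_BUFFER_SIZE keeps its default of 8192 bytes):

        old:   8 sub-buffers x (65536*4) bytes =   8 x 256 KiB = 2 MiB per cpu
        new: 256 sub-buffers x     8192  bytes = 256 x   8 KiB = 2 MiB per cpu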
diff --git a/runtime/print_flush.c b/runtime/print_flush.c
b8e312
index acd6a32d9..f4d72d30f 100644
b8e312
--- a/runtime/print_flush.c
b8e312
+++ b/runtime/print_flush.c
b8e312
@@ -18,6 +18,7 @@
b8e312
 
b8e312
 static void __stp_print_flush(struct _stp_log *log)
b8e312
 {
b8e312
+	char *bufp = log->buf;
b8e312
 	size_t len = log->len;
b8e312
 	void *entry = NULL;
b8e312
 
b8e312
@@ -26,126 +27,20 @@ static void __stp_print_flush(struct _stp_log *log)
b8e312
 		return;
b8e312
 
b8e312
 	log->len = 0;
b8e312
-
b8e312
 	dbug_trans(1, "len = %zu\n", len);
b8e312
-
b8e312
-#ifdef STP_BULKMODE
b8e312
-#ifdef NO_PERCPU_HEADERS
b8e312
-	{
b8e312
-		char *bufp = log->buf;
b8e312
-		int inode_locked;
b8e312
-
b8e312
-		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
-			atomic_inc (&_stp_transport_failures);
b8e312
-#ifndef STP_TRANSPORT_RISKY
b8e312
-			return;
b8e312
-#endif
b8e312
-		}
b8e312
-
b8e312
-		while (len > 0) {
b8e312
-			size_t bytes_reserved;
b8e312
-
b8e312
-			bytes_reserved = _stp_data_write_reserve(len, &entry);
b8e312
-			if (likely(entry && bytes_reserved > 0)) {
b8e312
-				memcpy(_stp_data_entry_data(entry), bufp,
b8e312
-				       bytes_reserved);
b8e312
-				_stp_data_write_commit(entry);
b8e312
-				bufp += bytes_reserved;
b8e312
-				len -= bytes_reserved;
b8e312
-			}
b8e312
-			else {
b8e312
-				atomic_inc(&_stp_transport_failures);
b8e312
-				break;
b8e312
-			}
b8e312
-		}
b8e312
-
b8e312
-		if (inode_locked)
b8e312
-			_stp_transport_unlock_relay_inode();
b8e312
-	}
b8e312
-
b8e312
-#else  /* !NO_PERCPU_HEADERS */
b8e312
-
b8e312
-	{
b8e312
-		char *bufp = log->buf;
b8e312
-		struct _stp_trace t = {	.sequence = _stp_seq_inc(),
b8e312
-					.pdu_len = len};
b8e312
+	do {
b8e312
 		size_t bytes_reserved;
b8e312
-		int inode_locked;
b8e312
 
b8e312
-		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
-			atomic_inc (&_stp_transport_failures);
b8e312
-#ifndef STP_TRANSPORT_RISKY
b8e312
-			return;
b8e312
-#endif
b8e312
-		}
b8e312
-
b8e312
-		bytes_reserved = _stp_data_write_reserve(sizeof(struct _stp_trace), &entry);
b8e312
-		if (likely(entry && bytes_reserved > 0)) {
b8e312
-			/* prevent unaligned access by using memcpy() */
b8e312
-			memcpy(_stp_data_entry_data(entry), &t, sizeof(t));
b8e312
+		bytes_reserved = _stp_data_write_reserve(len, &entry);
b8e312
+		if (likely(entry && bytes_reserved)) {
b8e312
+			memcpy(_stp_data_entry_data(entry), bufp,
b8e312
+			       bytes_reserved);
b8e312
 			_stp_data_write_commit(entry);
b8e312
-		}
b8e312
-		else {
b8e312
+			bufp += bytes_reserved;
b8e312
+			len -= bytes_reserved;
b8e312
+		} else {
b8e312
 			atomic_inc(&_stp_transport_failures);
b8e312
-			goto done;
b8e312
+			break;
b8e312
 		}
b8e312
-
b8e312
-		while (len > 0) {
b8e312
-			bytes_reserved = _stp_data_write_reserve(len, &entry);
b8e312
-			if (likely(entry && bytes_reserved > 0)) {
b8e312
-				memcpy(_stp_data_entry_data(entry), bufp,
b8e312
-				       bytes_reserved);
b8e312
-				_stp_data_write_commit(entry);
b8e312
-				bufp += bytes_reserved;
b8e312
-				len -= bytes_reserved;
b8e312
-			}
b8e312
-			else {
b8e312
-				atomic_inc(&_stp_transport_failures);
b8e312
-				break;
b8e312
-			}
b8e312
-		}
b8e312
-
b8e312
-	done:
b8e312
-
b8e312
-		if (inode_locked)
b8e312
-			_stp_transport_unlock_relay_inode();
b8e312
-	}
b8e312
-#endif /* !NO_PERCPU_HEADERS */
b8e312
-
b8e312
-#else  /* !STP_BULKMODE */
b8e312
-
b8e312
-	{
b8e312
-		char *bufp = log->buf;
b8e312
-		int inode_locked;
b8e312
-
b8e312
-		if (!(inode_locked = _stp_transport_trylock_relay_inode())) {
b8e312
-			atomic_inc (&_stp_transport_failures);
b8e312
-#ifndef STP_TRANSPORT_RISKY
b8e312
-			dbug_trans(0, "discarding %zu bytes of data\n", len);
b8e312
-			return;
b8e312
-#endif
b8e312
-		}
b8e312
-
b8e312
-		dbug_trans(1, "calling _stp_data_write...\n");
b8e312
-		while (len > 0) {
b8e312
-			size_t bytes_reserved;
b8e312
-
b8e312
-			bytes_reserved = _stp_data_write_reserve(len, &entry);
b8e312
-			if (likely(entry && bytes_reserved > 0)) {
b8e312
-				memcpy(_stp_data_entry_data(entry), bufp,
b8e312
-				       bytes_reserved);
b8e312
-				_stp_data_write_commit(entry);
b8e312
-				bufp += bytes_reserved;
b8e312
-				len -= bytes_reserved;
b8e312
-			}
b8e312
-			else {
b8e312
-			    atomic_inc(&_stp_transport_failures);
b8e312
-			    break;
b8e312
-			}
b8e312
-		}
b8e312
-
b8e312
-		if (inode_locked)
b8e312
-			_stp_transport_unlock_relay_inode();
b8e312
-	}
b8e312
-#endif /* !STP_BULKMODE */
b8e312
+	} while (len > 0);
b8e312
 }
b8e312
diff --git a/runtime/transport/relay_v2.c b/runtime/transport/relay_v2.c
b8e312
index ff621f71d..2ba5eea7d 100644
b8e312
--- a/runtime/transport/relay_v2.c
b8e312
+++ b/runtime/transport/relay_v2.c
b8e312
@@ -67,7 +67,7 @@ static size_t __stp_relay_switch_subbuf(struct rchan_buf *buf, size_t length)
b8e312
 		return 0;
b8e312
 
b8e312
 	if (unlikely(length > buf->chan->subbuf_size))
b8e312
-		goto toobig;
b8e312
+		length = buf->chan->subbuf_size;
b8e312
 
b8e312
 	if (buf->offset != buf->chan->subbuf_size + 1) {
b8e312
 		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
b8e312
@@ -98,14 +98,7 @@ static size_t __stp_relay_switch_subbuf(struct rchan_buf *buf, size_t length)
b8e312
 	buf->data = new;
b8e312
 	buf->padding[new_subbuf] = 0;
b8e312
 
b8e312
-	if (unlikely(length + buf->offset > buf->chan->subbuf_size))
b8e312
-		goto toobig;
b8e312
-
b8e312
 	return length;
b8e312
-
b8e312
-toobig:
b8e312
-	buf->chan->last_toobig = length;
b8e312
-	return 0;
b8e312
 }
b8e312
 
b8e312
 static void __stp_relay_wakeup_readers(struct rchan_buf *buf)
b8e312
@@ -117,24 +110,17 @@ static void __stp_relay_wakeup_readers(struct rchan_buf *buf)
b8e312
 
b8e312
 static void __stp_relay_wakeup_timer(stp_timer_callback_parameter_t unused)
b8e312
 {
b8e312
-#ifdef STP_BULKMODE
b8e312
 	int i;
b8e312
-#endif
b8e312
 
b8e312
 	if (atomic_read(&_stp_relay_data.wakeup)) {
b8e312
 		struct rchan_buf *buf;
b8e312
 		
b8e312
 		atomic_set(&_stp_relay_data.wakeup, 0);
b8e312
-#ifdef STP_BULKMODE
b8e312
 		for_each_possible_cpu(i) {
b8e312
 			buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
b8e312
 						    i);
b8e312
 			__stp_relay_wakeup_readers(buf);
b8e312
 		}
b8e312
-#else
b8e312
-		buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
b8e312
-		__stp_relay_wakeup_readers(buf);
b8e312
-#endif
b8e312
 	}
b8e312
 
b8e312
 	if (atomic_read(&_stp_relay_data.transport_state) == STP_TRANSPORT_RUNNING)
b8e312
@@ -235,55 +221,8 @@ static void _stp_transport_data_fs_stop(void)
b8e312
 		atomic_set (&_stp_relay_data.transport_state, STP_TRANSPORT_STOPPED);
b8e312
 		del_timer_sync(&_stp_relay_data.timer);
b8e312
 		dbug_trans(0, "flushing...\n");
b8e312
-		if (_stp_relay_data.rchan) {
b8e312
-			struct rchan_buf *buf;
b8e312
-
b8e312
-			/* NB we cannot call relay_flush() directly here since
b8e312
-			 * we need to do inode locking ourselves.
b8e312
-			 */
b8e312
-
b8e312
-#ifdef STP_BULKMODE
b8e312
-			unsigned int i;
b8e312
-			struct rchan *rchan = _stp_relay_data.rchan;
b8e312
-
b8e312
-			for_each_possible_cpu(i) {
b8e312
-				buf = _stp_get_rchan_subbuf(rchan->buf, i);
b8e312
-				if (buf) {
b8e312
-					struct inode *inode = buf->dentry->d_inode;
b8e312
-
b8e312
-					/* NB we are in the syscall context which
b8e312
-					 * allows sleeping. The following inode
b8e312
-					 * locking might sleep. See PR26131. */
b8e312
-					_stp_lock_inode(inode);
b8e312
-
b8e312
-					/* NB we intentionally avoids calling
b8e312
-					 * our own __stp_relay_switch_subbuf()
b8e312
-					 * since here we can sleep. */
b8e312
-					relay_switch_subbuf(buf, 0);
b8e312
-
b8e312
-					_stp_unlock_inode(inode);
b8e312
-				}
b8e312
-			}
b8e312
-#else  /* !STP_BULKMODE */
b8e312
-			buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0);
b8e312
-
b8e312
-			if (buf != NULL) {
b8e312
-				struct inode *inode = buf->dentry->d_inode;
b8e312
-
b8e312
-				/* NB we are in the syscall context which allows
b8e312
-				 * sleeping. The following inode locking might
b8e312
-				 * sleep. See PR26131. */
b8e312
-				_stp_lock_inode(inode);
b8e312
-
b8e312
-				/* NB we intentionally avoids calling
b8e312
-				 * our own __stp_relay_switch_subbuf()
b8e312
-				 * since here we can sleep. */
b8e312
-				relay_switch_subbuf(buf, 0);
b8e312
-
b8e312
-				_stp_unlock_inode(inode);
b8e312
-			}
b8e312
-#endif
b8e312
-		}
b8e312
+		if (_stp_relay_data.rchan)
b8e312
+			relay_flush(_stp_relay_data.rchan);
b8e312
 	}
b8e312
 }
b8e312
 
b8e312
@@ -308,9 +247,7 @@ static int _stp_transport_data_fs_init(void)
b8e312
 
b8e312
 	/* Create "trace" file. */
b8e312
 	npages = _stp_subbuf_size * _stp_nsubbufs;
b8e312
-#ifdef STP_BULKMODE
b8e312
 	npages *= num_online_cpus();
b8e312
-#endif
b8e312
 	npages >>= PAGE_SHIFT;
b8e312
 	si_meminfo(&si);
b8e312
 #define MB(i) (unsigned long)((i) >> (20 - PAGE_SHIFT))
b8e312
@@ -347,9 +284,7 @@ static int _stp_transport_data_fs_init(void)
b8e312
         {
b8e312
                 u64 relay_mem;
b8e312
                 relay_mem = _stp_subbuf_size * _stp_nsubbufs;
b8e312
-#ifdef STP_BULKMODE
b8e312
                 relay_mem *= num_online_cpus();
b8e312
-#endif
b8e312
                 _stp_allocated_net_memory += relay_mem;
b8e312
                 _stp_allocated_memory += relay_mem;
b8e312
         }
b8e312
@@ -386,12 +321,7 @@ _stp_data_write_reserve(size_t size_request, void **entry)
b8e312
 		return -EINVAL;
b8e312
 
b8e312
 	buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
b8e312
-#ifdef STP_BULKMODE
b8e312
-				    smp_processor_id()
b8e312
-#else
b8e312
-				    0
b8e312
-#endif
b8e312
-				    );
b8e312
+				    smp_processor_id());
b8e312
 	if (unlikely(buf->offset + size_request > buf->chan->subbuf_size)) {
b8e312
 		size_request = __stp_relay_switch_subbuf(buf, size_request);
b8e312
 		if (!size_request)
b8e312
@@ -411,65 +341,10 @@ static unsigned char *_stp_data_entry_data(void *entry)
b8e312
 
b8e312
 static int _stp_data_write_commit(void *entry)
b8e312
 {
b8e312
-	/* Nothing to do here. */
b8e312
-	return 0;
b8e312
-}
b8e312
-
b8e312
-static noinline int _stp_transport_trylock_relay_inode(void)
b8e312
-{
b8e312
-	unsigned i;
b8e312
 	struct rchan_buf *buf;
b8e312
-	struct inode *inode;
b8e312
-#ifdef DEBUG_TRANS
b8e312
-	cycles_t begin;
b8e312
-#endif
b8e312
 
b8e312
 	buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
b8e312
-#ifdef STP_BULKMODE
b8e312
-				    smp_processor_id()
b8e312
-#else
b8e312
-				    0
b8e312
-#endif
b8e312
-				    );
b8e312
-	if (buf == NULL)
b8e312
-		return 0;
b8e312
-
b8e312
-	inode = buf->dentry->d_inode;
b8e312
-
b8e312
-#ifdef DEBUG_TRANS
b8e312
-	begin = get_cycles();
b8e312
-#endif
b8e312
-
b8e312
-	/* NB this bounded spinlock is needed for stream mode. it is observed
b8e312
-	 * that almost all of the iterations needed are less than 50K iterations
b8e312
-	 * or about 300K cycles.
b8e312
-	 */
b8e312
-	for (i = 0; i < 50 * 1000; i++) {
b8e312
-		if (_stp_trylock_inode(inode)) {
b8e312
-			dbug_trans(3, "got inode lock: i=%u: cycles: %llu", i,
b8e312
-				   get_cycles() - begin);
b8e312
-			return 1;
b8e312
-		}
b8e312
-	}
b8e312
-
b8e312
-	dbug_trans(0, "failed to get inode lock: i=%u: cycles: %llu", i,
b8e312
-		   get_cycles() - begin);
b8e312
+				    smp_processor_id());
b8e312
+	__stp_relay_switch_subbuf(buf, 0);
b8e312
 	return 0;
b8e312
 }
b8e312
-
b8e312
-static void _stp_transport_unlock_relay_inode(void)
b8e312
-{
b8e312
-	struct rchan_buf *buf;
b8e312
-
b8e312
-	buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf,
b8e312
-#ifdef STP_BULKMODE
b8e312
-				    smp_processor_id()
b8e312
-#else
b8e312
-				    0
b8e312
-#endif
b8e312
-				    );
b8e312
-	if (buf == NULL)
b8e312
-		return;
b8e312
-
b8e312
-	_stp_unlock_inode(buf->dentry->d_inode);
b8e312
-}
b8e312
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
b8e312
index 96426eb7b..1be3e9485 100644
b8e312
--- a/runtime/transport/transport.c
b8e312
+++ b/runtime/transport/transport.c
b8e312
@@ -49,7 +49,6 @@ static int _stp_probes_started = 0;
b8e312
  * transport state flag is atomic. */
b8e312
 static atomic_t _stp_transport_state = ATOMIC_INIT(_STP_TS_UNINITIALIZED);
b8e312
 
b8e312
-static inline int _stp_trylock_inode(struct inode *inode);
b8e312
 static inline void _stp_lock_inode(struct inode *inode);
b8e312
 static inline void _stp_unlock_inode(struct inode *inode);
b8e312
 
b8e312
@@ -70,8 +69,8 @@ static inline void _stp_unlock_inode(struct inode *inode);
b8e312
 #include "procfs.c"
b8e312
 #include "control.c"
b8e312
 
b8e312
-static unsigned _stp_nsubbufs = 8;
b8e312
-static unsigned _stp_subbuf_size = 65536*4;
b8e312
+static unsigned _stp_nsubbufs = 256;
b8e312
+static unsigned _stp_subbuf_size = STP_BUFFER_SIZE;
b8e312
 
b8e312
 /* module parameters */
b8e312
 static int _stp_bufsize;
b8e312
@@ -643,23 +642,6 @@ err0:
b8e312
 	return -1;
b8e312
 }
b8e312
 
b8e312
-/* returns 1 when the lock is successfully acquired, 0 otherwise. */
b8e312
-static inline int _stp_trylock_inode(struct inode *inode)
b8e312
-{
b8e312
-#ifdef STAPCONF_INODE_RWSEM
b8e312
-	return inode_trylock(inode);
b8e312
-#else
b8e312
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
b8e312
-	return mutex_trylock(&inode->i_mutex);
b8e312
-#else
b8e312
-	/* NB down_trylock() uses a different convention where 0 means
b8e312
-	 * the lock is successfully acquired.
b8e312
-	 */
b8e312
-	return !down_trylock(&inode->i_sem);
b8e312
-#endif
b8e312
-#endif
b8e312
-}
b8e312
-
b8e312
 static inline void _stp_lock_inode(struct inode *inode)
b8e312
 {
b8e312
 #ifdef STAPCONF_INODE_RWSEM
b8e312
diff --git a/runtime/transport/transport.h b/runtime/transport/transport.h
b8e312
index 51723b7f5..cc09fc0ae 100644
b8e312
--- a/runtime/transport/transport.h
b8e312
+++ b/runtime/transport/transport.h
b8e312
@@ -98,24 +98,6 @@ enum _stp_transport_state {
b8e312
  */
b8e312
 static enum _stp_transport_state _stp_transport_get_state(void);
b8e312
 
b8e312
-/*
b8e312
- * _stp_transport_trylock_relay_inode
b8e312
- *
b8e312
- * This function locks the relay file inode to protect against relay readers
b8e312
- * (i.e., staprun/stapio).
b8e312
- * Returns whether the lock is successfully obtained.
b8e312
- */
b8e312
-static noinline int _stp_transport_trylock_relay_inode(void);
b8e312
-
b8e312
-/*
b8e312
- * _stp_transport_unlock_relay_inode
b8e312
- *
b8e312
- * This function releases the lock obtained by
b8e312
- * _stp_transport_trylock_relay_inode.
b8e312
- * should only call this when the lock is indeed obtained.
b8e312
- */
b8e312
-static void _stp_transport_unlock_relay_inode(void);
b8e312
-
b8e312
 /*
b8e312
  * _stp_transport_data_fs_init
b8e312
  *
b8e312
diff --git a/staprun/relay.c b/staprun/relay.c
b8e312
index 2f5f2e06a..c76e76719 100644
b8e312
--- a/staprun/relay.c
b8e312
+++ b/staprun/relay.c
b8e312
@@ -131,6 +131,7 @@ static void *reader_thread(void *data)
b8e312
 	sigset_t sigs;
b8e312
 	off_t wsize = 0;
b8e312
 	int fnum = 0;
b8e312
+	cpu_set_t cpu_mask;
b8e312
 
b8e312
 	sigemptyset(&sigs);
b8e312
 	sigaddset(&sigs,SIGUSR2);
b8e312
@@ -139,21 +140,18 @@ static void *reader_thread(void *data)
b8e312
 	sigfillset(&sigs);
b8e312
 	sigdelset(&sigs,SIGUSR2);
b8e312
 
b8e312
-	if (bulkmode) {
b8e312
-		cpu_set_t cpu_mask;
b8e312
-		CPU_ZERO(&cpu_mask);
b8e312
-		CPU_SET(cpu, &cpu_mask);
b8e312
-		if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 )
b8e312
-			_perr("sched_setaffinity");
b8e312
+	CPU_ZERO(&cpu_mask);
b8e312
+	CPU_SET(cpu, &cpu_mask);
b8e312
+	if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 )
b8e312
+		_perr("sched_setaffinity");
b8e312
 #ifdef NEED_PPOLL
b8e312
-		/* Without a real ppoll, there is a small race condition that could */
b8e312
-		/* block ppoll(). So use a timeout to prevent that. */
b8e312
-		timeout->tv_sec = 10;
b8e312
-		timeout->tv_nsec = 0;
b8e312
+	/* Without a real ppoll, there is a small race condition that could */
b8e312
+	/* block ppoll(). So use a timeout to prevent that. */
b8e312
+	timeout->tv_sec = 10;
b8e312
+	timeout->tv_nsec = 0;
b8e312
 #else
b8e312
-		timeout = NULL;
b8e312
+	timeout = NULL;
b8e312
 #endif
b8e312
-	}
b8e312
 
b8e312
         if (reader_timeout_ms && timeout) {
b8e312
                 timeout->tv_sec = reader_timeout_ms / 1000;
b8e312
@@ -358,11 +356,6 @@ int init_relayfs(void)
b8e312
 		_err("couldn't open %s.\n", buf);
b8e312
 		return -1;
b8e312
 	}
b8e312
-	if (ncpus > 1 && bulkmode == 0) {
b8e312
-		_err("ncpus=%d, bulkmode = %d\n", ncpus, bulkmode);
b8e312
-		_err("This is inconsistent! Please file a bug report. Exiting now.\n");
b8e312
-		return -1;
b8e312
-	}
b8e312
 
b8e312
         /* PR7097 */
b8e312
         if (load_only)
b8e312
commit d86b64029598f69b47d9cf4295f30b7093f38cfc
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 9 17:22:27 2020 -0800
b8e312
b8e312
    Revert "REVERTME: tapset-timers: work around on-the-fly deadlocks caused by mutex_trylock"
b8e312
    
b8e312
    This reverts commit 6a27888b118b7a94650a68aae028957cdd5fb5f5.
b8e312
    
b8e312
    No longer needed. As promised, we're reverting this.
b8e312
b8e312
diff --git a/tapset-timers.cxx b/tapset-timers.cxx
b8e312
index 503498c85..10da17cda 100644
b8e312
--- a/tapset-timers.cxx
b8e312
+++ b/tapset-timers.cxx
b8e312
@@ -391,11 +391,11 @@ hrtimer_derived_probe_group::emit_module_refresh (systemtap_session& s)
b8e312
   s.op->newline(+1) <<   "struct stap_hrtimer_probe* stp = &stap_hrtimer_probes[i];";
b8e312
   // timer disabled, but condition says enabled?
b8e312
   s.op->newline( 0) <<   "if (!stp->enabled && stp->probe->cond_enabled) {";
b8e312
-  s.op->newline(+1) <<     "//dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
+  s.op->newline(+1) <<     "dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
   s.op->newline( 0) <<     "_stp_hrtimer_start(stp);";
b8e312
   // timer enabled, but condition says disabled?
b8e312
   s.op->newline(-1) <<   "} else if (stp->enabled && !stp->probe->cond_enabled) {";
b8e312
-  s.op->newline(+1) <<     "//dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
+  s.op->newline(+1) <<     "dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);";
b8e312
   s.op->newline( 0) <<     "_stp_hrtimer_cancel(stp);";
b8e312
   s.op->newline(-1) <<   "}";
b8e312
   s.op->newline( 0) <<   "stp->enabled = stp->probe->cond_enabled;";
b8e312
commit 3abe2c40b2dae499aff2e31beff121fbe43f7654
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Fri Dec 11 12:31:25 2020 -0800
b8e312
b8e312
    transport: set is_global to zero even when bulkmode is disabled
b8e312
    
b8e312
    This is needed now that we always want per-cpu logger threads. When
b8e312
    is_global is set to a non-zero value, relay won't create per-cpu log
b8e312
    files.
b8e312
b8e312
diff --git a/runtime/transport/debugfs.c b/runtime/transport/debugfs.c
b8e312
index 28a5bf89d..bd20281a0 100644
b8e312
--- a/runtime/transport/debugfs.c
b8e312
+++ b/runtime/transport/debugfs.c
b8e312
@@ -256,13 +256,8 @@ __stp_debugfs_relay_create_buf_file_callback(const char *filename,
b8e312
 	 * cause relay_open() to create a single global buffer rather
b8e312
 	 * than the default set of per-cpu buffers.
b8e312
 	 */
b8e312
-	if (is_global) {
b8e312
-#ifdef STP_BULKMODE
b8e312
+	if (is_global)
b8e312
 		*is_global = 0;
b8e312
-#else
b8e312
-		*is_global = 1;
b8e312
-#endif
b8e312
-	}
b8e312
 
b8e312
 	if (IS_ERR(file)) {
b8e312
 		file = NULL;
b8e312
diff --git a/runtime/transport/procfs.c b/runtime/transport/procfs.c
b8e312
index 262409356..b0a5d5760 100644
b8e312
--- a/runtime/transport/procfs.c
b8e312
+++ b/runtime/transport/procfs.c
b8e312
@@ -328,13 +328,8 @@ __stp_procfs_relay_create_buf_file_callback(const char *filename,
b8e312
   unsigned i = 0;
b8e312
   struct inode* in;
b8e312
   
b8e312
-  if (is_global) {
b8e312
-#ifdef STP_BULKMODE
b8e312
+  if (is_global)
b8e312
           *is_global = 0;
b8e312
-#else
b8e312
-          *is_global = 1;
b8e312
-#endif
b8e312
-  }
b8e312
   
b8e312
   if (parent != _stp_procfs_module_dir_path.dentry)
b8e312
     goto out;
b8e312
commit a26bf7890196395d73ac193b23e271398731745d
b8e312
Author: Frank Ch. Eigler <fche@redhat.com>
b8e312
Date:   Fri Dec 11 15:39:29 2020 -0500
b8e312
b8e312
    relay transport: comment on STP_BULK message
b8e312
    
b8e312
    While we've eliminated any STP_BULKMODE effects from the way relayfs
b8e312
    files are used ("always bulkmode"), staprun/stapio still need to know
b8e312
    whether the user intended "stap -b" or not, so they can save the
b8e312
    stpd_cpu* files separately.
b8e312
b8e312
diff --git a/runtime/transport/control.c b/runtime/transport/control.c
b8e312
index 9343b3c28..d123bef2f 100644
b8e312
--- a/runtime/transport/control.c
b8e312
+++ b/runtime/transport/control.c
b8e312
@@ -88,6 +88,9 @@ static ssize_t _stp_ctl_write_cmd(struct file *file, const char __user *buf, siz
b8e312
 		break;
b8e312
 
b8e312
 	case STP_BULK:
b8e312
+                // NB: this signals the runtime to save separate
b8e312
+                // per-cpu files; our kernel->userspace files are now
b8e312
+                // always bulkmode (trace$N files).
b8e312
 #ifdef STP_BULKMODE
b8e312
                 // no action needed
b8e312
                 break;
b8e312
commit b43eb4ed690bf2421978ed2896667e45e60c3400
b8e312
Author: Cosmin Tanislav <demonsingur@gmail.com>
b8e312
Date:   Thu Dec 10 16:48:54 2020 -0500
b8e312
b8e312
    bugfix: runtime: transport: handle more error cases in module init
b8e312
    
b8e312
    Signed-off-by: Sultan Alsawaf <sultan@openresty.com>
b8e312
b8e312
diff --git a/runtime/transport/relay_v2.c b/runtime/transport/relay_v2.c
b8e312
index 2ba5eea7d..27729f4c8 100644
b8e312
--- a/runtime/transport/relay_v2.c
b8e312
+++ b/runtime/transport/relay_v2.c
b8e312
@@ -277,6 +277,7 @@ static int _stp_transport_data_fs_init(void)
b8e312
 #endif  /* (RELAYFS_CHANNEL_VERSION < 7) */
b8e312
 	if (!_stp_relay_data.rchan) {
b8e312
 		rc = -ENOENT;
b8e312
+		errk("%s: relay_open() failed: %d\n", THIS_MODULE->name, rc);
b8e312
 		goto err;
b8e312
 	}
b8e312
         /* Increment _stp_allocated_memory and _stp_allocated_net_memory to account for buffers
b8e312
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
b8e312
index 1be3e9485..f005e14e2 100644
b8e312
--- a/runtime/transport/transport.c
b8e312
+++ b/runtime/transport/transport.c
b8e312
@@ -552,6 +552,8 @@ static void _stp_transport_close(void)
b8e312
  */
b8e312
 static int _stp_transport_init(void)
b8e312
 {
b8e312
+	int ret;
b8e312
+
b8e312
 	dbug_trans(1, "transport_init\n");
b8e312
 #ifdef STAPCONF_TASK_UID
b8e312
 	_stp_uid = current->uid;
b8e312
@@ -603,20 +605,28 @@ static int _stp_transport_init(void)
b8e312
 		dbug_trans(1, "Using %d subbufs of size %d\n", _stp_nsubbufs, _stp_subbuf_size);
b8e312
 	}
b8e312
 
b8e312
-	if (_stp_transport_fs_init(THIS_MODULE->name) != 0)
b8e312
+	ret = _stp_transport_fs_init(THIS_MODULE->name);
b8e312
+	if (ret)
b8e312
 		goto err0;
b8e312
 
b8e312
 	/* create control channel */
b8e312
-	if (_stp_register_ctl_channel() < 0)
b8e312
+	ret = _stp_register_ctl_channel();
b8e312
+	if (ret < 0)
b8e312
 		goto err1;
b8e312
 
b8e312
 	/* create print buffers */
b8e312
-	if (_stp_print_init() < 0)
b8e312
+	ret = _stp_print_init();
b8e312
+	if (ret < 0) {
b8e312
+		errk("%s: can't create print buffers!", THIS_MODULE->name);
b8e312
 		goto err2;
b8e312
+	}
b8e312
 
b8e312
 	/* set _stp_module_self dynamic info */
b8e312
-	if (_stp_module_update_self() < 0)
b8e312
+	ret = _stp_module_update_self();
b8e312
+	if (ret < 0) {
b8e312
+		errk("%s: can't update dynamic info!", THIS_MODULE->name);
b8e312
 		goto err3;
b8e312
+	}
b8e312
 
b8e312
 	/* start transport */
b8e312
 	_stp_transport_data_fs_start();
b8e312
@@ -639,7 +649,7 @@ err2:
b8e312
 err1:
b8e312
 	_stp_transport_fs_close();
b8e312
 err0:
b8e312
-	return -1;
b8e312
+	return ret;
b8e312
 }
b8e312
 
b8e312
 static inline void _stp_lock_inode(struct inode *inode)
b8e312
commit 341bf33f14062269c52bcebaa309518d9972ca00
b8e312
Author: Frank Ch. Eigler <fche@redhat.com>
b8e312
Date:   Fri Dec 11 18:06:36 2020 -0500
b8e312
b8e312
    staprun: handle more and fewer cpus better
b8e312
    
b8e312
    NR_CPUS was a hard-coded minimum and maximum on the number of CPUs
b8e312
    worth of trace$N files staprun/stapio would open at startup.  While a
b8e312
    constant is useful for array sizing (and so might as well be really
b8e312
    large), the actual iteration should be informed by get_nprocs_conf(3).
b8e312
    
b8e312
    This patch replaces NR_CPUS with MAX_NR_CPUS (now 1024, why not), and
b8e312
    limits open/thread iterations to the actual number of processors.  It
b8e312
    even prints an error if a behemoth >1K-core machine comes into being.
b8e312
b8e312
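    A condensed sketch of the convention (illustrative only; the helper name is
    hypothetical): arrays keep the compile-time MAX_NR_CPUS bound, while the
    open/thread loops are bounded by the CPU count reported at runtime.

        #include <sys/sysinfo.h>

        static int relay_fd[MAX_NR_CPUS];    /* static sizing: compile-time bound */

        static int open_trace_files(void)    /* hypothetical helper */
        {
                int i, nprocs = get_nprocs_conf();  /* runtime bound */

                if (nprocs > MAX_NR_CPUS)
                        return -1;            /* >1K-core behemoth: report and bail */
                for (i = 0; i < nprocs; i++)
                        relay_fd[i] = -1;     /* then try opening trace%d for each */
                return 0;
        }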
diff --git a/staprun/relay.c b/staprun/relay.c
b8e312
index c76e76719..3eb8df34b 100644
b8e312
--- a/staprun/relay.c
b8e312
+++ b/staprun/relay.c
b8e312
@@ -12,16 +12,16 @@
b8e312
 
b8e312
 #include "staprun.h"
b8e312
 
b8e312
-int out_fd[NR_CPUS];
b8e312
+int out_fd[MAX_NR_CPUS];
b8e312
 int monitor_end = 0;
b8e312
-static pthread_t reader[NR_CPUS];
b8e312
-static int relay_fd[NR_CPUS];
b8e312
-static int avail_cpus[NR_CPUS];
b8e312
-static int switch_file[NR_CPUS];
b8e312
-static pthread_mutex_t mutex[NR_CPUS];
b8e312
+static pthread_t reader[MAX_NR_CPUS];
b8e312
+static int relay_fd[MAX_NR_CPUS];
b8e312
+static int avail_cpus[MAX_NR_CPUS];
b8e312
+static int switch_file[MAX_NR_CPUS];
b8e312
+static pthread_mutex_t mutex[MAX_NR_CPUS];
b8e312
 static int bulkmode = 0;
b8e312
 static volatile int stop_threads = 0;
b8e312
-static time_t *time_backlog[NR_CPUS];
b8e312
+static time_t *time_backlog[MAX_NR_CPUS];
b8e312
 static int backlog_order=0;
b8e312
 #define BACKLOG_MASK ((1 << backlog_order) - 1)
b8e312
 #define MONITORLINELENGTH 4096
b8e312
@@ -313,12 +313,19 @@ int init_relayfs(void)
b8e312
 	if (send_request(STP_BULK, rqbuf, sizeof(rqbuf)) == 0)
b8e312
 		bulkmode = 1;
b8e312
 
b8e312
-	/* Try to open a slew of per-cpu trace%d files.  Per PR19241, we
b8e312
-	   need to go through all potentially present CPUs up to NR_CPUS, that
b8e312
-	   we hope is a reasonable limit.  For !bulknode, "trace0" will be
b8e312
-	   typically used. */
b8e312
+	/* Try to open a slew of per-cpu trace%d files.  Per PR19241,
b8e312
+	   we need to go through all potentially present CPUs up to
b8e312
+	   get_nprocs_conf(), up to MAX_NR_CPUS (used for array
b8e312
+	   allocations).  For !bulknode, "trace0" will be typically
b8e312
+	   used, prior to systemtap 4.5; after, all are used. */
b8e312
 
b8e312
-	for (i = 0; i < NR_CPUS; i++) {
b8e312
+        int nprocs = get_nprocs_conf();
b8e312
+        if (nprocs > MAX_NR_CPUS) {
b8e312
+                err("Too many CPUs: get_nprocs_conf()=%d vs MAX_NR_CPUS=%d\n", nprocs, MAX_NR_CPUS);
b8e312
+                return -1;
b8e312
+        }
b8e312
+        
b8e312
+	for (i = 0; i < nprocs; i++) {
b8e312
                 relay_fd[i] = -1;
b8e312
 
b8e312
 #ifdef HAVE_OPENAT
b8e312
@@ -348,7 +355,8 @@ int init_relayfs(void)
b8e312
 		}
b8e312
 	}
b8e312
 	ncpus = cpui;
b8e312
-	dbug(2, "ncpus=%d, bulkmode = %d\n", ncpus, bulkmode);
b8e312
+        /* ncpus could be smaller than nprocs if some cpus are offline */
b8e312
+	dbug(2, "ncpus=%d, nprocs=%d, bulkmode=%d\n", ncpus, nprocs, bulkmode);
b8e312
 	for (i = 0; i < ncpus; i++)
b8e312
 		dbug(2, "cpui=%d, relayfd=%d\n", i, avail_cpus[i]);
b8e312
 
b8e312
diff --git a/staprun/relay_old.c b/staprun/relay_old.c
b8e312
index f0d2e918f..248e6059d 100644
b8e312
--- a/staprun/relay_old.c
b8e312
+++ b/staprun/relay_old.c
b8e312
@@ -14,12 +14,12 @@
b8e312
 #include "staprun.h"
b8e312
 
b8e312
 /* temporary per-cpu output written here for relayfs, filebase0...N */
b8e312
-static int relay_fd[NR_CPUS];
b8e312
-static int proc_fd[NR_CPUS];
b8e312
-static FILE *percpu_tmpfile[NR_CPUS];
b8e312
-static char *relay_buffer[NR_CPUS];
b8e312
-static pthread_t reader[NR_CPUS];
b8e312
-static int switch_file[NR_CPUS];
b8e312
+static int relay_fd[MAX_NR_CPUS];
b8e312
+static int proc_fd[MAX_NR_CPUS];
b8e312
+static FILE *percpu_tmpfile[MAX_NR_CPUS];
b8e312
+static char *relay_buffer[MAX_NR_CPUS];
b8e312
+static pthread_t reader[MAX_NR_CPUS];
b8e312
+static int switch_file[MAX_NR_CPUS];
b8e312
 static int bulkmode = 0;
b8e312
 unsigned subbuf_size = 0;
b8e312
 unsigned n_subbufs = 0;
b8e312
@@ -37,7 +37,7 @@ static struct buf_status
b8e312
 {
b8e312
 	struct _stp_buf_info info;
b8e312
 	unsigned max_backlog; /* max # sub-buffers ready at one time */
b8e312
-} status[NR_CPUS];
b8e312
+} status[MAX_NR_CPUS];
b8e312
 
b8e312
 
b8e312
 /**
b8e312
@@ -461,7 +461,13 @@ int init_oldrelayfs(void)
b8e312
 	relay_fd[0] = -1;
b8e312
 	out_fd[0] = 0;
b8e312
 
b8e312
-	for (i = 0; i < NR_CPUS; i++) {
b8e312
+        int nprocs = get_nprocs_conf();
b8e312
+        if (nprocs > MAX_NR_CPUS) {
b8e312
+                err("Too many CPUs: get_nprocs_conf()=%d vs MAX_NR_CPUS=%d\n", nprocs, MAX_NR_CPUS);
b8e312
+                goto err;
b8e312
+        }
b8e312
+        
b8e312
+	for (i = 0; i < nprocs; i++) {
b8e312
 		int ret = open_relayfs_files(i, relay_filebase, proc_filebase);
b8e312
 		if (ret == 0)
b8e312
 			break;
b8e312
@@ -472,7 +478,8 @@ int init_oldrelayfs(void)
b8e312
 	}
b8e312
 
b8e312
 	ncpus = i;
b8e312
-	dbug(2, "ncpus=%d\n", ncpus);
b8e312
+        /* ncpus could be smaller than nprocs if some cpus are offline */
b8e312
+	dbug(2, "ncpus=%d, nprocs=%d\n", ncpus, nprocs);
b8e312
 
b8e312
 	if (ncpus == 0) {
b8e312
 		err("Couldn't open relayfs files.\n");
b8e312
diff --git a/staprun/stap_merge.c b/staprun/stap_merge.c
b8e312
index 7507f0e3d..87de7d465 100644
b8e312
--- a/staprun/stap_merge.c
b8e312
+++ b/staprun/stap_merge.c
b8e312
@@ -31,15 +31,15 @@ static void usage (char *prog)
b8e312
 }
b8e312
 
b8e312
 #define TIMESTAMP_SIZE (sizeof(int))
b8e312
-#define NR_CPUS 256
b8e312
+#define MAX_NR_CPUS 1024
b8e312
 
b8e312
 int main (int argc, char *argv[])
b8e312
 {
b8e312
 	char *buf, *outfile_name = NULL;
b8e312
 	int c, i, j, rc, dropped=0;
b8e312
-	long count=0, min, num[NR_CPUS] = { 0 };
b8e312
+	long count=0, min, num[MAX_NR_CPUS] = { 0 };
b8e312
 	FILE *ofp = NULL;
b8e312
-	FILE *fp[NR_CPUS] = { 0 };
b8e312
+	FILE *fp[MAX_NR_CPUS] = { 0 };
b8e312
 	int ncpus, len, verbose = 0;
b8e312
 	int bufsize = 65536;
b8e312
 
b8e312
@@ -67,6 +67,10 @@ int main (int argc, char *argv[])
b8e312
 
b8e312
 	i = 0;
b8e312
 	while (optind < argc) {
b8e312
+                if (i >= MAX_NR_CPUS) {
b8e312
+                        fprintf(stderr, "too many files (MAX_NR_CPUS=%d)\n", MAX_NR_CPUS);
b8e312
+			return -1;
b8e312
+		}                  
b8e312
 		fp[i] = fopen(argv[optind++], "r");
b8e312
 		if (!fp[i]) {
b8e312
 			fprintf(stderr, "error opening file %s.\n", argv[optind - 1]);
b8e312
diff --git a/staprun/staprun.h b/staprun/staprun.h
b8e312
index e05dbe5b6..2d68bf527 100644
b8e312
--- a/staprun/staprun.h
b8e312
+++ b/staprun/staprun.h
b8e312
@@ -37,6 +37,7 @@
b8e312
 #include <sys/wait.h>
b8e312
 #include <sys/statfs.h>
b8e312
 #include <syslog.h>
b8e312
+#include <sys/sysinfo.h>
b8e312
 
b8e312
 /* Include config.h to pick up dependency for --prefix usage. */
b8e312
 #include "../config.h"
b8e312
@@ -285,10 +286,10 @@ extern int optopt;
b8e312
 extern int optind;
b8e312
 
b8e312
 /* maximum number of CPUs we can handle */
b8e312
-#define NR_CPUS 256
b8e312
+#define MAX_NR_CPUS 1024
b8e312
 
b8e312
 /* relay*.c uses these */
b8e312
-extern int out_fd[NR_CPUS];
b8e312
+extern int out_fd[MAX_NR_CPUS];
b8e312
 
b8e312
 /* relay_old uses these. Set in ctl.c */
b8e312
 extern unsigned subbuf_size;
b8e312
commit b4b5a29b51586f75de16cacdb44bdf0b3ad0478e
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Mon Dec 14 13:20:34 2020 -0800
b8e312
b8e312
    staprun: use the correct out_fd when bulkmode and fsize_max aren't used
b8e312
    
b8e312
    When bulkmode and fsize_max aren't used, there is only one output fd and
b8e312
    it is stored at out_fd[avail_cpus[0]].
b8e312
b8e312
diff --git a/staprun/relay.c b/staprun/relay.c
b8e312
index 3eb8df34b..d0202e52f 100644
b8e312
--- a/staprun/relay.c
b8e312
+++ b/staprun/relay.c
b8e312
@@ -232,10 +232,17 @@ static void *reader_thread(void *data)
b8e312
 					wbuf += bytes;
b8e312
 					wsize += bytes;
b8e312
 				} else {
b8e312
-	                                rc = write(out_fd[cpu], wbuf, wbytes);
b8e312
+					int fd;
b8e312
+					/* Only bulkmode and fsize_max use per-cpu output files. Otherwise,
b8e312
+					   there's just a single output fd stored at out_fd[avail_cpus[0]]. */
b8e312
+					if (bulkmode || fsize_max)
b8e312
+						fd = out_fd[cpu];
b8e312
+					else
b8e312
+						fd = out_fd[avail_cpus[0]];
b8e312
+	                                rc = write(fd, wbuf, wbytes);
b8e312
 	                                if (rc <= 0) {
b8e312
 						perr("Couldn't write to output %d for cpu %d, exiting.",
b8e312
-	                                             out_fd[cpu], cpu);
b8e312
+	                                             fd, cpu);
b8e312
 	                                        goto error_out;
b8e312
 	                                }
b8e312
 	                                wbytes -= rc;
b8e312
commit b26b4e2c257e0bd65134eed5e51d754227a4ed3f
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 30 14:21:42 2020 -0800
b8e312
b8e312
    task_finder2: fix panics due to broken task work cancellation
b8e312
    
b8e312
    The task_work_cancel() API uses function pointers to uniquely identify
b8e312
    task work structs, so there's no guarantee that a specific task work
b8e312
    struct we want to cancel is the one that will actually get canceled.
b8e312
    This issue would cause task work structs to be freed while they were
b8e312
    still queued up on the task's task-worker list.
b8e312
    
b8e312
    This is an example of one such panic, where the DEBUG_MEM feature
b8e312
    reported that a __stp_tf_task_work struct (56 bytes) wasn't freed,
b8e312
    because that specific task worker got canceled and instead an active
b8e312
    task worker got freed!
b8e312
    
b8e312
    orxray_resty_mem_X_35062: ERROR: Memory ffff8809ed388620 len=56 allocation type: kmalloc. Not freed.
b8e312
    BUG: unable to handle kernel paging request at ffffffffa0570877
b8e312
    IP: [<ffffffffa0570877>] 0xffffffffa0570876
b8e312
    PGD 1abd067 PUD 1abe063 PMD 1028286067 PTE 0
b8e312
    Oops: 0010 [#1] SMP
b8e312
    CPU: 3 PID: 1338 Comm: nginx Tainted: G           OE  ------------   3.10.0-514.10.2.el7.x86_64.debug #1
b8e312
    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
b8e312
    task: ffff880eae2d0000 ti: ffff880eaf2e4000 task.ti: ffff880eaf2e4000
b8e312
    RIP: 0010:[<ffffffffa0570877>]  [<ffffffffa0570877>] 0xffffffffa0570876
b8e312
    RSP: 0018:ffff880eaf2e7d78  EFLAGS: 00010282
b8e312
    RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000
b8e312
    RDX: ffff8809ed388640 RSI: 0000000000000000 RDI: ffff8809ed388640
b8e312
    RBP: ffff880eaf2e7da0 R08: 0000000000000000 R09: 0000000000000000
b8e312
    R10: 0000000000000001 R11: ffffffffff90001c R12: ffffffff8248b1c0
b8e312
    R13: ffff880eae2d0818 R14: ffff880eae2d0000 R15: 00007eff3d2490b0
b8e312
    FS:  00007eff3dcd2740(0000) GS:ffff881037c00000(0000) knlGS:0000000000000000
b8e312
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
b8e312
    CR2: ffffffffa0570877 CR3: 0000000ebce67000 CR4: 00000000003406e0
b8e312
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
b8e312
    DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
b8e312
    Stack:
b8e312
     ffffffff810c6544 ffff880eaf2e7f58 ffff880eaf2e7e70 ffff880eae2d0000
b8e312
     00007eff3dcb3338 ffff880eaf2e7e38 ffffffff810b31ba ffff880eaf2e7dc0
b8e312
     ffffffff8106c279 ffff880eaf2e7e50 ffff880ef8a792c0 ffff880eaf2e7df8
b8e312
    Call Trace:
b8e312
     [<ffffffff810c6544>] ? task_work_run+0xb4/0xe0
b8e312
     [<ffffffff810b31ba>] get_signal_to_deliver+0x85a/0x960
b8e312
     [<ffffffff8106c279>] ? kvm_sched_clock_read+0x9/0x20
b8e312
     [<ffffffff810e7b4d>] ? sched_clock_local+0x1d/0x80
b8e312
     [<ffffffff810e7dd8>] ? sched_clock_cpu+0xb8/0xe0
b8e312
     [<ffffffff810324a7>] do_signal+0x57/0x6e0
b8e312
     [<ffffffff8176dba6>] ? int_very_careful+0x5/0xd
b8e312
     [<ffffffff81032b8f>] do_notify_resume+0x5f/0xb0
b8e312
     [<ffffffff8176dbfd>] int_signal+0x12/0x17
b8e312
    Code:  Bad RIP value.
b8e312
    RIP  [<ffffffffa0570877>] 0xffffffffa0570876
b8e312
     RSP <ffff880eaf2e7d78>
b8e312
    CR2: ffffffffa0570877
b8e312
    ---[ end trace 1cdf8e5b522b246e ]---
b8e312
b8e312
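    A minimal sketch of the hazard (illustrative only; stp_task_work_add is
    assumed to be the runtime's queueing wrapper): two task-finder nodes queued
    on the same task share one worker function, so cancelling by function
    pointer may dequeue either of them.

        static void sketch_cancel_is_ambiguous(struct task_struct *tsk,
                                               struct __stp_tf_task_work *node_a,
                                               struct __stp_tf_task_work *node_b)
        {
                struct task_work *work;

                stp_task_work_add(tsk, &node_a->work);  /* .func == __stp_tf_task_worker_fn */
                stp_task_work_add(tsk, &node_b->work);  /* .func == __stp_tf_task_worker_fn */

                /* May dequeue either node's work; the old code freed the node it
                 * happened to be iterating over, which could free a struct that
                 * was still queued on the task's worker list.  The fix below uses
                 * container_of() on whatever stp_task_work_cancel() returned. */
                work = stp_task_work_cancel(tsk, __stp_tf_task_worker_fn);
                (void) work;
        }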
diff --git a/runtime/linux/task_finder2.c b/runtime/linux/task_finder2.c
b8e312
index 4e0b68f7c..ecf1f77fd 100644
b8e312
--- a/runtime/linux/task_finder2.c
b8e312
+++ b/runtime/linux/task_finder2.c
b8e312
@@ -226,10 +226,22 @@ static void __stp_tf_cancel_all_task_work(void)
b8e312
 	// Cancel all remaining requests.
b8e312
 	stp_spin_lock_irqsave(&__stp_tf_task_work_list_lock, flags);
b8e312
 	list_for_each_entry_safe(node, tmp, &__stp_tf_task_work_list, list) {
b8e312
-		if (stp_task_work_cancel(node->task, node->work.func)) {
b8e312
-			list_del(&node->list);
b8e312
-			_stp_kfree(node);
b8e312
-		}
b8e312
+		struct __stp_tf_task_work *tf_work;
b8e312
+		struct task_work *work;
b8e312
+
b8e312
+		work = stp_task_work_cancel(node->task, node->work.func);
b8e312
+		if (!work)
b8e312
+			continue;
b8e312
+
b8e312
+		/*
b8e312
+		 * There can be multiple queued task workers with the same
b8e312
+		 * worker func, so there's no guarantee that tf_work == node.
b8e312
+		 * Therefore, we can only free what stp_task_work_cancel() just
b8e312
+		 * gave us; freeing 'node' would be unsafe.
b8e312
+		 */
b8e312
+		tf_work = container_of(work, typeof(*tf_work), work);
b8e312
+		list_del(&tf_work->list);
b8e312
+		_stp_kfree(tf_work);
b8e312
 	}
b8e312
 	stp_spin_unlock_irqrestore(&__stp_tf_task_work_list_lock, flags);
b8e312
 }
b8e312
commit 96470399a5a6fba864b90afd15eda43cdc8c8ac4
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 30 15:42:11 2020 -0800
b8e312
b8e312
    task_finder2: fix list corruption in __stp_tf_cancel_all_task_work()
b8e312
    
b8e312
    The previous commit (b26b4e2c2 "task_finder2: fix panics due to broken
b8e312
    task work cancellation") made it possible for the next node in the task
b8e312
    work list to be freed, which would make list_for_each_entry_safe() not so
b8e312
    safe anymore. Using list_for_each_entry_safe() is still the fastest
b8e312
    approach here, so when the next node in the list happens to be freed, we
b8e312
    should just restart iteration on the list.
b8e312
b8e312
diff --git a/runtime/linux/task_finder2.c b/runtime/linux/task_finder2.c
b8e312
index ecf1f77fd..83fc17b5e 100644
b8e312
--- a/runtime/linux/task_finder2.c
b8e312
+++ b/runtime/linux/task_finder2.c
b8e312
@@ -225,6 +225,7 @@ static void __stp_tf_cancel_all_task_work(void)
b8e312
 
b8e312
 	// Cancel all remaining requests.
b8e312
 	stp_spin_lock_irqsave(&__stp_tf_task_work_list_lock, flags);
b8e312
+restart:
b8e312
 	list_for_each_entry_safe(node, tmp, &__stp_tf_task_work_list, list) {
b8e312
 		struct __stp_tf_task_work *tf_work;
b8e312
 		struct task_work *work;
b8e312
@@ -242,6 +243,21 @@ static void __stp_tf_cancel_all_task_work(void)
b8e312
 		tf_work = container_of(work, typeof(*tf_work), work);
b8e312
 		list_del(&tf_work->list);
b8e312
 		_stp_kfree(tf_work);
b8e312
+
b8e312
+		/*
b8e312
+		 * If the tf_work we just freed was the next node in the list,
b8e312
+		 * then we need to restart the list iteration because
b8e312
+		 * list_for_each_entry_safe() can't cope with the next node
b8e312
+		 * being freed. We still need to use list_for_each_entry_safe()
b8e312
+		 * because we need to get through one successful pass through
b8e312
+		 * the entire list, since it's not guaranteed that this list
b8e312
+		 * will be empty when this function exits, as there can still be
b8e312
+		 * active task workers running, which is fine since the
b8e312
+		 * stp_task_work API will wait for all task workers to finish
b8e312
+		 * before allowing the module to unload.
b8e312
+		 */
b8e312
+		if (tf_work == tmp)
b8e312
+			goto restart;
b8e312
 	}
b8e312
 	stp_spin_unlock_irqrestore(&__stp_tf_task_work_list_lock, flags);
b8e312
 }
b8e312
commit 6cb54128e005d1220a7b064ee42b9f72561c28e7
b8e312
Author: Sultan Alsawaf <sultan@openresty.com>
b8e312
Date:   Wed Dec 30 15:47:58 2020 -0800
b8e312
b8e312
    task_finder2: fix task worker race on module unload
b8e312
    
b8e312
    Unfortunately, __stp_tf_cancel_all_task_work() does not guarantee that
b8e312
    all of the task finder's task workers will be finished executing when it
b8e312
    returns. In this case, we rely on the stp_task_work API to prevent the
b8e312
    module from being unloaded while there are task workers in-flight, which
b8e312
    works, but the stp_task_work API is notified of a task worker finishing
b8e312
    before it actually finishes. Inside __stp_tf_task_worker_fn(), the
b8e312
    call to the task worker's function (tf_work->func) is where the final
b8e312
    refcount in the stp_task_work API could be put, but there will still be
b8e312
    instructions left in the task worker that will be executing for a short
b8e312
    time after that. In that short time, there can be a race where the
b8e312
    module is unloaded before the task worker finishes executing all of its
b8e312
    instructions, especially if the task worker gets preempted during this
b8e312
    time on a PREEMPT kernel.
b8e312
    
b8e312
    To remedy this, we must ensure that the last instruction in
b8e312
    __stp_tf_task_worker_fn() is where the stp_task_work API is notified of
b8e312
    a task worker finishing.
b8e312
b8e312
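    A minimal sketch of the ordering rule (illustrative only; the helper names
    other than stp_task_work_func_done() are hypothetical):

        static void sketch_task_worker(struct task_work *work)
        {
                do_the_probe_work(work);        /* hypothetical body */
                free_worker_bookkeeping(work);  /* hypothetical cleanup */

                /* This drops the reference that keeps the module loaded, so it
                 * must be the very last thing the worker executes; any
                 * instruction after it could run while the module is being
                 * unloaded (especially if preempted on a PREEMPT kernel). */
                stp_task_work_func_done();
        }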
diff --git a/runtime/linux/task_finder2.c b/runtime/linux/task_finder2.c
b8e312
index 83fc17b5e..2bab19295 100644
b8e312
--- a/runtime/linux/task_finder2.c
b8e312
+++ b/runtime/linux/task_finder2.c
b8e312
@@ -150,6 +150,7 @@ __stp_tf_task_worker_fn(struct task_work *work)
b8e312
 	 * workers for this task.
b8e312
 	 */
b8e312
 	__stp_tf_task_work_free(work);
b8e312
+	stp_task_work_func_done();
b8e312
 }
b8e312
 
b8e312
 static void
b8e312
@@ -1066,11 +1067,8 @@ __stp_tf_clone_worker(struct task_work *work)
b8e312
 
b8e312
 	might_sleep();
b8e312
 	if (atomic_read(&__stp_task_finder_state) != __STP_TF_RUNNING
b8e312
-	    || current->flags & PF_EXITING) {
b8e312
-		/* Remember that this task_work_func is finished. */
b8e312
-		stp_task_work_func_done();
b8e312
+	    || current->flags & PF_EXITING)
b8e312
 		return;
b8e312
-	}
b8e312
 
b8e312
 	__stp_tf_handler_start();
b8e312
 
b8e312
@@ -1085,10 +1083,6 @@ __stp_tf_clone_worker(struct task_work *work)
b8e312
 	}
b8e312
 
b8e312
 	__stp_tf_handler_end();
b8e312
-
b8e312
-	/* Remember that this task_work_func is finished. */
b8e312
-	stp_task_work_func_done();
b8e312
-	return;
b8e312
 }
b8e312
 
b8e312
 
b8e312
@@ -1392,11 +1386,8 @@ __stp_tf_quiesce_worker(struct task_work *work)
b8e312
 
b8e312
 	might_sleep();
b8e312
 	if (atomic_read(&__stp_task_finder_state) != __STP_TF_RUNNING
b8e312
-	    || current->flags & PF_EXITING) {
b8e312
-		/* Remember that this task_work_func is finished. */
b8e312
-		stp_task_work_func_done();
b8e312
+	    || current->flags & PF_EXITING)
b8e312
 		return;
b8e312
-	}
b8e312
 
b8e312
         /* If we had a build-id based executable probe (so we have a
b8e312
          * tgt->build_id) set, we could not check it back in
b8e312
@@ -1420,8 +1411,6 @@ __stp_tf_quiesce_worker(struct task_work *work)
b8e312
                           (long) current->tgid, ok);
b8e312
                 if (!ok) {
b8e312
                         // stap_utrace_detach (current, & tgt->ops);
b8e312
-                        /* Remember that this task_work_func is finished. */
b8e312
-                        stp_task_work_func_done();
b8e312
                         return;
b8e312
                 }
b8e312
         } 
b8e312
@@ -1444,10 +1433,6 @@ __stp_tf_quiesce_worker(struct task_work *work)
b8e312
 	__stp_call_callbacks(tgt, current, 1, (current->pid == current->tgid));
b8e312
 
b8e312
 	__stp_tf_handler_end();
b8e312
-
b8e312
-	/* Remember that this task_work_func is finished. */
b8e312
-	stp_task_work_func_done();
b8e312
-	return;
b8e312
 }
b8e312
 
b8e312
 static u32
b8e312
@@ -1614,18 +1599,12 @@ __stp_tf_mmap_worker(struct task_work *work)
b8e312
 
b8e312
 	// See if we can find saved syscall info.
b8e312
 	entry = __stp_tf_get_map_entry(current);
b8e312
-	if (entry == NULL) {
b8e312
-		/* Remember that this task_work_func is finished. */
b8e312
-		stp_task_work_func_done();
b8e312
+	if (entry == NULL)
b8e312
 		return;
b8e312
-	}
b8e312
 
b8e312
 	if (atomic_read(&__stp_task_finder_state) != __STP_TF_RUNNING
b8e312
 	    || current->flags & PF_EXITING) {
b8e312
 		__stp_tf_remove_map_entry(entry);
b8e312
-
b8e312
-		/* Remember that this task_work_func is finished. */
b8e312
-		stp_task_work_func_done();
b8e312
 		return;
b8e312
 	}
b8e312
 
b8e312
@@ -1650,10 +1629,6 @@ __stp_tf_mmap_worker(struct task_work *work)
b8e312
 	__stp_tf_remove_map_entry(entry);
b8e312
 
b8e312
 	__stp_tf_handler_end();
b8e312
-
b8e312
-	/* Remember that this task_work_func is finished. */
b8e312
-	stp_task_work_func_done();
b8e312
-	return;
b8e312
 }
b8e312
 
b8e312
 static u32