From 958cb3476d957b48037917f2a59659e63f456e3e Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: May 18 2021 06:49:22 +0000 Subject: import gcc-toolset-10-systemtap-4.4-5.el8 --- diff --git a/.gcc-toolset-10-systemtap.metadata b/.gcc-toolset-10-systemtap.metadata index 8c485f1..accf03e 100644 --- a/.gcc-toolset-10-systemtap.metadata +++ b/.gcc-toolset-10-systemtap.metadata @@ -1 +1 @@ -14769266f6591b85895a5f40e516f8228e83476e SOURCES/systemtap-4.3.tar.gz +f126888adda90a0ec57f43f9db20fde68c8ef356 SOURCES/systemtap-4.4.tar.gz diff --git a/.gitignore b/.gitignore index adcfb77..b8f7ea7 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -SOURCES/systemtap-4.3.tar.gz +SOURCES/systemtap-4.4.tar.gz diff --git a/SOURCES/rhbz1650594.patch b/SOURCES/rhbz1650594.patch new file mode 100644 index 0000000..fd5afa9 --- /dev/null +++ b/SOURCES/rhbz1650594.patch @@ -0,0 +1,55 @@ +commit 734f5acf6c568c02254a33730c6f0fff016bdd09 +Author: Martin Cermak +Date: Wed Jan 20 22:09:49 2021 +0100 + + systemtap-service onboot: Skip updating the bootloader + + It shows up that just modifying the default initrd is good enough, + no need to call kernel-install or new-kernel-pkg. This speeds up + the systemtap-service onboot operation. + +diff --git a/initscript/systemtap.in b/initscript/systemtap.in +index 713f7a680..b1621ae2f 100755 +--- a/initscript/systemtap.in ++++ b/initscript/systemtap.in +@@ -46,8 +46,6 @@ STAPRUN=@bindir@/staprun + UNAME=/bin/uname + LSMOD=/sbin/lsmod + DRACUT=`which dracut` +-NEWKERNELPKG=/sbin/new-kernel-pkg +-KERNELINSTALL=/usr/bin/kernel-install + + # Not actually used directly, but needed by + # stap dracut module for inclusion in initramfs +@@ -839,10 +837,6 @@ backup_initramfs() { + + onboot () { + local s ret ss +- if [ ! -f "$NEWKERNELPKG" -a ! -f "$KERNELINSTALL" ]; then +- do_failure "Could not find $NEWKERNELPKG nor $KERNELINSTALL" +- return 1 +- fi + if [ ! -f "$DRACUT" ]; then + do_failure "$DRACUT not found" + return 1 +@@ -940,20 +934,6 @@ onboot () { + return 0 + fi + clog "done" +- # We're installing the initramfs in the default location, so user +- # expects the next boot to use it. Let's also update the bootloader. +- clog " Updating bootloader ... " -n +- if [ -x "$NEWKERNELPKG" ]; then +- logex $NEWKERNELPKG --initrdfile="$INITRAMFS" \ +- --update $KRELEASE +- else +- logex $KERNELINSTALL add $KRELEASE /boot/vmlinuz-$KRELEASE +- fi +- if [ $? -ne 0 ]; then +- do_failure "bootloader update exited with nonzero status" +- return 1 +- fi +- might_success "initramfs created and bootloader updated" + return 0 + } + diff --git a/SOURCES/rhbz1847676,1857749.patch b/SOURCES/rhbz1847676,1857749.patch deleted file mode 100644 index 80721d9..0000000 --- a/SOURCES/rhbz1847676,1857749.patch +++ /dev/null @@ -1,1403 +0,0 @@ -commit 11c39a7375bd2759b53b89236e755c91a4f5aad8 -Author: Frank Ch. Eigler -Date: Tue Jun 16 20:35:53 2020 -0400 - - RHBZ1847676: uprobes-inode tweaks redux - - Added (back) a spinlock to manage the stapiu_consumer -> process_list - structure, since it is occasionally travered from uprobe pre-handlers, - which are sometimes entered in atomic context (e.g. on rhel7). There, - the normal mutex_t is unsafe. So restoring a spinlock_t just for - those shortlived traversals, rhel7 and rawhide are both happy. - -diff --git a/runtime/linux/uprobes-inode.c b/runtime/linux/uprobes-inode.c -index 156360e..922c9f1 100644 ---- a/runtime/linux/uprobes-inode.c -+++ b/runtime/linux/uprobes-inode.c -@@ -143,7 +143,8 @@ struct stapiu_consumer { - struct list_head instance_list_head; // the resulting uprobe instances for this consumer - - struct list_head process_list_head; // the processes for this consumer -- -+ spinlock_t process_list_lock; // protect list; used briefly from even atomic contexts -+ - // List of perf counters used by each probe - // This list is an index into struct stap_perf_probe, - long perf_counters_dim; -@@ -174,16 +175,19 @@ stapiu_probe_prehandler (struct uprobe_consumer *inst, struct pt_regs *regs) - - // First find the related process, set by stapiu_change_plus. - // NB: This is a linear search performed for every probe hit! -- // This could be an algorithmic problem if the list gets large, but -- // we'll wait until this is demonstratedly a hotspot before optimizing. -- mutex_lock(&c->consumer_lock); -+ // This could be an algorithmic problem if the list gets large, -+ // but we'll wait until this is demonstratedly a hotspot before -+ // optimizing. NB: on rhel7 sometimes we're invoked from atomic -+ // context, so must be careful to use the spinlock, not the -+ // mutex. -+ spin_lock(&c->process_list_lock); - list_for_each_entry(p, &c->process_list_head, process_list) { - if (p->tgid == current->tgid) { - process = p; - break; - } - } -- mutex_unlock(&c->consumer_lock); -+ spin_unlock(&c->process_list_lock); - if (!process) { - #ifdef UPROBE_HANDLER_REMOVE - /* Once we're past the starting phase, we can be sure that any -@@ -344,7 +348,7 @@ static void - stapiu_decrement_semaphores(struct stapiu_consumer *consumers, size_t nconsumers) - { - size_t i; -- /* NB: no stapiu_process_slots_lock needed, as the task_finder engine is -+ /* NB: no process_list_lock use needed as the task_finder engine is - * already stopped by now, so no one else will mess with us. We need - * to be sleepable for access_process_vm. */ - for (i = 0; i < nconsumers; ++i) { -@@ -433,7 +437,8 @@ stapiu_init(struct stapiu_consumer *consumers, size_t nconsumers) - INIT_LIST_HEAD(&c->instance_list_head); - INIT_LIST_HEAD(&c->process_list_head); - mutex_init(&c->consumer_lock); -- -+ spin_lock_init(&c->process_list_lock); -+ - dbug_uprobes("registering task-finder for procname:%s buildid:%s\n", - ((char*)c->finder.procname ?: (char*)""), - ((char*)c->finder.build_id ?: (char*)"")); -@@ -560,7 +565,9 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - * calls us in this case with relocation=offset=0, so - * we don't have to worry about it. */ - p->base = relocation - offset; -+ spin_lock (&c->process_list_lock); - list_add(&p->process_list, &c->process_list_head); -+ spin_unlock (&c->process_list_lock); - - rc = 0; - mutex_unlock(&c->consumer_lock); -@@ -587,28 +594,40 @@ stapiu_change_semaphore_plus(struct stapiu_consumer* c, struct task_struct *task - { - int rc = 0; - struct stapiu_process *p; -+ int any_found; - - if (! c->sdt_sem_offset) // nothing to do - return 0; - -- /* NB: no lock after this point, as we need to be sleepable for -- * get/put_user semaphore action. The given process should be frozen -- * while we're busy, so it's not an issue. -- */ -- -- mutex_lock(&c->consumer_lock); -- -+ // NB: we mustn't hold a lock while changing the task memory, -+ // but we need a lock to protect the process_list from concurrent -+ // add/delete. So hold a spinlock during iteration until the first -+ // hit, then unlock & process. NB: We could in principle have multiple -+ // instances of the same process in the list (e.g., if the process -+ // somehow maps in the same solib multiple times). We can't easily -+ // both iterate this list (in a spinlock-protected safe way), and -+ // relax the spinlock enough to do a safe stapiu_write_task_semaphore() -+ // call within the loop. So we will hit only the copy in our list. -+ any_found = 0; -+ spin_lock(&c->process_list_lock); - /* Look through all the consumer's processes and increment semaphores. */ - list_for_each_entry(p, &c->process_list_head, process_list) { - unsigned long addr = p->base + c->sdt_sem_offset; - if (addr >= relocation && addr < relocation + length) { -- int rc2 = stapiu_write_task_semaphore(task, addr, +1); -+ int rc2; -+ // unlock list and process write for this entry -+ spin_unlock(&c->process_list_lock); -+ any_found=1; -+ rc2 = stapiu_write_task_semaphore(task, addr, +1); - if (!rc) -- rc = rc2; -+ rc = rc2; -+ break; // exit list_for_each loop - } - } -- -- mutex_unlock(&c->consumer_lock); -+ if (! any_found) -+ spin_unlock(&c->process_list_lock); -+ else -+ ; // already unlocked - - return rc; - } -@@ -635,8 +654,9 @@ stapiu_change_minus(struct stapiu_consumer* c, struct task_struct *task, - // process is dying anyway - // - the stapiu_consumer's process_list linked list will have a record - // of the dead process: well, not great, it'll be cleaned up eventually, -- // and cleaning it up NOW is tricky - need some spin lock to protect the list, -- // but not out sleepy mutex: -+ // and cleaning it up NOW is tricky - we could use the process_list_lock -+ // to protect the list (as done in stapiu_change_semaphore_plus), -+ // but not our sleepy mutex: - // - // [ 1955.410237] ? stapiu_change_minus+0x38/0xf0 [stap_54a723c01c50d972590a5c901516849_15522] - // [ 1955.411583] __mutex_lock+0x35/0x820 - -commit 4ccdfe4536d702612912e96d7b6278b169917eaa -Author: Frank Ch. Eigler -Date: Mon Jul 6 13:27:46 2020 -0400 - - RHBZ1847676 cont'd: more uprobes-inode/onthefly concurrency controls - - The systemtap.onthefly/*.exp tests had recently become hang-prone on - some kernels, for reasons still not completely understood. This set - of patches adds: - - - irq*-block spinlocks into uprobes-invoked paths, in case there is - peculiar reentrancy (from irq-related tracepoints) - - - a mutex lock/unlock into the stapiu_exit() path, in case there is - a concurrent stapiu_refresh() invoked by onthefly machinery around - exit time - - - restrictions into the onthefly module_refresh() translator code to - preclude STAP_SESSION_STOPPING as a time to do any sort of refresh - operation. Now probes that were disarmed will stay disarmed during - probe-end/error/etc. processing, which is always valid with the - spec, and avoids a class of late module-refresh ops - - Testing on rhel7 and rawhide indicates the reproducible hang is gone. - Our testsuite already tortures this code; invoke by hand via: - - % sudo make installcheck RUNTESTFLAGS="-v affection.exp hrtimer_onthefly.exp kprobes_onthefly.exp tracepoint_onthefly.exp uprobes_onthefly.exp" - -diff --git a/runtime/linux/uprobes-inode.c b/runtime/linux/uprobes-inode.c -index 922c9f1..3de7281 100644 ---- a/runtime/linux/uprobes-inode.c -+++ b/runtime/linux/uprobes-inode.c -@@ -172,6 +172,7 @@ stapiu_probe_prehandler (struct uprobe_consumer *inst, struct pt_regs *regs) - if (_stp_target) // need we filter by pid at all? - { - struct stapiu_process *p, *process = NULL; -+ unsigned long flags; - - // First find the related process, set by stapiu_change_plus. - // NB: This is a linear search performed for every probe hit! -@@ -180,14 +181,14 @@ stapiu_probe_prehandler (struct uprobe_consumer *inst, struct pt_regs *regs) - // optimizing. NB: on rhel7 sometimes we're invoked from atomic - // context, so must be careful to use the spinlock, not the - // mutex. -- spin_lock(&c->process_list_lock); -+ spin_lock_irqsave(&c->process_list_lock, flags); - list_for_each_entry(p, &c->process_list_head, process_list) { - if (p->tgid == current->tgid) { - process = p; - break; - } - } -- spin_unlock(&c->process_list_lock); -+ spin_unlock_irqrestore(&c->process_list_lock, flags); - if (!process) { - #ifdef UPROBE_HANDLER_REMOVE - /* Once we're past the starting phase, we can be sure that any -@@ -398,7 +399,7 @@ static void - stapiu_consumer_refresh(struct stapiu_consumer *c) - { - struct stapiu_instance *inst; -- -+ - mutex_lock(& c->consumer_lock); - - list_for_each_entry(inst, &c->instance_list_head, instance_list) { -@@ -420,7 +421,10 @@ stapiu_exit(struct stapiu_consumer *consumers, size_t nconsumers) - stapiu_decrement_semaphores(consumers, nconsumers); - for (i = 0; i < nconsumers; ++i) { - struct stapiu_consumer *c = &consumers[i]; -+ // protect against conceivable stapiu_refresh() at same time -+ mutex_lock(& c->consumer_lock); - stapiu_consumer_unreg(c); -+ mutex_unlock(& c->consumer_lock); - /* NB: task_finder needs no unregister. */ - } - } -@@ -480,6 +484,7 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - struct stapiu_instance *inst = NULL; - struct stapiu_process *p; - int j; -+ unsigned long flags; - - if (! inode) { - rc = -EINVAL; -@@ -565,9 +570,9 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - * calls us in this case with relocation=offset=0, so - * we don't have to worry about it. */ - p->base = relocation - offset; -- spin_lock (&c->process_list_lock); -+ spin_lock_irqsave (&c->process_list_lock, flags); - list_add(&p->process_list, &c->process_list_head); -- spin_unlock (&c->process_list_lock); -+ spin_unlock_irqrestore (&c->process_list_lock, flags); - - rc = 0; - mutex_unlock(&c->consumer_lock); -@@ -595,6 +600,7 @@ stapiu_change_semaphore_plus(struct stapiu_consumer* c, struct task_struct *task - int rc = 0; - struct stapiu_process *p; - int any_found; -+ unsigned long flags; - - if (! c->sdt_sem_offset) // nothing to do - return 0; -@@ -609,14 +615,14 @@ stapiu_change_semaphore_plus(struct stapiu_consumer* c, struct task_struct *task - // relax the spinlock enough to do a safe stapiu_write_task_semaphore() - // call within the loop. So we will hit only the copy in our list. - any_found = 0; -- spin_lock(&c->process_list_lock); -+ spin_lock_irqsave(&c->process_list_lock, flags); - /* Look through all the consumer's processes and increment semaphores. */ - list_for_each_entry(p, &c->process_list_head, process_list) { - unsigned long addr = p->base + c->sdt_sem_offset; - if (addr >= relocation && addr < relocation + length) { - int rc2; - // unlock list and process write for this entry -- spin_unlock(&c->process_list_lock); -+ spin_unlock_irqrestore(&c->process_list_lock, flags); - any_found=1; - rc2 = stapiu_write_task_semaphore(task, addr, +1); - if (!rc) -@@ -625,7 +631,7 @@ stapiu_change_semaphore_plus(struct stapiu_consumer* c, struct task_struct *task - } - } - if (! any_found) -- spin_unlock(&c->process_list_lock); -+ spin_unlock_irqrestore(&c->process_list_lock, flags); - else - ; // already unlocked - -diff --git a/translate.cxx b/translate.cxx -index 10b3d32..b10af5a 100644 ---- a/translate.cxx -+++ b/translate.cxx -@@ -2144,19 +2144,13 @@ c_unparser::emit_module_refresh () - o->newline() << "mutex_lock(&module_refresh_mutex);"; - - /* If we're not in STARTING/RUNNING state, don't try doing any work. -- PR16766 */ -+ PR16766. We don't want to run refresh ops during e.g. STOPPING, -+ so as to possibly activate uprobes near shutdown. */ - o->newline() << "state = atomic_read (session_state());"; -- o->newline() << "if (state != STAP_SESSION_RUNNING && state != STAP_SESSION_STARTING && state != STAP_SESSION_ERROR) {"; -- // cannot _stp_warn etc. since we're not in probe context -- o->newline(1) << "#if defined(__KERNEL__)"; -- o->newline() << "if (state != STAP_SESSION_STOPPING)"; -- o->newline(1) << "printk (KERN_ERR \"stap module notifier triggered in unexpected state %d\\n\", state);"; -- o->indent(-1); -- o->newline() << "#endif"; -- -+ o->newline() << "if (state != STAP_SESSION_RUNNING && state != STAP_SESSION_STARTING) {"; -+ o->newline(1); - if (!session->runtime_usermode_p()) - o->newline() << "mutex_unlock(&module_refresh_mutex);"; -- - o->newline() << "return;"; - o->newline(-1) << "}"; - - -commit 046fa017d2ab7fea1a4ba2295c31f768c072855e -Author: Frank Ch. Eigler -Date: Sun Jul 12 09:57:15 2020 -0400 - - RHBZ1847676 cont'd: one more uprobes-inode/onthefly concurrency control - - In uprobes-inode.c (stapiu_change_plus), the runtime can react to - arrivals of new mappings of a solib or executable by registering new - uprobes. Due to an assumption that this could not happen at - inconvenient times (such as a stapiu_refresh or near shutdown times), - the actual uprobes registration operation was done outside the - consumer_lock mutex being held. But it appears this can happen at bad - times, so the mutex needs to be held, just like within - stapiu_consumer_refresh(). - - The onthefly tests now survive iterating testing on rawhide+lockdep - and rhel7+lockdep. - -diff --git a/runtime/linux/uprobes-inode.c b/runtime/linux/uprobes-inode.c -index 3de7281..01c8a07 100644 ---- a/runtime/linux/uprobes-inode.c -+++ b/runtime/linux/uprobes-inode.c -@@ -575,12 +575,10 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - spin_unlock_irqrestore (&c->process_list_lock, flags); - - rc = 0; -- mutex_unlock(&c->consumer_lock); -- - // Register actual uprobe if cond_enabled right now - if (c->probe->cond_enabled) - (void) stapiu_register(inst, c); -- goto out; -+ goto out1; - - out2: - _stp_kfree(inst); - -commit a9a0131eb59e8abc197d3d2a553a86bcdec3dd70 -Author: Frank Ch. Eigler -Date: Fri Jul 17 22:33:04 2020 -0400 - - rhbz1857749: uprobes-inode regression in sdt semaphore setting - - Previous code neglected to set sdt.h semaphores for more than the - first process systemtap happened to encounter. This was from a - mistaken understanding of what it meant for stapiu_change_plus() to be - called with the same inode/consumer combination. Even though uprobes - are automatically shared, each new process still needs its perfctr and - sdt-semaphores individually set, so we do that now (as before the - rework of this code). Mechanized testing incoming shortly. - -diff --git a/runtime/linux/uprobes-inode.c b/runtime/linux/uprobes-inode.c -index 01c8a07..de81839 100644 ---- a/runtime/linux/uprobes-inode.c -+++ b/runtime/linux/uprobes-inode.c -@@ -190,6 +190,10 @@ stapiu_probe_prehandler (struct uprobe_consumer *inst, struct pt_regs *regs) - } - spin_unlock_irqrestore(&c->process_list_lock, flags); - if (!process) { -+ /* We know that we're in -c/-x mode, but this process is not -+ in the process hierarchy, so the uprobe should be ignored -+ and future hits prevented. PR15278 -+ */ - #ifdef UPROBE_HANDLER_REMOVE - /* Once we're past the starting phase, we can be sure that any - * processes which are executing code in a mapping have already -@@ -242,8 +246,8 @@ stapiu_register (struct stapiu_instance* inst, struct stapiu_consumer* c) - (unsigned long) inst->inode->i_ino, - (void*) (uintptr_t) c->offset, - c->probe->index, -- ((char*)c->finder.procname ?: (char*)""), -- ((char*)c->finder.build_id ?: (char*)"")); -+ ((char*)c->finder.procname ?: ((char*)c->solib_pathname ?: "")), -+ ((char*)c->finder.build_id ?: ((char*)c->solib_build_id ?: ""))); - - if (!c->return_p) { - inst->kconsumer.handler = stapiu_probe_prehandler; -@@ -444,8 +448,8 @@ stapiu_init(struct stapiu_consumer *consumers, size_t nconsumers) - spin_lock_init(&c->process_list_lock); - - dbug_uprobes("registering task-finder for procname:%s buildid:%s\n", -- ((char*)c->finder.procname ?: (char*)""), -- ((char*)c->finder.build_id ?: (char*)"")); -+ ((char*)c->finder.procname ?: ""), -+ ((char*)c->finder.build_id ?: "")); - - ret = stap_register_task_finder_target(&c->finder); - if (ret != 0) { -@@ -499,22 +503,22 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - if (rc) - goto out; - -- dbug_uprobes("notified for inode-offset u%sprobe " -+ dbug_uprobes("notified for inode-offset arrival u%sprobe " - "%lu:%p pidx %zu target procname:%s buildid:%s\n", - c->return_p ? "ret" : "", - (unsigned long) inode->i_ino, - (void*) (uintptr_t) c->offset, - c->probe->index, -- ((char*)c->finder.procname ?: (char*)""), -- ((char*)c->finder.build_id ?: (char*)"")); -+ ((char*)c->finder.procname ?: ((char*)c->solib_pathname ?: "")), -+ ((char*)c->finder.build_id ?: ((char*)c->solib_build_id ?: ""))); - - /* Check the buildid of the target (if we haven't already). We - * lock the target so we don't have concurrency issues. */ - mutex_lock(&c->consumer_lock); - -- // Check if we already have an instance for this inode, as though we -- // were called twice by task-finder mishap, or (hypothetically) the -- // shlib was mmapped twice. -+ // Check if we already have an instance for this inode. This is normal: -+ // if a different process maps the same solib, or forks into the same -+ // executable. In this case, we must not re-register the same uprobe. - list_for_each_entry(i, &c->instance_list_head, instance_list) { - if (i->inode == inode) { - inst = i; -@@ -522,28 +526,33 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - } - } - -- if (inst) { // wouldn't expect a re-notification -- if (inst->registered_p != c->probe->cond_enabled) -- // ... this should not happen -- ; -- goto out1; -- } -- -- // Normal case: need a new one. -- inst = _stp_kzalloc(sizeof(struct stapiu_instance)); -- if (! inst) { -- rc = -ENOMEM; -- goto out1; -- } -+ if (!inst) { // new instance; need new uprobe etc. -+ // Normal case: need a new one. -+ inst = _stp_kzalloc(sizeof(struct stapiu_instance)); -+ if (! inst) { -+ rc = -ENOMEM; -+ goto out1; -+ } - -- inst->sconsumer = c; // back link essential; that's how we go from uprobe *handler callback -+ inst->sconsumer = c; // back link essential; that's how we go from uprobe *handler callback -+ -+ /* Grab the inode first (to prevent TOCTTOU problems). */ -+ inst->inode = igrab(inode); -+ if (!inst->inode) { -+ rc = -EINVAL; -+ goto out2; -+ } -+ -+ // Add the inode/instance to the list -+ list_add(&inst->instance_list, &c->instance_list_head); - -- /* Grab the inode first (to prevent TOCTTOU problems). */ -- inst->inode = igrab(inode); -- if (!inst->inode) { -- rc = -EINVAL; -- goto out2; -+ // Register the actual uprobe if cond_enabled already -+ if (c->probe->cond_enabled) -+ (void) stapiu_register(inst, c); - } -+ -+ // ... but we may have to do per-process work anyway: perfctr -+ // initialization and sdt.h semaphore manipulation! - - // Perform perfctr registration if required - for (j=0; j < c->perf_counters_dim; j++) { -@@ -551,12 +560,10 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - (void) _stp_perf_read_init ((c->perf_counters)[j], task); - } - -- // Add the inode/instance to the list -- list_add(&inst->instance_list, &c->instance_list_head); -- - // Associate this consumer with this process. If we encounter - // resource problems here, we don't really have to undo the uprobe -- // registrations etc. already in effect. -+ // registrations etc. already in effect. It may break correct -+ // tracking of process hierarchy in -c/-x operation, but too bad. - p = _stp_kzalloc(sizeof(struct stapiu_process)); - if (! p) { - rc = -ENOMEM; -@@ -573,11 +580,10 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - spin_lock_irqsave (&c->process_list_lock, flags); - list_add(&p->process_list, &c->process_list_head); - spin_unlock_irqrestore (&c->process_list_lock, flags); -- -+ // NB: actual semaphore value bumping is done later -+ - rc = 0; - // Register actual uprobe if cond_enabled right now -- if (c->probe->cond_enabled) -- (void) stapiu_register(inst, c); - goto out1; - - out2: -@@ -617,11 +623,21 @@ stapiu_change_semaphore_plus(struct stapiu_consumer* c, struct task_struct *task - /* Look through all the consumer's processes and increment semaphores. */ - list_for_each_entry(p, &c->process_list_head, process_list) { - unsigned long addr = p->base + c->sdt_sem_offset; -+ if (p->tgid != task->tgid) // skip other processes in the list -+ continue; - if (addr >= relocation && addr < relocation + length) { - int rc2; - // unlock list and process write for this entry - spin_unlock_irqrestore(&c->process_list_lock, flags); - any_found=1; -+ -+ dbug_uprobes("incrementing semaphore (u%sprobe) pid %ld " -+ "pidx %zu address %lx\n", -+ c->return_p ? "ret" : "", -+ (long) task->tgid, -+ c->probe->index, -+ (unsigned long) addr); -+ - rc2 = stapiu_write_task_semaphore(task, addr, +1); - if (!rc) - rc = rc2; -@@ -641,15 +657,8 @@ stapiu_change_semaphore_plus(struct stapiu_consumer* c, struct task_struct *task - * about the semaphores, so we can just release the process slot. */ - static int - stapiu_change_minus(struct stapiu_consumer* c, struct task_struct *task, -- unsigned long relocation, unsigned long length) -+ unsigned long addr, unsigned long length) - { -- dbug_uprobes("notified for inode-offset departure u%sprobe " -- "pidx %zu target procname:%s buildid:%s\n", -- c->return_p ? "ret" : "", -- c->probe->index, -- ((char*)c->finder.procname ?: (char*)""), -- ((char*)c->finder.build_id ?: (char*)"")); -- - // We don't need do anything really. - // A process going away means: - // - its uprobes will no longer fire: no problem, the uprobe inode -@@ -674,6 +683,36 @@ stapiu_change_minus(struct stapiu_consumer* c, struct task_struct *task, - // [ 1955.436334] ? __x64_sys_execve+0x27/0x30 - // [ 1955.437700] ? do_syscall_64+0x5c/0xa0 - -+ // But as an optimization - to avoid having them build up indefinitely, -+ // and make semaphore operations go slowly, we will nuke matching entries anyway. -+ unsigned long flags; -+ struct stapiu_process *p, *tmp; -+ unsigned nmatch=0; -+ -+ spin_lock_irqsave(&c->process_list_lock, flags); -+ list_for_each_entry_safe(p, tmp, &c->process_list_head, process_list) { -+ // we nuke by matching semaphore address (where ..._semaphore_plus wrote) -+ // against the address range being unmapped -+ unsigned long semaddr = p->base + c->sdt_sem_offset; -+ if (p->tgid != task->tgid) // skip other processes in the list -+ continue; -+ if (semaddr >= addr && semaddr < addr + length) { -+ list_del(&p->process_list); -+ _stp_kfree (p); -+ nmatch ++; -+ } -+ } -+ spin_unlock_irqrestore(&c->process_list_lock, flags); -+ -+ if (nmatch > 0) -+ dbug_uprobes("notified for inode-offset departure u%sprobe " -+ "pidx %zu matches:%u procname:%s buildid:%s\n", -+ c->return_p ? "ret" : "", -+ c->probe->index, -+ nmatch, -+ ((char*)c->finder.procname ?: ((char*)c->solib_pathname ?: "")), -+ ((char*)c->finder.build_id ?: ((char*)c->solib_build_id ?: ""))); -+ - return 0; - } - - -commit e90530877ee21cffa2a9d53567ba5b5de1dd9b32 -Author: Frank Ch. Eigler -Date: Mon Jul 27 07:58:30 2020 -0400 - - PR25568 / RHBZ1857749: buildid/uprobes/inode rework, task_finder etc. side - - During work on a new stress tests for build-id based probes (coming in - next commit), it was found that the task_finder2 logic for buildid - verification didn't, well, work, because it was never run (due to an - erroneous pathlen conditional), and couldn't be safely run where it - was (because it was under spinlock but would have done - access_process_vm). Reworked the relevant bits of task_finder2 to - perform build-id verification for processes later - during the quiesce - callback periods. (Buildid verification for solibs is already done - in the task_finder2 consumer uprobes-inode.c.) - - Testing with sdt_misc indicated a case where a preexisting process - (with solib sdt.h semaphores) was being attached to by a new stap - binary. task_finder2's enumeration of the preexising processes' - memory map segments violated assumptions by recent code related to - tracking in stapiu_process[] lists. (It did not mirror the temporal - ld.so mmap sequence.) Changed this tracking to use the inode* as the - key, and stop trying to track mapping lengths, to make positive - matches and eliminate duplicate stapiu_process[] entries for the same - (process,solib) permutation. Reworked stapiu_process[] accumulation - generally to move to the two immediate task_finder callbacks, out of - stapiu_change_plus(). - - Added lots of commentary and diagnostics throughout. stap - -DDEBUG_UPROBES give meaningful info about uprobes & sdt semaphores; - with -DDEBUG_TASK_FINDER, more but not overwhelming relevant info - appears. - -diff --git a/runtime/linux/task_finder2.c b/runtime/linux/task_finder2.c -index 9777efb..8b8057a 100644 ---- a/runtime/linux/task_finder2.c -+++ b/runtime/linux/task_finder2.c -@@ -652,8 +652,8 @@ __verify_build_id(struct task_struct *tsk, unsigned long addr, - tsk_build_id[build_id_len] = '\0'; - - if (strcmp(build_id, tsk_build_id)) { -- dbug_task(2, "target build-id not matched: [%s] != [%s]\n", -- build_id, tsk_build_id); -+ dbug_task(2, "target build-id not matched: [%s] @ 0x%lx != [%s]\n", -+ build_id, addr, tsk_build_id); - return false; - } - -@@ -884,16 +884,9 @@ __stp_utrace_attach_match_filename(struct task_struct *tsk, - // procname/build-id and match an "all thread" probe. - if (tgt == NULL) - continue; -- /* buildid-based target */ -- else if (tgt->build_id_len > 0 && tgt->procname > 0 -- && !__verify_build_id(tsk, -- tgt->build_id_vaddr, -- tgt->build_id, -- tgt->build_id_len)) -- { -- continue; -- } -- else if (tgt->build_id_len == 0 && tgt->pathlen > 0 -+ /* buildid-based target ... gets checked in __stp_tf_quiesce_worker */ -+ /* procname-based target */ -+ else if (tgt->pathlen > 0 - && (tgt->pathlen != filelen - || strcmp(tgt->procname, filename) != 0)) - { -@@ -1341,6 +1334,34 @@ __stp_tf_quiesce_worker(struct task_work *work) - return; - } - -+ /* If we had a build-id based executable probe (so we have a -+ * tgt->build_id) set, we could not check it back in -+ * __stp_utrace_attach_* because we can't do sleepy -+ * access_process_vm() calls from there. BUt now that we're -+ * in process context, quiesced, finally we can check. If we -+ * were build-id based, and the build-id does not match, then -+ * we UTRACE_DETACH from this process and skip the callbacks. -+ * -+ * XXX: For processes that do match, we redo this check every -+ * time this callbacks is encountered somehow. That's -+ * probably unnecessary. -+ */ -+ if (tgt->build_id_len > 0) { -+ int ok = __verify_build_id(current, -+ tgt->build_id_vaddr, -+ tgt->build_id, -+ tgt->build_id_len); -+ -+ dbug_task(2, "verified buildid-target process pid=%ld ok=%d\n", -+ (long) current->tgid, ok); -+ if (!ok) { -+ // stap_utrace_detach (current, & tgt->ops); -+ /* Remember that this task_work_func is finished. */ -+ stp_task_work_func_done(); -+ return; -+ } -+ } -+ - __stp_tf_handler_start(); - - /* NB make sure we run mmap callbacks before other callbacks -@@ -1434,6 +1455,21 @@ __stp_utrace_task_finder_target_quiesce(u32 action, - } - } - else { -+ /* Like in __stp_tf_quiesce_worker(), verify build-id now if belated. */ -+ if (tgt->build_id_len > 0) { -+ int ok = __verify_build_id(current, -+ tgt->build_id_vaddr, -+ tgt->build_id, -+ tgt->build_id_len); -+ -+ dbug_task(2, "verified2 buildid-target process pid=%ld ok=%d\n", -+ (long) current->tgid, ok); -+ if (!ok) { -+ __stp_tf_handler_end(); -+ return UTRACE_RESUME; // NB: not _DETACH; that interferes with other engines -+ } -+ } -+ - /* NB make sure we run mmap callbacks before other callbacks - * like 'probe process.begin' handlers so that the vma tracker - * is already initialized in the latter contexts */ -@@ -1797,15 +1833,7 @@ stap_start_task_finder(void) - struct stap_task_finder_target, list); - if (tgt == NULL) - continue; -- /* buildid-based target */ -- else if (tgt->build_id_len > 0 && tgt->procname > 0 -- && !__verify_build_id(tsk, -- tgt->build_id_vaddr, -- tgt->build_id, -- tgt->build_id_len)) -- { -- continue; -- } -+ /* buildid-based target ... gets checked in __stp_tf_quiesce_worker */ - /* procname-based target */ - else if (tgt->build_id == 0 && tgt->pathlen > 0 - && (tgt->pathlen != mmpathlen -diff --git a/runtime/linux/uprobes-inode.c b/runtime/linux/uprobes-inode.c -index de81839..757da30 100644 ---- a/runtime/linux/uprobes-inode.c -+++ b/runtime/linux/uprobes-inode.c -@@ -76,7 +76,7 @@ struct stapiu_instance { - struct list_head instance_list; // to find other instances e.g. during shutdown - - struct uprobe_consumer kconsumer; // the kernel-side struct for uprobe callbacks etc. -- struct inode *inode; // XXX: refcount? -+ struct inode *inode; // refcounted - unsigned registered_p:1; // whether the this kconsumer is registered (= armed, live) - - struct stapiu_consumer *sconsumer; // whose instance are we -@@ -86,10 +86,14 @@ struct stapiu_instance { - /* A snippet to record the per-process vm where a particular - executable/solib was mapped. Used for sdt semaphore setting, and - for identifying processes of our interest (vs. disinterest) for -- uprobe hits. This object is owned by a stapiu_consumer. */ -+ uprobe hits. This object is owned by a stapiu_consumer. We use -+ the same inode* as the stapiu_instance, and have the same lifespan, -+ so don't bother separately refcount it. -+*/ - struct stapiu_process { - struct list_head process_list; // to find other processes - -+ struct inode *inode; // the inode* for solib or executable - unsigned long relocation; // the mmap'ed .text address - unsigned long base; // the address to apply sdt offsets against - pid_t tgid; // pid -@@ -392,6 +396,7 @@ stapiu_consumer_unreg(struct stapiu_consumer *c) - // multiple times in the list. Don't break after the first. - list_for_each_entry_safe(p, tmp, &c->process_list_head, process_list) { - list_del(&p->process_list); -+ // no refcount used for the inode field - _stp_kfree (p); - } - } -@@ -498,6 +503,8 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - /* Do the buildid check. NB: on F29+, offset may not equal - 0 for LOADable "R E" segments, because the read-only .note.* - stuff may have been loaded earlier, separately. PR23890. */ -+ // NB: this is not really necessary for buildid-based probes, -+ // which had this verified already. - rc = _stp_usermodule_check(task, c->module_name, - relocation - offset); - if (rc) -@@ -527,7 +534,6 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - } - - if (!inst) { // new instance; need new uprobe etc. -- // Normal case: need a new one. - inst = _stp_kzalloc(sizeof(struct stapiu_instance)); - if (! inst) { - rc = -ENOMEM; -@@ -560,30 +566,9 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - (void) _stp_perf_read_init ((c->perf_counters)[j], task); - } - -- // Associate this consumer with this process. If we encounter -- // resource problems here, we don't really have to undo the uprobe -- // registrations etc. already in effect. It may break correct -- // tracking of process hierarchy in -c/-x operation, but too bad. -- p = _stp_kzalloc(sizeof(struct stapiu_process)); -- if (! p) { -- rc = -ENOMEM; -- goto out1; -- } -- p->tgid = task->tgid; -- p->relocation = relocation; -- /* The base is used for relocating semaphores. If the -- * probe is in an ET_EXEC binary, then that offset -- * already is a real address. But stapiu_process_found -- * calls us in this case with relocation=offset=0, so -- * we don't have to worry about it. */ -- p->base = relocation - offset; -- spin_lock_irqsave (&c->process_list_lock, flags); -- list_add(&p->process_list, &c->process_list_head); -- spin_unlock_irqrestore (&c->process_list_lock, flags); -- // NB: actual semaphore value bumping is done later -+ // NB: process_list[] already extended up in stapiu_mmap_found(). - - rc = 0; -- // Register actual uprobe if cond_enabled right now - goto out1; - - out2: -@@ -599,7 +584,7 @@ stapiu_change_plus(struct stapiu_consumer* c, struct task_struct *task, - * Increment the semaphore now. */ - static int - stapiu_change_semaphore_plus(struct stapiu_consumer* c, struct task_struct *task, -- unsigned long relocation, unsigned long length) -+ unsigned long relocation, struct inode* inode) - { - int rc = 0; - struct stapiu_process *p; -@@ -609,6 +594,13 @@ stapiu_change_semaphore_plus(struct stapiu_consumer* c, struct task_struct *task - if (! c->sdt_sem_offset) // nothing to do - return 0; - -+ dbug_uprobes("considering semaphore (u%sprobe) pid %ld inode 0x%lx" -+ "pidx %zu\n", -+ c->return_p ? "ret" : "", -+ (long) task->tgid, -+ (unsigned long) inode, -+ c->probe->index); -+ - // NB: we mustn't hold a lock while changing the task memory, - // but we need a lock to protect the process_list from concurrent - // add/delete. So hold a spinlock during iteration until the first -@@ -617,32 +609,31 @@ stapiu_change_semaphore_plus(struct stapiu_consumer* c, struct task_struct *task - // somehow maps in the same solib multiple times). We can't easily - // both iterate this list (in a spinlock-protected safe way), and - // relax the spinlock enough to do a safe stapiu_write_task_semaphore() -- // call within the loop. So we will hit only the copy in our list. -+ // call within the loop. So we will hit only the first copy in our list. - any_found = 0; - spin_lock_irqsave(&c->process_list_lock, flags); - /* Look through all the consumer's processes and increment semaphores. */ - list_for_each_entry(p, &c->process_list_head, process_list) { - unsigned long addr = p->base + c->sdt_sem_offset; -- if (p->tgid != task->tgid) // skip other processes in the list -- continue; -- if (addr >= relocation && addr < relocation + length) { -- int rc2; -- // unlock list and process write for this entry -- spin_unlock_irqrestore(&c->process_list_lock, flags); -- any_found=1; -- -- dbug_uprobes("incrementing semaphore (u%sprobe) pid %ld " -- "pidx %zu address %lx\n", -- c->return_p ? "ret" : "", -- (long) task->tgid, -- c->probe->index, -- (unsigned long) addr); -- -- rc2 = stapiu_write_task_semaphore(task, addr, +1); -- if (!rc) -- rc = rc2; -- break; // exit list_for_each loop -- } -+ int rc2; -+ if (p->tgid != task->tgid) continue; // skip other processes in the list -+ if (p->inode != inode) continue; // skip other inodes -+ -+ // unlock list and process write for this entry -+ spin_unlock_irqrestore(&c->process_list_lock, flags); -+ any_found=1; -+ -+ dbug_uprobes("incrementing semaphore (u%sprobe) pid %ld " -+ "pidx %zu address 0x%lx\n", -+ c->return_p ? "ret" : "", -+ (long) task->tgid, -+ c->probe->index, -+ (unsigned long) addr); -+ -+ rc2 = stapiu_write_task_semaphore(task, addr, +1); -+ if (!rc) -+ rc = rc2; -+ break; // exit list_for_each loop - } - if (! any_found) - spin_unlock_irqrestore(&c->process_list_lock, flags); -@@ -755,17 +746,41 @@ stapiu_process_found(struct stap_task_finder_target *tf_target, - - if (!process_p) - return 0; /* ignore threads */ -- -+ -+ dbug_uprobes("process_found pid=%ld f.p=%s f.b=%s c.p=%s c.b=%s\n", -+ (long)task->tgid, -+ ((char*)c->finder.procname ?: ""), -+ ((char*)c->finder.build_id ?: ""), -+ ((char*)c->solib_pathname ?: ""), -+ ((char*)c->solib_build_id ?: "")); -+ - /* ET_EXEC events are like shlib events, but with 0 relocation bases */ - if (register_p) { - int rc = -EINVAL; - struct inode *inode = stapiu_get_task_inode(task); - - if (inode) { -- rc = stapiu_change_plus(c, task, 0, TASK_SIZE, -- 0, 0, inode); -- stapiu_change_semaphore_plus(c, task, 0, -- TASK_SIZE); -+ // Add a stapiu_process record to the consumer, so that -+ // the semaphore increment logic will accept this task. -+ struct stapiu_process* p; -+ unsigned long flags; -+ p = _stp_kzalloc(sizeof(struct stapiu_process)); -+ if (p) { -+ p->tgid = task->tgid; -+ p->relocation = 0; -+ p->inode = inode; -+ p->base = 0; -+ spin_lock_irqsave (&c->process_list_lock, flags); -+ list_add(&p->process_list, &c->process_list_head); -+ spin_unlock_irqrestore (&c->process_list_lock, flags); -+ } else { -+ _stp_warn("out of memory tracking executable in process %ld\n", -+ (long) task->tgid); -+ } -+ -+ rc = stapiu_change_plus(c, task, 0, TASK_SIZE, 0, 0, inode); -+ -+ stapiu_change_semaphore_plus(c, task, 0, inode); - } - return rc; - } else -@@ -776,6 +791,8 @@ stapiu_process_found(struct stap_task_finder_target *tf_target, - bool - __verify_build_id (struct task_struct *tsk, unsigned long addr, - unsigned const char *build_id, int build_id_len); -+// defined in task_finder2.c -+ - - - /* The task_finder_mmap_callback. These callbacks are NOT -@@ -791,28 +808,119 @@ stapiu_mmap_found(struct stap_task_finder_target *tf_target, - struct stapiu_consumer *c = - container_of(tf_target, struct stapiu_consumer, finder); - int rc = 0; -+ struct stapiu_process* p; -+ int known_mapping_p; -+ unsigned long flags; - -- /* The file path or build-id must match. The build-id address -- * is calculated using start address of this vma, the file -- * offset of the vma start address and the file offset of -- * the build-id. */ -- if (c->solib_pathname && path && strcmp (path, c->solib_pathname)) -- return 0; -- if (c->solib_build_id_len > 0 && !__verify_build_id(task, -- addr - offset + c->solib_build_id_vaddr, -- c->solib_build_id, -- c->solib_build_id_len)) -- return 0; -+ /* -+ We need to verify that this file/mmap corresponds to the given stapiu_consumer. -+ One could compare (inode) file name, but that won't work with buildid-based -+ uprobes. For those, one cannot just -+ -+ __verify_build_id(... addr - offset + c->solib_build_id_vaddr ...) -+ -+ because dlopen()ing a shared library involves multiple mmaps, including -+ some at repeating/offset addresses. See glibc _dl_map_segments() in various -+ versions. So by the fourth call (!) on modern glibc's, we get a VM_WRITE-able -+ data segment mapped, but that's at a load/mapping address that is offset by a -+ page from the base (file offset=0) mapping. -+ -+ e.g. on Fedora 32 / glibc 2.31, with testsuite/libsdt_buildid.so: -+ -+ Program Headers: -+ Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -+ LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0004b8 0x0004b8 R 0x1000 -+ LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000161 0x000161 R E 0x1000 -+ LOAD 0x002000 0x0000000000002000 0x0000000000002000 0x0000cc 0x0000cc R 0x1000 -+ LOAD 0x002df8 0x0000000000003df8 0x0000000000003df8 0x000232 0x000238 RW 0x1000 -+ DYNAMIC 0x002e10 0x0000000000003e10 0x0000000000003e10 0x0001d0 0x0001d0 RW 0x8 -+ -+ strace: -+ openat(AT_FDCWD, ".../libsdt_buildid.so", O_RDONLY|O_CLOEXEC) = 3 -+ mmap(NULL, 16432, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x148c764ac000 -+ mmap(0x148c764ad000, 4096, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1000) = 0x148c764ad000 -+ mmap(0x148c764ae000, 4096, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x148c764ae000 -+ mmap(0x148c764af000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x148c764af000 -+ -+ Note how the virtual mapping for the fourth mmap (also) maps file-offset 0x2000 at -+ vm offset 0x3000. -+ -+ So what we do is rely on the name/buildid validation tests being run -+ -earlier- in the dlopen/mmap sequence to validate near-future -+ mmap()s. We search the c->process_list[] for a mapping that already -+ overlaps the new range, and if so, consider it validated ... whether -+ for the solib_pathname or the solib_build_id case. -+ -+ This is complicated for startup-time traversal of processes/mmaps, -+ where it seems sometimes we get notifications out of temporal sequence. -+ */ - -- /* 1 - shared libraries' executable segments load from offset 0 -- * - ld.so convention offset != 0 is now allowed -- * so stap_uprobe_change_plus can set a semaphore, -- * i.e. a static extern, in a shared object -- * 2 - the shared library we're interested in -- * 3 - mapping should be executable or writeable (for -- * semaphore in .so) -- * NB: or both, on kernels that lack noexec mapping -- */ -+ known_mapping_p = 0; -+ spin_lock_irqsave(&c->process_list_lock, flags); -+ list_for_each_entry(p, &c->process_list_head, process_list) { -+ if (p->tgid != task->tgid) continue; -+ if (p->inode != dentry->d_inode) continue; -+ known_mapping_p = 1; -+ break; -+ } -+ spin_unlock_irqrestore(&c->process_list_lock, flags); -+ -+ -+ // Check if this mapping (solib) is of interest: whether we expect -+ // it by buildid or name. -+ -+ if (! known_mapping_p) { -+ /* The file path or build-id must match. The build-id address -+ * is calculated using start address of this vma, the file -+ * offset of the vma start address and the file offset of -+ * the build-id. */ -+ if (c->solib_pathname && path && strcmp (path, c->solib_pathname)) -+ return 0; -+ if (c->solib_build_id_len > 0 && !__verify_build_id(task, -+ addr - offset + c->solib_build_id_vaddr, -+ c->solib_build_id, -+ c->solib_build_id_len)) -+ return 0; -+ } -+ -+ // If we made it this far, we have an interesting solib. -+ -+ dbug_uprobes("mmap_found pid=%ld path=%s addr=0x%lx length=%lu offset=%lu flags=0x%lx known=%d\n", -+ (long) task->tgid, path, addr, length, offset, vm_flags, known_mapping_p); -+ -+ if (! known_mapping_p) { -+ // OK, let's add it. The first mapping should be a VM_READ mapping -+ // of the entire solib file, which will also serve as the apprx. -+ // outer bounds of the repeatedly-mapped segments. -+ -+#if 0 -+ // Consider an assumption about the dlopen/mmap sequence -+ // If it comes out of sequence, we could get length/base wrong in the stored -+ // stapiu_process, which could lead us to miscalculate semaphore addresses. -+ // -+ // However, this has been observed on task-finder initial-enumeration case, -+ // (sdt_misc.exp, where a solib test is already running when stap starts). -+ if (offset != 0) -+ return 0; -+#endif -+ -+ // Associate this consumer with this process. If we encounter -+ // resource problems here, we don't really have to undo the uprobe -+ // registrations etc. already in effect. It may break correct -+ // tracking of process hierarchy in -c/-x operation, but too bad. -+ p = _stp_kzalloc(sizeof(struct stapiu_process)); -+ if (p) { -+ p->tgid = task->tgid; -+ p->relocation = addr; -+ p->inode = dentry->d_inode; -+ p->base = addr-offset; // ... in case caught this during the second mmap -+ spin_lock_irqsave (&c->process_list_lock, flags); -+ list_add(&p->process_list, &c->process_list_head); -+ spin_unlock_irqrestore (&c->process_list_lock, flags); -+ } else -+ _stp_warn("out of memory tracking solib %s in process %ld\n", -+ path, (long) task->tgid); -+ } - - /* Check non-writable, executable sections for probes. */ - if ((vm_flags & VM_EXEC) && !(vm_flags & VM_WRITE)) -@@ -827,7 +935,7 @@ stapiu_mmap_found(struct stap_task_finder_target *tf_target, - */ - - if ((rc == 0) && (vm_flags & VM_WRITE)) -- rc = stapiu_change_semaphore_plus(c, task, addr, length); -+ rc = stapiu_change_semaphore_plus(c, task, addr, dentry->d_inode); - - return rc; - } -diff --git a/runtime/sym.c b/runtime/sym.c -index be09ec8..21d820a 100644 ---- a/runtime/sym.c -+++ b/runtime/sym.c -@@ -713,9 +713,10 @@ static int _stp_build_id_check (struct _stp_module *m, - // NB: It is normal for different binaries with the same file path - // coexist in the same system via chroot or namespaces, therefore - // we make sure below is really a warning. -- _stp_warn ("Build-id mismatch [man warning::buildid]: \"%s\" address " -+ _stp_warn ("Build-id mismatch [man warning::buildid]: \"%s\" pid %ld address " - "%#lx, expected %s actual %s\n", -- m->path, notes_addr, hexstring_theory, hexstring_practice); -+ m->path, (long) tsk->tgid, -+ notes_addr, hexstring_theory, hexstring_practice); - return 1; - } - - -commit 5e1ef9d7f2a5ea6e5511ef5228cf05dda1c570b3 -Author: Frank Ch. Eigler -Date: Mon Jul 27 07:58:30 2020 -0400 - - PR25568 / RHBZ1857749: sdt_buildid.exp test case - - Add new test that checks for combinations of buildid and pathname - based uprobes for executables and shared libraries. - -diff --git a/testsuite/systemtap.base/sdt_buildid.c b/testsuite/systemtap.base/sdt_buildid.c -new file mode 100644 -index 0000000..ccbb2f2 ---- /dev/null -+++ b/testsuite/systemtap.base/sdt_buildid.c -@@ -0,0 +1,26 @@ -+#include -+#include -+#include -+ -+void bar (); -+ -+#ifndef ONLY_MAIN -+#include "sdt_buildid_.h" -+ -+void -+bar () -+{ -+ printf("%s=%ld\n", "test_probe_0_semaphore", SDT_BUILDID_TEST_PROBE_0_ENABLED()); -+ if (SDT_BUILDID_TEST_PROBE_0_ENABLED()) -+ SDT_BUILDID_TEST_PROBE_0(); -+} -+#endif -+ -+#ifndef NO_MAIN -+int -+main () -+{ -+ bar(); -+ return 0; -+} -+#endif -diff --git a/testsuite/systemtap.base/sdt_buildid.exp b/testsuite/systemtap.base/sdt_buildid.exp -new file mode 100644 -index 0000000..3141fd6 ---- /dev/null -+++ b/testsuite/systemtap.base/sdt_buildid.exp -@@ -0,0 +1,214 @@ -+set test "sdt_buildid" -+ -+set pbtype_flags {{additional_flags=-g} {} {}} -+set fail_count 0 -+ -+# Compile a C program to use as the user-space probing target -+set stap_path $env(SYSTEMTAP_PATH)/stap -+set sup_dpath "[pwd]/sdt_buildid_.d" -+set sup_hpath "[pwd]/sdt_buildid_.h" -+set sup_opath "[pwd]/sdt_buildid_.o" -+ -+# Run dtrace -+if {[installtest_p]} { -+ set dtrace $env(SYSTEMTAP_PATH)/dtrace -+} else { -+ set dtrace ../dtrace -+} -+ -+verbose -log "$dtrace --types -h -s $srcdir/$subdir/sdt_buildid_.d" -+if {[catch {exec $dtrace --types -h -s \ -+ $srcdir/$subdir/sdt_buildid_.d} res]} { -+ verbose -log "unable to run $dtrace: $res" -+} -+verbose -log "$dtrace --types -G -s $srcdir/$subdir/sdt_buildid_.d" -+if {[catch {exec $dtrace --types -G -s \ -+ $srcdir/$subdir/sdt_buildid_.d} res]} { -+ verbose -log "unable to run $dtrace: $res" -+} -+if {[file exists $sup_hpath] && [file exists $sup_opath]} then { -+ pass "$test dtrace" -+} else { -+ incr fail_count -+ fail "$test dtrace" -+ return -+} -+ -+set sup_flags [sdt_includes] -+set sup_flags "$sup_flags additional_flags=-Wall" -+set sup_flags "$sup_flags additional_flags=-Werror" -+set sup_flags "$sup_flags additional_flags=$sup_opath" -+set sup_flags "$sup_flags additional_flags=-I." -+set sup_exepath "[pwd]/sdt_buildid.x" -+ -+set res [target_compile $srcdir/$subdir/sdt_buildid.c $sup_exepath \ -+ executable $sup_flags] -+if { $res != "" } { -+ incr fail_count -+ verbose "target_compile failed: $res" 2 -+ fail "$test compiling" -+ return -+} else { -+ pass "$test compiling" -+} -+ -+ -+set sup41_flags "$sup_flags additional_flags=-shared" -+set sup41_flags "$sup41_flags additional_flags=-fPIC" -+set sup41_flags "$sup41_flags additional_flags=-DNO_MAIN" -+set sup_sopath "[pwd]/libsdt_buildid.so" -+set sup_exe2path "[pwd]/sdt_buildid_shared.x" -+set res0 [target_compile $srcdir/$subdir/sdt_buildid.c $sup_sopath \ -+ executable $sup41_flags ] -+set sup42_flags "additional_flags=-Wl,-rpath,[pwd]" -+set sup42_flags "$sup42_flags additional_flags=-L[pwd] additional_flags=-lsdt_buildid" -+set sup42_flags "$sup42_flags additional_flags=-DONLY_MAIN" -+set res [target_compile $srcdir/$subdir/sdt_buildid.c $sup_exe2path \ -+ executable $sup42_flags ] -+if { $res0 != "" || $res != "" } { -+ incr fail_count -+ verbose "target_compile failed: $res0 $res" 2 -+ fail "$test compiling -shared" -+ return -+} else { -+ pass "$test compiling -shared" -+} -+ -+catch { exec eu-readelf -n $sup_exepath | grep Build.ID | awk "{print \$NF}" } bid1 -+catch { exec eu-readelf -n $sup_sopath | grep Build.ID | awk "{print \$NF}" } bidso -+catch { exec eu-readelf -n $sup_exe2path | grep Build.ID | awk "{print \$NF}" } bid2 -+verbose -log "buildid: $sup_exepath $bid1" -+verbose -log "buildid: $sup_sopath $bidso" -+verbose -log "buildid: $sup_exe2path $bid2" -+# though we won't use the $bid2 -+ -+if {![installtest_p]} { -+ untested $test -+ return -+} -+ -+# To test via build-id, we need a debuginfod server to scan the testsuite build -+# directory. -+ -+ -+if [catch {exec /usr/bin/which debuginfod} debuginfod] then { -+ untested "$test debuginfod" -+} else { -+ set port [expr {10000 + int(rand()*10000)}] -+ spawn $debuginfod -p $port -d :memory: -F . -+ set debuginfod_pid [exp_pid $spawn_id] -+ # give it time to scan the build directory -+ sleep 10 -+ # XXX: we could expect some verbose traffic -+ set env(DEBUGINFOD_URLS) "http://localhost:$port $env(DEBUGINFOD_URLS)" -+ verbose -log "started debuginfod on port $port" -+ -+ set subtest "$test debuginfod buildid-exe buildid-solib" -+ spawn $stap_path $srcdir/$subdir/sdt_buildid.stp $bid1 $bidso -+ set ok 0 -+ expect { -+ -timeout 240 -+ -re {^Count [0-9]*[02468][^\r\n]*\r\n} { incr ok; exec $sup_exepath; exp_continue } -+ -re {^Count [0-9]*[13579][^\r\n]*\r\n} { incr ok; exec $sup_exe2path; exp_continue } -+ eof { } -+ timeout { } -+ } -+ catch {close}; catch {wait} -+ if {$ok > 6} then { pass $subtest } else { fail $subtest } -+ -+ set subtest "$test debuginfod buildid-exe path-solib" -+ spawn $stap_path $srcdir/$subdir/sdt_buildid.stp $bid1 $sup_sopath -+ set ok 0 -+ expect { -+ -timeout 240 -+ -re {^Count [0-9]*[02468][^\r\n]*\r\n} { incr ok; exec $sup_exepath; exp_continue } -+ -re {^Count [0-9]*[13579][^\r\n]*\r\n} { incr ok; exec $sup_exe2path; exp_continue } -+ eof { } -+ timeout { } -+ } -+ catch {close}; catch {wait} -+ if {$ok > 6} then { pass $subtest } else { fail $subtest } -+ -+ set subtest "$test debuginfod path-exe buildid-solib" -+ spawn $stap_path $srcdir/$subdir/sdt_buildid.stp $sup_exepath $bidso -+ set ok 0 -+ expect { -+ -timeout 240 -+ -re {^Count [0-9]*[02468][^\r\n]*\r\n} { incr ok; exec $sup_exepath; exp_continue } -+ -re {^Count [0-9]*[13579][^\r\n]*\r\n} { incr ok; exec $sup_exe2path; exp_continue } -+ eof { } -+ timeout { } -+ } -+ catch {close}; catch {wait} -+ if {$ok > 6} then { pass $subtest } else { fail $subtest } -+ -+ set subtest "$test debuginfod buildid-solib" -+ spawn $stap_path $srcdir/$subdir/sdt_buildid.stp $bidso -+ set ok 0 -+ expect { -+ -timeout 240 -+ -re {^Count [0-9]*[02468][^\r\n]*\r\n} { incr ok; exec $sup_exe2path; exp_continue } -+ -re {^Count [0-9]*[13579][^\r\n]*\r\n} { incr ok; exec $sup_exe2path; exp_continue } -+ eof { } -+ timeout { } -+ } -+ catch {close}; catch {wait} -+ if {$ok > 6} then { pass $subtest } else { fail $subtest } -+ -+ set subtest "$test debuginfod buildid-exe" -+ spawn $stap_path $srcdir/$subdir/sdt_buildid.stp $bid1 -+ set ok 0 -+ expect { -+ -timeout 240 -+ -re {^Count [0-9]*[02468][^\r\n]*\r\n} { incr ok; exec $sup_exepath; exp_continue } -+ -re {^Count [0-9]*[13579][^\r\n]*\r\n} { incr ok; exec $sup_exepath; exp_continue } -+ eof { } -+ timeout { } -+ } -+ catch {close}; catch {wait} -+ if {$ok > 6} then { pass $subtest } else { fail $subtest } -+ -+ kill -INT $debuginfod_pid -+} -+ -+ -+set subtest "$test non-buildid both" -+spawn $stap_path $srcdir/$subdir/sdt_buildid.stp $sup_exepath $sup_sopath -+set ok 0 -+expect { -+ -timeout 240 -+ -re {^Count [0-9]*[02468][^\r\n]*\r\n} { incr ok; exec $sup_exepath; exp_continue } -+ -re {^Count [0-9]*[13579][^\r\n]*\r\n} { incr ok; exec $sup_exe2path; exp_continue } -+ eof { } -+ timeout { } -+} -+catch {close}; catch {wait} -+if {$ok > 6} then { pass $subtest } else { fail $subtest } -+ -+set subtest "$test non-buildid exe" -+spawn $stap_path $srcdir/$subdir/sdt_buildid.stp $sup_exepath -+set ok 0 -+expect { -+ -timeout 240 -+ -re {^Count [0-9]*[02468][^\r\n]*\r\n} { incr ok; exec $sup_exepath; exp_continue } -+ -re {^Count [0-9]*[13579][^\r\n]*\r\n} { incr ok; exec $sup_exepath; exp_continue } -+ eof { } -+ timeout { } -+} -+catch {close}; catch {wait} -+if {$ok > 6} then { pass $subtest } else { fail $subtest } -+ -+set subtest "$test non-buildid solib" -+spawn $stap_path $srcdir/$subdir/sdt_buildid.stp $sup_sopath -+set ok 0 -+expect { -+ -timeout 240 -+ -re {^Count [0-9]*[02468][^\r\n]*\r\n} { incr ok; exec $sup_exe2path; exp_continue } -+ -re {^Count [0-9]*[13579][^\r\n]*\r\n} { incr ok; exec $sup_exe2path; exp_continue } -+ eof { } -+ timeout { } -+} -+catch {close}; catch {wait} -+if {$ok > 6} then { pass $subtest } else { fail $subtest } -+ -+return -diff --git a/testsuite/systemtap.base/sdt_buildid.stp b/testsuite/systemtap.base/sdt_buildid.stp -new file mode 100644 -index 0000000..a26d183 ---- /dev/null -+++ b/testsuite/systemtap.base/sdt_buildid.stp -@@ -0,0 +1,19 @@ -+global count -+ -+function trace () { -+ printf ("Count %d [%d] %s %s\n", count++, pid(), $$name, pp()) -+} -+ -+probe process(@1).mark("test_probe_0") { trace() } -+%( $# > 1 %? probe process(@2).mark("test_probe_0") { trace() } %) -+ -+probe begin -+{ -+ printf ("Count %d\n", count++) -+} -+ -+probe timer.s(1) // exit quickly after enough marks fire -+{ -+ if (count > 10) exit() -+} -+ -diff --git a/testsuite/systemtap.base/sdt_buildid_.d b/testsuite/systemtap.base/sdt_buildid_.d -new file mode 100644 -index 0000000..ebfca55 ---- /dev/null -+++ b/testsuite/systemtap.base/sdt_buildid_.d -@@ -0,0 +1,4 @@ -+provider sdt_buildid { -+ probe test_probe_0 (); -+}; -+ diff --git a/SOURCES/rhbz1855264.patch b/SOURCES/rhbz1855264.patch deleted file mode 100644 index c361090..0000000 --- a/SOURCES/rhbz1855264.patch +++ /dev/null @@ -1,874 +0,0 @@ -commit 0a281a96ddf7cae9a0f0cc0eb505a752ffdd932e -Author: William Cohen -Date: Tue Jun 16 16:02:11 2020 -0400 - - Make sizeof.stp runnable with the bpf backend. - -diff --git a/testsuite/systemtap.examples/general/sizeof.meta b/testsuite/systemtap.examples/general/sizeof.meta -index 29713e4..b30078d 100644 ---- a/testsuite/systemtap.examples/general/sizeof.meta -+++ b/testsuite/systemtap.examples/general/sizeof.meta -@@ -2,7 +2,7 @@ title: Print the Size of a C Type - name: sizeof.stp - version: 1.0 - author: anonymous --keywords: statistics memory -+keywords: statistics memory bpf - subsystem: any - status: proposed - exit: event-ended -@@ -11,3 +11,5 @@ scope: system-wide - description: This script prints the size of a type, based on dwarf debuginfo for any kernel or userspace module, or trial-compilation of a given header file name. - test_check: stap -p4 sizeof.stp task_struct 'kernel' - test_installcheck: stap sizeof.stp FILE '' -+test_check_bpf: stap -p4 --bpf sizeof.stp task_struct 'kernel' -+test_installcheck_bpf: stap --bpf sizeof.stp FILE '' -diff --git a/testsuite/systemtap.examples/general/sizeof.stp b/testsuite/systemtap.examples/general/sizeof.stp -index 0c77dce..5aec674 100755 ---- a/testsuite/systemtap.examples/general/sizeof.stp -+++ b/testsuite/systemtap.examples/general/sizeof.stp -@@ -7,9 +7,11 @@ - # sizeof.stp TYPENAME - - probe oneshot { -- println("type ", @1, -- %( $# > 1 %? " in ", @2, %) /* module or header file name */ -- " byte-size: ", -- %( $# > 1 %? @cast_module_sizeof(@2, @1) %: @cast_sizeof(@1) %) -- ) -+ %( $# > 1 %? -+ printf("type %s in %s byte-size: %d\n", @1, @2, -+ @cast_module_sizeof(@2, @1)) -+ %: -+ printf("type %s byte-size: %d\n", @1, -+ @cast_sizeof(@1)) -+ %) - } -commit 2b2b6a622dc1d434c60d0ea159b260f660068ad1 -Author: William Cohen -Date: Wed Jun 17 11:57:18 2020 -0400 - - Fix sizeof.stp to explicitly use kernel debuginfo if one not specified - - Commit 00ee19ff030f665df7e087a579f39105256a0253 changed how @cast() - operations work and they no longer default to using the kernel - debuginfo for type information. Need to use the @cast_module_sizeof() - instead of @cast_size() to use the kernel debuginfo. - -diff --git a/testsuite/systemtap.examples/general/sizeof.stp b/testsuite/systemtap.examples/general/sizeof.stp -index 5aec674..b45f593 100755 ---- a/testsuite/systemtap.examples/general/sizeof.stp -+++ b/testsuite/systemtap.examples/general/sizeof.stp -@@ -12,6 +12,6 @@ probe oneshot { - @cast_module_sizeof(@2, @1)) - %: - printf("type %s byte-size: %d\n", @1, -- @cast_sizeof(@1)) -+ @cast_module_sizeof("kernel", @1)) - %) - } - -commit 717b7dddd08b66b3caa5585221472d84e40be658 -Author: William Cohen -Date: Wed Jun 17 13:08:30 2020 -0400 - - Use explicit @cast() operators to fslatency-nd.stp and fsslower-nd.stp - - Commit 00ee19ff030f665df7e087a579f39105256a0253 changed how @cast() - operations work and they no longer default to using the kernel - debuginfo for type information. Need to include kernel as location for - this information for the @cast() rather than just assuming a default. - -diff --git a/testsuite/systemtap.examples/lwtools/fslatency-nd.stp b/testsuite/systemtap.examples/lwtools/fslatency-nd.stp -index 6008399..0bee34f 100755 ---- a/testsuite/systemtap.examples/lwtools/fslatency-nd.stp -+++ b/testsuite/systemtap.examples/lwtools/fslatency-nd.stp -@@ -63,8 +63,8 @@ probe __vfs_read = kprobe.function("__vfs_read") - { - # Skip the call if new_sync_read() wouldn't be called. - file = pointer_arg(1) -- if (!file || @cast(file, "file")->f_op->read -- || !@cast(file, "file")->f_op->read_iter) -+ if (!file || @cast(file, "file", "kernel")->f_op->read -+ || !@cast(file, "file", "kernel")->f_op->read_iter) - next - } - -@@ -75,8 +75,8 @@ probe __vfs_write = kprobe.function("__vfs_write") - { - # Skip the call if new_sync_write() wouldn't be called. - file = pointer_arg(1) -- if (!file || @cast(file, "file")->f_op->write -- || !@cast(file, "file")->f_op->write_iter) -+ if (!file || @cast(file, "file", "kernel")->f_op->write -+ || !@cast(file, "file", "kernel")->f_op->write_iter) - next - } - -@@ -102,8 +102,8 @@ probe __vfs_read.return = kprobe.function("__vfs_read").return - { - # Skip the call if new_sync_read() wouldn't be called. - file = @entry(pointer_arg(1)) -- if (!file || @cast(file, "file")->f_op->read -- || !@cast(file, "file")->f_op->read_iter) -+ if (!file || @cast(file, "file", "kernel")->f_op->read -+ || !@cast(file, "file", "kernel")->f_op->read_iter) - next - } - -@@ -115,8 +115,8 @@ probe __vfs_write.return = kprobe.function("__vfs_write") - { - # Skip the call if new_sync_write() wouldn't be called. - file = pointer_arg(1) -- if (!file || @cast(file, "file")->f_op->write -- || !@cast(file, "file")->f_op->write_iter) -+ if (!file || @cast(file, "file", "kernel")->f_op->write -+ || !@cast(file, "file", "kernel")->f_op->write_iter) - next - } - -diff --git a/testsuite/systemtap.examples/lwtools/fsslower-nd.stp b/testsuite/systemtap.examples/lwtools/fsslower-nd.stp -index 64abe41..90fa9b5 100755 ---- a/testsuite/systemtap.examples/lwtools/fsslower-nd.stp -+++ b/testsuite/systemtap.examples/lwtools/fsslower-nd.stp -@@ -65,8 +65,8 @@ probe __vfs_read = kprobe.function("__vfs_read") - { - # Skip the call if new_sync_read() wouldn't be called. - file = pointer_arg(1) -- if (!file || @cast(file, "file")->f_op->read -- || !@cast(file, "file")->f_op->read_iter) -+ if (!file || @cast(file, "file", "kernel")->f_op->read -+ || !@cast(file, "file", "kernel")->f_op->read_iter) - next - } - -@@ -77,8 +77,8 @@ probe __vfs_write = kprobe.function("__vfs_write") - { - # Skip the call if new_sync_write() wouldn't be called. - file = pointer_arg(1) -- if (!file || @cast(file, "file")->f_op->write -- || !@cast(file, "file")->f_op->write_iter) -+ if (!file || @cast(file, "file", "kernel")->f_op->write -+ || !@cast(file, "file", "kernel")->f_op->write_iter) - next - } - -@@ -110,8 +110,8 @@ probe __vfs_read.return = kprobe.function("__vfs_read").return - { - # Skip the call if new_sync_read() wouldn't be called. - file = @entry(pointer_arg(1)) -- if (!file || @cast(file, "file")->f_op->read -- || !@cast(file, "file")->f_op->read_iter) -+ if (!file || @cast(file, "file", "kernel")->f_op->read -+ || !@cast(file, "file", "kernel")->f_op->read_iter) - next - } - -@@ -123,7 +123,7 @@ probe __vfs_write.return = kprobe.function("__vfs_write") - { - # Skip the call if new_sync_write() wouldn't be called. - file = pointer_arg(1) -- if (!file || @cast(file, "file")->f_op->write -- || !@cast(file, "file")->f_op->write_iter) -+ if (!file || @cast(file, "file", "kernel")->f_op->write -+ || !@cast(file, "file", "kernel")->f_op->write_iter) - next - } - -commit 9eb37102d48b814821b7f474986a7bfe86784192 -Author: William Cohen -Date: Wed Jun 17 13:39:20 2020 -0400 - - Use explicit @cast() operators for pfiles.stp and ioctl_handler.stp - - Commit 00ee19ff030f665df7e087a579f39105256a0253 changed how @cast() - operations work and they no longer default to using the kernel - debuginfo for type information. Need to include kernel as location for - this information for the @cast() rather than just assuming a default. - -diff --git a/testsuite/systemtap.examples/process/pfiles.stp b/testsuite/systemtap.examples/process/pfiles.stp -index d52a154..6344a4c 100755 ---- a/testsuite/systemtap.examples/process/pfiles.stp -+++ b/testsuite/systemtap.examples/process/pfiles.stp -@@ -787,9 +787,9 @@ function print_unix_socket(sock) { - strlen(peername) > 0 ? peername . "\n" : "") - try { # skip line in case of null pointers - printf(" peercred pid: %d\n", -- @defined(@cast(sock, "socket")->sk->sk_peer_pid) ? -- @cast(sock, "socket")->sk->sk_peer_pid->numbers[0]->nr : -- @cast(sock, "socket")->sk->sk_peercred->pid ); } catch { } -+ @defined(@cast(sock, "socket", "kernel")->sk->sk_peer_pid) ? -+ @cast(sock, "socket", "kernel")->sk->sk_peer_pid->numbers[0]->nr : -+ @cast(sock, "socket", "kernel")->sk->sk_peercred->pid ); } catch { } - } - - function print_ipv4_socket(sock) { -diff --git a/testsuite/systemtap.examples/profiling/ioctl_handler.stp b/testsuite/systemtap.examples/profiling/ioctl_handler.stp -index 7044185..6f1e52c 100755 ---- a/testsuite/systemtap.examples/profiling/ioctl_handler.stp -+++ b/testsuite/systemtap.examples/profiling/ioctl_handler.stp -@@ -9,7 +9,7 @@ probe syscall.ioctl { - ioctl_requests[execname()] <<< 1 - try { - # Dig down through the task struct to find the actual function handling ioctl. -- ioctl_func_address = @cast(task_current(), "struct task_struct")->files->fdt->fd[fd]->f_op->unlocked_ioctl -+ ioctl_func_address = @cast(task_current(), "struct task_struct", "kernel")->files->fdt->fd[fd]->f_op->unlocked_ioctl - if (ioctl_func_address) - ioctl_func[execname(), ioctl_func_address] <<< 1 - } catch { - -commit 3040d4e8ddb6a9b1d1a57a0185206498670c3f1a -Author: William Cohen -Date: Wed Jun 17 13:53:58 2020 -0400 - - Use explicit @cast() operators for stapgames/pingpong.stp tapset. - -diff --git a/testsuite/systemtap.examples/stapgames/tapset/gmtty.stp b/testsuite/systemtap.examples/stapgames/tapset/gmtty.stp -index 026e4a9..f6ad2db 100644 ---- a/testsuite/systemtap.examples/stapgames/tapset/gmtty.stp -+++ b/testsuite/systemtap.examples/stapgames/tapset/gmtty.stp -@@ -10,12 +10,12 @@ global GM_tty_ws_row, GM_tty_ws_col, GM_tty_name - # Initialize current TTY -- must be called from begin - function game_tty_init:long () - { -- tty = @cast(task_current(), "task_struct")->signal->tty -+ tty = @cast(task_current(), "task_struct", "kernel")->signal->tty - if (tty) { -- namep = @cast(tty,"tty_struct")->name -+ namep = @cast(tty,"tty_struct", "kernel")->name - GM_tty_name = kernel_string(namep) -- GM_tty_ws_col = @cast(tty, "tty_struct")->winsize->ws_col -- GM_tty_ws_row = @cast(tty, "tty_struct")->winsize->ws_row -+ GM_tty_ws_col = @cast(tty, "tty_struct", "kernel")->winsize->ws_col -+ GM_tty_ws_row = @cast(tty, "tty_struct", "kernel")->winsize->ws_row - } - return tty - } - -commit 3d922919dbe5657becf48917f1c661bf6711e956 -Author: William Cohen -Date: Thu Jun 18 13:32:50 2020 -0400 - - Use explicit @cast() operators for periodic.stp - - Commit 00ee19ff030f665df7e087a579f39105256a0253 changed how @cast() - operations work and they no longer default to using the kernel - debuginfo for type information. Need to include kernel as location for - this information for the @cast() rather than just assuming a default. - -diff --git a/testsuite/systemtap.examples/profiling/periodic.stp b/testsuite/systemtap.examples/profiling/periodic.stp -index f18f183..b9052e5 100755 ---- a/testsuite/systemtap.examples/profiling/periodic.stp -+++ b/testsuite/systemtap.examples/profiling/periodic.stp -@@ -18,8 +18,8 @@ probe kernel.trace("timer_expire_entry") - period[$timer] <<< elapsed - funct[$timer] = $timer->function - data[$timer] = @defined($timer->data) ? $timer->data : 0 -- proc_info[$timer] = @defined($timer->data) ? 0 : @container_of($timer, "struct process_timer", timer)->task -- delayed_work_info[$timer] = @defined($timer->data) ? 0 : & @container_of($timer, "struct delayed_work", timer) -+ proc_info[$timer] = @defined($timer->data) ? 0 : @module_container_of($timer, "kernel", "struct process_timer", timer)->task -+ delayed_work_info[$timer] = @defined($timer->data) ? 0 : & @module_container_of($timer, "kernel", "struct delayed_work", timer) - } - last_expire[$timer] = new_expire - } - -commit b2d18cb3afca76536506fe4992fdd6ef091ce82f -Author: William Cohen -Date: Thu Jun 18 15:01:40 2020 -0400 - - Use explicit @cast() operators for semop-watch.stp example. - -diff --git a/testsuite/systemtap.examples/process/semop-watch.stp b/testsuite/systemtap.examples/process/semop-watch.stp -index ca2bf0a..bf1d632 100755 ---- a/testsuite/systemtap.examples/process/semop-watch.stp -+++ b/testsuite/systemtap.examples/process/semop-watch.stp -@@ -3,7 +3,7 @@ global times; - - probe syscall.{semop,semtimedop} - { -- sembuf_sz = @cast_sizeof("struct sembuf"); -+ sembuf_sz = @cast_module_sizeof("kernel", "struct sembuf"); - res = sprintf("set %d sems", semid) - - %( systemtap_v < "2.3" %? -@@ -14,7 +14,7 @@ probe syscall.{semop,semtimedop} - for(i = 0; i < nsops; i++) { - offset = i * sembuf_sz; - pointer = sops_uaddr + offset; -- num_addr = & @cast(pointer, "struct sembuf")->sem_num; -+ num_addr = & @cast(pointer, "struct sembuf", "kernel")->sem_num; - num = user_short(num_addr); - - res = sprintf("%s %d", res, num); - -commit a948c291c9cd7320d3c9b18b5037908cbbdf70b7 -Author: William Cohen -Date: Mon Jun 22 11:28:32 2020 -0400 - - Use explicit @cast() operators pointing to kernel for tapsets - - Commit 00ee19ff030f665df7e087a579f39105256a0253 changed how @cast() - operations work and they no longer default to using the kernel - debuginfo for type information. Need to include kernel as location for - this information for the @cast() rather than just assuming a default. - -diff --git a/tapset/linux/dentry.stp b/tapset/linux/dentry.stp -index 4e73532..d148c57 100644 ---- a/tapset/linux/dentry.stp -+++ b/tapset/linux/dentry.stp -@@ -28,7 +28,7 @@ - - @__private30 function __dentry_IS_ROOT:long(dentry:long) - { -- return (@cast(dentry, "dentry")->d_parent == dentry) -+ return (@cast(dentry, "dentry", "kernel")->d_parent == dentry) - } - - -@@ -61,7 +61,7 @@ - */ - function d_name:string(dentry:long) - { -- s = & @cast(dentry, "dentry")->d_name; -+ s = & @cast(dentry, "dentry", "kernel")->d_name; - return kernel_string_n(s->name, s->len); - } - -@@ -70,8 +70,8 @@ function d_name:string(dentry:long) - { - /* i_dentry is an hlist_head on 3.6+, or a list_head before that. */ - d_alias = @choose_defined( -- @cast(inode, "struct inode")->i_dentry->first, -- @cast(inode, "struct inode")->i_dentry->next) -+ @cast(inode, "struct inode", "kernel")->i_dentry->first, -+ @cast(inode, "struct inode", "kernel")->i_dentry->next) - - if (@type_member_defined("struct dentry", d_alias)) { - return & @container_of(d_alias, "struct dentry", d_alias) -@@ -86,8 +86,8 @@ function d_name:string(dentry:long) - { - /* s_mounts was added in kernel 3.6, commit b3d9b7a3c. */ - if (@type_member_defined("struct super_block", s_mounts)) { -- mnt_ns = @cast(task_current(), "struct task_struct")->nsproxy->mnt_ns -- sb = @cast(inode, "struct inode")->i_sb -+ mnt_ns = @cast(task_current(), "struct task_struct", "kernel")->nsproxy->mnt_ns -+ sb = @cast(inode, "struct inode", "kernel")->i_sb - - /* Look for the mount which matches the current namespace */ - head = &sb->s_mounts -@@ -141,7 +141,7 @@ function reverse_path_walk:string(dentry:long) - { - while(1) { - name = __dentry_prepend(dentry, name); -- dentry = @cast(dentry, "dentry")->d_parent; -+ dentry = @cast(dentry, "dentry", "kernel")->d_parent; - if (__dentry_IS_ROOT(dentry)) - return name; - } -@@ -209,8 +209,8 @@ function task_dentry_path:string(task:long,dentry:long,vfsmnt:long) - * dentry == vfsmnt->mnt_root. In that case, we'll just go - * ahead and handle them normally. - */ -- dentry = & @cast(dentry, "dentry") -- vfsmnt = & @cast(vfsmnt, "vfsmount") -+ dentry = & @cast(dentry, "dentry", "kernel") -+ vfsmnt = & @cast(vfsmnt, "vfsmount", "kernel") - - if (@type_member_defined("dentry", d_op->d_dname) - && dentry->d_op && dentry->d_op->d_dname -@@ -230,7 +230,7 @@ function task_dentry_path:string(task:long,dentry:long,vfsmnt:long) - return sprintf("ANON_INODE:%s", d_name(dentry)) - } - else if (vfsmnt->mnt_sb->s_magic == @const("NSFS_MAGIC")) { -- ns_ops = &@cast(dentry->d_fsdata, "proc_ns_operations") -+ ns_ops = &@cast(dentry->d_fsdata, "proc_ns_operations", "kernel") - return sprintf("%s:[%lu]", kernel_string(ns_ops->name), - dentry->d_inode->i_ino) - } -@@ -239,16 +239,16 @@ function task_dentry_path:string(task:long,dentry:long,vfsmnt:long) - - # Handle old-school vs. new-school fs_structs. - if (@type_member_defined("fs_struct", rootmnt)) { -- root_dentry = & @cast(task, "task_struct")->fs->root -- root_vfsmnt = & @cast(task, "task_struct")->fs->rootmnt -+ root_dentry = & @cast(task, "task_struct", "kernel")->fs->root -+ root_vfsmnt = & @cast(task, "task_struct", "kernel")->fs->rootmnt - } - else { -- root_dentry = @cast(task, "task_struct")->fs->root->dentry -- root_vfsmnt = @cast(task, "task_struct")->fs->root->mnt -+ root_dentry = @cast(task, "task_struct", "kernel")->fs->root->dentry -+ root_vfsmnt = @cast(task, "task_struct", "kernel")->fs->root->mnt - } - - if (@type_member_defined("mount", mnt_parent)) { -- mnt = &@cast(real_mount(vfsmnt), "mount") -+ mnt = &@cast(real_mount(vfsmnt), "mount", "kernel") - if (mnt == 0) - return "" - } -@@ -305,10 +305,10 @@ function task_dentry_path:string(task:long,dentry:long,vfsmnt:long) - */ - function d_path:string(nd:long) - { -- dentry = @choose_defined(@cast(nd,"nameidata")->path->dentry, -- @cast(nd,"nameidata")->dentry) -- vfsmnt = @choose_defined(@cast(nd,"nameidata")->path->mnt, -- @cast(nd,"nameidata")->mnt) -+ dentry = @choose_defined(@cast(nd,"nameidata", "kernel")->path->dentry, -+ @cast(nd,"nameidata", "kernel")->dentry) -+ vfsmnt = @choose_defined(@cast(nd,"nameidata", "kernel")->path->mnt, -+ @cast(nd,"nameidata", "kernel")->mnt) - - return sprintf("%s/", task_dentry_path(task_current(), dentry, vfsmnt)) - } -@@ -353,8 +353,8 @@ function fullpath_struct_path:string(path:long) - function fullpath_struct_file:string(task:long, file:long) - { - return task_dentry_path(task, -- @choose_defined(@cast(file, "file")->f_path->dentry, -- @cast(file, "file")->f_dentry), -- @choose_defined(@cast(file, "file")->f_path->mnt, -- @cast(file, "file")->f_vfsmnt)) -+ @choose_defined(@cast(file, "file", "kernel")->f_path->dentry, -+ @cast(file, "file", "kernel")->f_dentry), -+ @choose_defined(@cast(file, "file", "kernel")->f_path->mnt, -+ @cast(file, "file", "kernel")->f_vfsmnt)) - } -diff --git a/tapset/linux/dev.stp b/tapset/linux/dev.stp -index 0232fc9..079ce1c 100644 ---- a/tapset/linux/dev.stp -+++ b/tapset/linux/dev.stp -@@ -56,8 +56,8 @@ function usrdev2kerndev:long(dev:long) - function disk_name:string(hd:long, partno:long) - { - if (!partno) -- return kernel_string(@cast(hd, "gendisk")->disk_name) -- disk_name = kernel_string(@cast(hd, "gendisk")->disk_name) -+ return kernel_string(@cast(hd, "gendisk", "kernel")->disk_name) -+ disk_name = kernel_string(@cast(hd, "gendisk", "kernel")->disk_name) - if (isdigit(substr(disk_name, strlen(disk_name)-1, 1))) - return sprintf("%sp%d", disk_name, partno) - else -@@ -66,7 +66,7 @@ function disk_name:string(hd:long, partno:long) - - function bdevname:string(bdev:long) - { -- bdev = & @cast(bdev, "block_device") -+ bdev = & @cast(bdev, "block_device", "kernel") - if (bdev == 0) - return "N/A" - -diff --git a/tapset/linux/ioblock.stp b/tapset/linux/ioblock.stp -index ad3603c..9d8f57b 100644 ---- a/tapset/linux/ioblock.stp -+++ b/tapset/linux/ioblock.stp -@@ -107,12 +107,12 @@ function bio_rw_str(rw:long) - @__private30 function __bio_start_sect:long(bio:long) - { - try { -- if (@defined(@cast(bio, "bio")->bi_dev)) { -- return @cast(bio, "bio")->bi_bdev->bd_part->start_sect -+ if (@defined(@cast(bio, "bio", "kernel")->bi_dev)) { -+ return @cast(bio, "bio", "kernel")->bi_bdev->bd_part->start_sect - } -- else if (@defined(@cast(bio, "bio")->bi_disk)) { -- return disk_get_part_start_sect(@cast(bio, "bio")->bi_disk, -- @cast(bio, "bio")->bi_partno) -+ else if (@defined(@cast(bio, "bio", "kernel")->bi_disk)) { -+ return disk_get_part_start_sect(@cast(bio, "bio", "kernel")->bi_disk, -+ @cast(bio, "bio", "kernel")->bi_partno) - } - } catch { - return -1 -@@ -122,12 +122,12 @@ function bio_rw_str(rw:long) - /* returns the block device name */ - @__private30 function __bio_devname:string(bio:long) - { -- if (@defined(@cast(bio, "bio")->bi_bdev)) { -- return bdevname(@cast(bio, "bio")->bi_bdev) -+ if (@defined(@cast(bio, "bio", "kernel")->bi_bdev)) { -+ return bdevname(@cast(bio, "bio", "kernel")->bi_bdev) - } - else { -- return disk_name(@cast(bio, "bio")->bi_disk, -- @cast(bio, "bio")->bi_partno) -+ return disk_name(@cast(bio, "bio", "kernel")->bi_disk, -+ @cast(bio, "bio", "kernel")->bi_partno) - } - } - -diff --git a/tapset/linux/task.stp b/tapset/linux/task.stp -index 4afc458..b542b61 100644 ---- a/tapset/linux/task.stp -+++ b/tapset/linux/task.stp -@@ -40,7 +40,7 @@ function task_current:long () { - return -1; - } - sig = @task(task)->signal; -- return @cast(sig, "signal_struct")->rlim[nd_limit]->rlim_cur; -+ return @cast(sig, "signal_struct", "kernel")->rlim[nd_limit]->rlim_cur; - } - - /* sfunction task_rlimit - The current resource limit of the task - -commit 403e927796c3008ad5d5fed9bd97dc7cbad424bb -Author: Martin Cermak -Date: Mon Jun 29 16:30:34 2020 +0200 - - PR26181: Use explicit @cast() within get_ip_from_client() - - Commit 00ee19ff030f665df7e087a579f39105256a0253 changed how @cast() - operations work and they no longer default to using the kernel - debuginfo for type information. Need to include kernel as location for - this information for the @cast() rather than just assuming a default. - - Also, fix the type of server_ip, which historically had been a long, - but since systemtap_v >= "4.3", it is a string. - -diff --git a/tapset/linux/nfs_proc.stp b/tapset/linux/nfs_proc.stp -index 8da3f6b..2579074 100644 ---- a/tapset/linux/nfs_proc.stp -+++ b/tapset/linux/nfs_proc.stp -@@ -77,11 +77,11 @@ function get_ip_from_client:string(clnt:long) - * inside that buffer. */ - if (@cast(addr, "sockaddr")->sa_family - == @const("AF_INET")) { -- return format_ipaddr(&@cast(addr, "sockaddr_in")->sin_addr->s_addr, @const("AF_INET")) -+ return format_ipaddr(&@cast(addr, "sockaddr_in", "kernel:sunrpc")->sin_addr->s_addr, @const("AF_INET")) - } - else if (@cast(addr, "sockaddr")->sa_family - == @const("AF_INET6")) { -- return format_ipaddr(&@cast(addr, "sockaddr_in6")->sin6_addr, @const("AF_INET6")) -+ return format_ipaddr(&@cast(addr, "sockaddr_in6", "kernel:sunrpc")->sin6_addr, @const("AF_INET6")) - } - return "" - } -@@ -90,12 +90,12 @@ function get_ip_from_client:long(clnt:long) - { - cl_xprt = @cast(clnt, "rpc_clnt", "kernel:sunrpc")->cl_xprt - addr = &@cast(cl_xprt, "rpc_xprt", "kernel:sunrpc")->addr -- if (@cast(addr, "sockaddr_in")->sin_family -+ if (@cast(addr, "sockaddr_in", "kernel:sunrpc")->sin_family - != @const("AF_INET")) { - /* Now consider ipv4 only */ - return 0 - } -- return @cast(addr, "sockaddr_in")->sin_addr->s_addr -+ return @cast(addr, "sockaddr_in", "kernel:sunrpc")->sin_addr->s_addr - } - %) - -@@ -758,7 +758,11 @@ probe _nfs.proc2.missing_read_setup = never - { - inode = 0 - client = 0 -+%( systemtap_v >= "4.3" %? -+ server_ip = "0" -+%: - server_ip = 0 -+%) - prot = 0 - - count = 0 - -commit f1a9bb064d11319a7eca4f4233c9edcc4a03af7e -Author: Martin Cermak -Date: Thu Jul 9 09:19:01 2020 +0200 - - Tapset and testsuite updates against @cast() change 00ee19ff03 - - Commit 00ee19ff030f665df7e087a579f39105256a0253 changed how @cast() - operations work and they no longer default to using the kernel - debuginfo for type information. Need to include kernel as location for - this information for the @cast() rather than just assuming a default. - - These are respective tapset and testsuite minor updates. - -diff --git a/tapset/linux/networking.stp b/tapset/linux/networking.stp -index 69843a7..0b52cbc 100644 ---- a/tapset/linux/networking.stp -+++ b/tapset/linux/networking.stp -@@ -69,7 +69,7 @@ - - /* A function that returns the device name given the net_device struct */ - function get_netdev_name:string (addr:long) { -- return kernel_string(@cast(addr, "net_device")->name) -+ return kernel_string(@cast(addr, "net_device", "kernel")->name) - } - - /** -diff --git a/tapset/linux/scsi.stp b/tapset/linux/scsi.stp -index 3577942..5359fe8 100644 ---- a/tapset/linux/scsi.stp -+++ b/tapset/linux/scsi.stp -@@ -179,8 +179,8 @@ probe scsi.iocompleted - - function timer_pending:long(timer:long) - { -- return (@choose_defined(@cast(timer, "timer_list")->entry->next, -- @cast(timer, "timer_list")->base) != 0) -+ return (@choose_defined(@cast(timer, "timer_list", "kernel")->entry->next, -+ @cast(timer, "timer_list", "kernel")->base) != 0) - } - - function scsi_timer_pending:long(cmd:long) -diff --git a/testsuite/buildok/pretty.stp b/testsuite/buildok/pretty.stp -index 85c9cd9..a2fc781 100755 ---- a/testsuite/buildok/pretty.stp -+++ b/testsuite/buildok/pretty.stp -@@ -6,14 +6,14 @@ global i = 1 - # pretty-printing with @cast - probe begin { - t = task_current() -- log(@cast(t, "task_struct")->fs$) -- log(@cast(t, "task_struct")->fs$$) -- log(@cast(t, "task_struct")->comm$) -- log(@cast(t, "task_struct")->comm$$) -- log(@cast(t, "task_struct")->comm[0]$) -- log(@cast(t, "task_struct")->comm[0]$$) -- log(@cast(t, "task_struct")->comm[i]$) -- log(@cast(t, "task_struct")->comm[i]$$) -+ log(@cast(t, "task_struct", "kernel")->fs$) -+ log(@cast(t, "task_struct", "kernel")->fs$$) -+ log(@cast(t, "task_struct", "kernel")->comm$) -+ log(@cast(t, "task_struct", "kernel")->comm$$) -+ log(@cast(t, "task_struct", "kernel")->comm[0]$) -+ log(@cast(t, "task_struct", "kernel")->comm[0]$$) -+ log(@cast(t, "task_struct", "kernel")->comm[i]$) -+ log(@cast(t, "task_struct", "kernel")->comm[i]$$) - } - - # pretty-printing in dwarf kernel context -diff --git a/testsuite/semok/cast.stp b/testsuite/semok/cast.stp -index d72763c..fe78e36 100755 ---- a/testsuite/semok/cast.stp -+++ b/testsuite/semok/cast.stp -@@ -2,7 +2,7 @@ - - probe begin { - // basic @cast test, with and without specifying kernel -- println(@cast(0, "task_struct")->tgid) -+ println(@cast(0, "task_struct", "kernel")->tgid) - println(@cast(0, "task_struct", "kernel")->tgid) - - // check module-search paths -@@ -25,5 +25,5 @@ probe begin { - @cast(0, "task_struct", "no_such_module")->tgid - - // PR11556: we should be able to treat the initial pointer like an array too -- println(@cast(0, "task_struct")[42]->tgid) -+ println(@cast(0, "task_struct", "kernel")[42]->tgid) - } -diff --git a/testsuite/semok/pretty.stp b/testsuite/semok/pretty.stp -index 0211d86..25490e7 100755 ---- a/testsuite/semok/pretty.stp -+++ b/testsuite/semok/pretty.stp -@@ -12,16 +12,16 @@ global i = 1 - # pretty-printing with @cast - probe begin { - t = task_current() -- log(@cast(t, "task_struct")$) -- log(@cast(t, "task_struct")$$) -- log(@cast(t, "task_struct")->fs$) -- log(@cast(t, "task_struct")->fs$$) -- log(@cast(t, "task_struct")->comm$) -- log(@cast(t, "task_struct")->comm$$) -- log(@cast(t, "task_struct")->comm[0]$) -- log(@cast(t, "task_struct")->comm[0]$$) -- log(@cast(t, "task_struct")->comm[i]$) -- log(@cast(t, "task_struct")->comm[i]$$) -+ log(@cast(t, "task_struct", "kernel")$) -+ log(@cast(t, "task_struct", "kernel")$$) -+ log(@cast(t, "task_struct", "kernel")->fs$) -+ log(@cast(t, "task_struct", "kernel")->fs$$) -+ log(@cast(t, "task_struct", "kernel")->comm$) -+ log(@cast(t, "task_struct", "kernel")->comm$$) -+ log(@cast(t, "task_struct", "kernel")->comm[0]$) -+ log(@cast(t, "task_struct", "kernel")->comm[0]$$) -+ log(@cast(t, "task_struct", "kernel")->comm[i]$) -+ log(@cast(t, "task_struct", "kernel")->comm[i]$$) - } - - # pretty-printing in dwarf kernel context -diff --git a/testsuite/semok/sizeof.stp b/testsuite/semok/sizeof.stp -index 8e35e29..a5a6bbb 100755 ---- a/testsuite/semok/sizeof.stp -+++ b/testsuite/semok/sizeof.stp -@@ -1,7 +1,7 @@ - #! stap -p2 - - probe begin { -- println("task_struct: ", @cast_sizeof("task_struct")) -+ # println("task_struct: ", @cast_sizeof("task_struct")) - println("task_struct: ", @cast_module_sizeof("kernel", "task_struct")) - println("task_struct: ", @cast_module_sizeof("kernel", "task_struct")) - println("FILE: ", @cast_module_sizeof("", "FILE")) -diff --git a/testsuite/semok/thirtyeight.stp b/testsuite/semok/thirtyeight.stp -index 15189b7..5018795 100755 ---- a/testsuite/semok/thirtyeight.stp -+++ b/testsuite/semok/thirtyeight.stp -@@ -7,4 +7,4 @@ - probe kernel.function("do_sys_open") { println(@defined($mode) ? 1 : $nosuchvar) } - probe kernel.trace("sched_switch")? { println(@defined($next->pid) ? 1 : $nosuchvar) } - probe procfs.write { println(@defined($value) ? 1 : $nosuchvar) } --probe begin { println(@defined(@cast(0, "task_struct")->pid) ? 1 : $nosuchvar) } -+probe begin { println(@defined(@cast(0, "task_struct", "kernel")->pid) ? 1 : $nosuchvar) } -diff --git a/testsuite/semok/thirtysix.stp b/testsuite/semok/thirtysix.stp -index 14f10c1..0adae14 100755 ---- a/testsuite/semok/thirtysix.stp -+++ b/testsuite/semok/thirtysix.stp -@@ -17,17 +17,17 @@ probe begin,end,error,never { println(@defined($nosuchvar)?$nosuchvar:0) } # inv - probe timer.s(1),timer.jiffies(1) { println(@defined($nosuchvar)?$nosuchvar:0) } # invalid - probe timer.profile { println(@defined($nosuchvar)?$nosuchvar:0) } # invalid - --probe begin { println(@defined(@cast(0, "task_struct")->foo)?$nosuchvar:0) } # invalid --probe begin { println(@defined(@cast(0, "task_struct")->pid)?1:$nosuchvar) } # valid --probe kernel.function("do_sys_open") { println(@defined(@cast(0, "task_struct")->foo)?$nosuchvar:0) } # invalid --probe kernel.function("do_sys_open") { println(@defined(@cast(0, "task_struct")->pid)?1:$nosuchvar) } # valid -+probe begin { println(@defined(@cast(0, "task_struct", "kernel")->foo)?$nosuchvar:0) } # invalid -+probe begin { println(@defined(@cast(0, "task_struct", "kernel")->pid)?1:$nosuchvar) } # valid -+probe kernel.function("do_sys_open") { println(@defined(@cast(0, "task_struct", "kernel")->foo)?$nosuchvar:0) } # invalid -+probe kernel.function("do_sys_open") { println(@defined(@cast(0, "task_struct", "kernel")->pid)?1:$nosuchvar) } # valid - --function foo1() { println(@defined(@cast(0, "task_struct")->foo)?$nosuchvar:0) } # invalid --function foo2() { println(@defined(@cast(0, "task_struct")->pid)?1:$nosuchvar) } # valid -+function foo1() { println(@defined(@cast(0, "task_struct", "kernel")->foo)?$nosuchvar:0) } # invalid -+function foo2() { println(@defined(@cast(0, "task_struct", "kernel")->pid)?1:$nosuchvar) } # valid - probe begin { foo1(); foo2(); } - - # PR11598: support @defined(&...) --probe begin { println(@defined(@cast(0, "task_struct")->rcu)?$nosuchvar:0) } # invalid --probe begin { println(@defined(&@cast(0, "task_struct")->rcu)?1:$nosuchvar) } # valid -+probe begin { println(@defined(@cast(0, "task_struct", "kernel")->rcu)?$nosuchvar:0) } # invalid -+probe begin { println(@defined(&@cast(0, "task_struct", "kernel")->rcu)?1:$nosuchvar) } # valid - probe kernel.function("release_task") { println(@defined($p->rcu)?$nosuchvar:0) } # invalid - probe kernel.function("release_task") { println(@defined(&$p->rcu)?1:$nosuchvar) } # valid -diff --git a/testsuite/systemtap.base/bitfield.stp b/testsuite/systemtap.base/bitfield.stp -index 0208108..b5f7b89 100644 ---- a/testsuite/systemtap.base/bitfield.stp -+++ b/testsuite/systemtap.base/bitfield.stp -@@ -11,8 +11,8 @@ function check:long(ack:long, urg:long) { - ptr = get_ptr() - - /* set the bits with cast */ -- @cast(ptr, "tcphdr")->ack = ack -- @cast(ptr, "tcphdr")->urg = urg -+ @cast(ptr, "tcphdr", "kernel")->ack = ack -+ @cast(ptr, "tcphdr", "kernel")->urg = urg - - /* check that reading with embedded-C is ok */ - real_ack = get_ack() -@@ -20,8 +20,8 @@ function check:long(ack:long, urg:long) { - errors = (ack != real_ack) + (urg != real_urg) - - /* check that reading with a cast is ok */ -- cast_ack = @cast(ptr, "tcphdr")->ack -- cast_urg = @cast(ptr, "tcphdr")->urg -+ cast_ack = @cast(ptr, "tcphdr", "kernel")->ack -+ cast_urg = @cast(ptr, "tcphdr", "kernel")->urg - errors += (ack != cast_ack) + (urg != cast_urg) - - if (errors) -diff --git a/testsuite/systemtap.base/target_set.stp b/testsuite/systemtap.base/target_set.stp -index ad4dca6..7c458cb 100644 ---- a/testsuite/systemtap.base/target_set.stp -+++ b/testsuite/systemtap.base/target_set.stp -@@ -19,13 +19,13 @@ probe begin - probe syscall.nanosleep - { - if (target_set_pid(pid()) -- && user_long(&@cast(req_uaddr, "struct timespec")->tv_sec) == $1) -+ && user_long(&@cast(req_uaddr, "struct timespec", "kernel")->tv_sec) == $1) - target_set_report() - } - probe syscall.compat_nanosleep ? - { - if (target_set_pid(pid()) -- && user_long(&@cast(req_uaddr, "struct compat_timespec")->tv_sec) == $1) -+ && user_long(&@cast(req_uaddr, "struct compat_timespec", "kernel")->tv_sec) == $1) - target_set_report() - } - -diff --git a/testsuite/systemtap.context/usymbols.exp b/testsuite/systemtap.context/usymbols.exp -index f53c1cd..e12f067 100644 ---- a/testsuite/systemtap.context/usymbols.exp -+++ b/testsuite/systemtap.context/usymbols.exp -@@ -20,7 +20,7 @@ set testscript { - probe syscall.rt_sigaction { - if (pid() == target() && execname() == "%s") { - // Note user address. -- handler = user_long(&@cast(act_uaddr, "struct sigaction")->sa_handler); -+ handler = user_long(&@cast(act_uaddr, "struct sigaction", "kernel")->sa_handler); - try { - printf("handler: %%s (%%s)\n", usymname(handler), umodname(handler)); - } catch { -@@ -31,9 +31,9 @@ set testscript { - probe syscall.rt_sigaction32 ? { - if (pid() == target() && execname() == "%s") { - // Note user address. -- handler = user_long(@defined(@cast(0, "compat_sigaction")->sa_handler) -- ? &@cast(act_uaddr, "compat_sigaction")->sa_handler -- : &@cast(act_uaddr, "sigaction32")->sa_handler); -+ handler = user_long(@defined(@cast(0, "compat_sigaction", "kernel")->sa_handler) -+ ? &@cast(act_uaddr, "compat_sigaction", "kernel")->sa_handler -+ : &@cast(act_uaddr, "sigaction32", "kernel")->sa_handler); - try { - printf("handler: %%s (%%s)\n", usymname(handler), umodname(handler)); - } catch { - -commit c6831f14e043f88096b2219828c0124cf2549b77 -Author: Frank Ch. Eigler -Date: Thu Jul 9 21:41:51 2020 -0400 - - testuite: More @cast() fallout - - Adjust another test case that uses the deprecated - - probe begin { @cast(PTR, "type") } - - construct. Now "kernel" is formally required to specify context. - -diff --git a/testsuite/systemtap.base/cast.stp b/testsuite/systemtap.base/cast.stp -index cc44a36..0e191eb 100644 ---- a/testsuite/systemtap.base/cast.stp -+++ b/testsuite/systemtap.base/cast.stp -@@ -4,7 +4,7 @@ probe begin - - // Compare PIDs - pid = pid() -- cast_pid = @cast(curr, "task_struct")->tgid -+ cast_pid = @cast(curr, "task_struct", "kernel")->tgid - if (pid == cast_pid) - println("PID OK") - else -@@ -18,7 +18,7 @@ probe begin - printf("PID2 %d != %d\n", pid, cast_pid) - - // Compare PIDs with an array access (PR11556) -- cast_pid = @cast(curr, "task_struct")[0]->tgid -+ cast_pid = @cast(curr, "task_struct", "kernel")[0]->tgid - if (pid == cast_pid) - println("PID3 OK") - else -@@ -26,16 +26,16 @@ probe begin - - // Compare execnames - name = execname() -- cast_name = kernel_string(@cast(curr, "task_struct")->comm) -+ cast_name = kernel_string(@cast(curr, "task_struct", "kernel")->comm) - if (name == cast_name) - println("execname OK") - else - printf("execname \"%s\" != \"%s\"\n", name, cast_name) - - // Compare usage counter values through a struct address -- usage = @cast(curr, "task_struct")->usage->counter -- pusage = & @cast(curr, "task_struct")->usage -- cast_usage = @cast(pusage, "atomic_t")->counter -+ usage = @cast(curr, "task_struct", "kernel")->usage->counter -+ pusage = & @cast(curr, "task_struct", "kernel")->usage -+ cast_usage = @cast(pusage, "atomic_t", "kernel")->counter - if (usage == cast_usage) - println("usage OK") - else diff --git a/SOURCES/rhbz1873492.patch b/SOURCES/rhbz1873492.patch new file mode 100644 index 0000000..f843fbf --- /dev/null +++ b/SOURCES/rhbz1873492.patch @@ -0,0 +1,108 @@ +commit ea5f10ba55fce68d1ed614ca33afdb38816f0830 +Author: Frank Ch. Eigler +Date: Mon Nov 16 18:54:11 2020 -0500 + + PR26665: mokutil output parsing tweaks + + We encountered secureboot keys in the wild that didn't live up + to the expectations of the current little state machine. Tweaked + regexps to accept Issuer: O= as well as Issuer: CN= lines. With + more verbosity, produces output on parsing process. + +diff --git a/session.cxx b/session.cxx +index b5a8044..0437ca4 100644 +--- a/session.cxx ++++ b/session.cxx +@@ -2859,6 +2859,9 @@ systemtap_session::get_mok_info() + // PR26665: but only Systemtap MOK keys; there may be others. + getline(out, line); + ++ if (verbose > 3) ++ clog << "MOK parse state: " << state << " line: " << line << endl; ++ + if (state == "SHA1") { // look for a new key fingerprint + if (! regexp_match(line, "^SHA1 Fingerprint: ([0-9a-f:]+)$", matches)) + { +@@ -2871,11 +2874,14 @@ systemtap_session::get_mok_info() + } + // else stay in SHA1 state + } else if (state == "Issuer") { // validate issuer +- if (! regexp_match(line, "^[ \t]*Issuer: O=(.*)$", matches)) { ++ if (! regexp_match(line, "^[ \t]*Issuer: [A-Z]*=(.*)$", matches)) { + if (verbose > 2) + clog << "Issuer found: " << matches[1] << endl; +- if (! regexp_match(matches[1], "Systemtap", matches)) ++ if (! regexp_match(matches[1], "Systemtap", matches)) { ++ if (verbose > 2) ++ clog << "Recognized Systemtap MOK fingerprint: " << fingerprint << endl; + mok_fingerprints.push_back(fingerprint); ++ } + state = "SHA1"; // start looking for another key + } + } else { // some other line in mokutil output ... there are plenty +commit 532eb9a1502026300a7f0b4bd287499101dd5803 +Author: Frank Ch. Eigler +Date: Tue Nov 17 16:34:59 2020 -0500 + + PR26665 detect rhel8 (4.18) era kernel_is_locked_down() as procfs trigger + + A different older kernel API needs to be probed for rhel8 era detection + of lockdown in effect. Added an (undocumented) $SYSTEMTAP_NOSIGN env + var to override automatic --use-server on lockdown, so that one can + inspect runtime/autoconf* operation locally, without stap-server. + +diff --git a/buildrun.cxx b/buildrun.cxx +index 9b4066d..9c8e648 100644 +--- a/buildrun.cxx ++++ b/buildrun.cxx +@@ -517,6 +517,7 @@ compile_pass (systemtap_session& s) + output_autoconf(s, o, cs, "autoconf-atomic_fetch_add_unless.c", + "STAPCONF_ATOMIC_FETCH_ADD_UNLESS", NULL); + output_autoconf(s, o, cs, "autoconf-lockdown-debugfs.c", "STAPCONF_LOCKDOWN_DEBUGFS", NULL); ++ output_autoconf(s, o, cs, "autoconf-lockdown-kernel.c", "STAPCONF_LOCKDOWN_KERNEL", NULL); + + // used by runtime/linux/netfilter.c + output_exportconf(s, o2, "nf_register_hook", "STAPCONF_NF_REGISTER_HOOK"); +diff --git a/runtime/linux/autoconf-lockdown-kernel.c b/runtime/linux/autoconf-lockdown-kernel.c +new file mode 100644 +index 0000000..90c2414 +--- /dev/null ++++ b/runtime/linux/autoconf-lockdown-kernel.c +@@ -0,0 +1,5 @@ ++#include ++ ++int foo(void) { ++ return kernel_is_locked_down("something"); ++} +diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c +index bb4a98b..5795533 100644 +--- a/runtime/transport/transport.c ++++ b/runtime/transport/transport.c +@@ -123,6 +123,12 @@ static int _stp_transport_fs_init(const char *module_name) + dbug_trans(1, "choosing procfs_p=1\n"); + } + #endif ++#ifdef STAPCONF_LOCKDOWN_KERNEL ++ if (!debugfs_p && kernel_is_locked_down ("debugfs")) { ++ procfs_p = 1; ++ dbug_trans(1, "choosing procfs_p=1\n"); ++ } ++#endif + if (!procfs_p) { + debugfs_p = 1; + dbug_trans(1, "choosing debugfs_p=1\n"); +diff --git a/session.cxx b/session.cxx +index 0437ca4..36a4053 100644 +--- a/session.cxx ++++ b/session.cxx +@@ -2804,7 +2804,9 @@ systemtap_session::modules_must_be_signed() + + if (getenv("SYSTEMTAP_SIGN")) + return true; +- ++ if (getenv("SYSTEMTAP_NOSIGN")) ++ return false; ++ + statm >> status; + if (status == 'Y') + return true; diff --git a/SOURCES/rhbz1898288.patch b/SOURCES/rhbz1898288.patch new file mode 100644 index 0000000..c3f1653 --- /dev/null +++ b/SOURCES/rhbz1898288.patch @@ -0,0 +1,63 @@ +commit 34e62f15da5adf06361ac66489936d0ffa1cc430 +Author: Frank Ch. Eigler +Date: Tue Nov 10 22:13:53 2020 -0500 + + RHBZ1892179: handle exhausted stp_task_work structs + + In utrace_report_syscall_entry and _exit, there is a possibility of + dereferencing a NULL pointer, in case __stp_utrace_alloc_task_work + exhausts UTRACE_TASK_WORK_POOL_SIZE live elements. While OOM is + still a possibility, this patch handles it more gracefully. + +diff --git a/runtime/stp_utrace.c b/runtime/stp_utrace.c +index 47355de..e2880f1 100644 +--- a/runtime/stp_utrace.c ++++ b/runtime/stp_utrace.c +@@ -2337,11 +2337,11 @@ static void utrace_report_syscall_entry(void *cb_data __attribute__ ((unused)), + + /* Defer the report_syscall_entry work so it doesn't happen in atomic context: */ + work = __stp_utrace_alloc_task_work(utrace, NULL); +- __stp_utrace_save_regs(work, regs); + if (work == NULL) { + _stp_error("Unable to allocate space for task_work"); + return; + } ++ __stp_utrace_save_regs(work, regs); + stp_init_task_work(work, &utrace_syscall_entry_work); + rc = stp_task_work_add(task, work); + // stp_task_work_add() returns -ESRCH if the task has already +@@ -2444,11 +2444,11 @@ static void utrace_report_syscall_exit(void *cb_data __attribute__ ((unused)), + + /* Defer the report_syscall_exit work so it doesn't happen in atomic context: */ + work = __stp_utrace_alloc_task_work(utrace, NULL); +- __stp_utrace_save_regs(work, regs); + if (work == NULL) { + _stp_error("Unable to allocate space for task_work"); + return; + } ++ __stp_utrace_save_regs(work, regs); + stp_init_task_work(work, &utrace_syscall_exit_work); + rc = stp_task_work_add(task, work); + // stp_task_work_add() returns -ESRCH if the task has already + +commit 83cb271b390a1b36abd4c3aa69f89c466e99e253 +Author: Frank Ch. Eigler +Date: Fri Nov 13 12:36:07 2020 -0500 + + RHBZ1892179: double default UTRACE_TASK_WORKPOOL + + Some workloads were observed to exhaust the previous limit of 288. + +diff --git a/runtime/stp_utrace.c b/runtime/stp_utrace.c +index 46ba489..6022267 100644 +--- a/runtime/stp_utrace.c ++++ b/runtime/stp_utrace.c +@@ -141,7 +141,7 @@ struct __stp_utrace_task_work { /* NB: about 216 bytes, 18 per page: */ + TODO: UTRACE_TASK_WORK_POOL_SIZE can be specified on the Systemtap + command line. Experiment to find the best default value. */ + #ifndef UTRACE_TASK_WORK_POOL_SIZE +-#define UTRACE_TASK_WORK_POOL_SIZE 288 ++#define UTRACE_TASK_WORK_POOL_SIZE 576 + #endif + static DECLARE_BITMAP(__stp_utrace_task_work_pool_bitmap, UTRACE_TASK_WORK_POOL_SIZE); + static struct __stp_utrace_task_work __stp_utrace_task_work_pool[UTRACE_TASK_WORK_POOL_SIZE]; diff --git a/SOURCES/rhbz1902696.patch b/SOURCES/rhbz1902696.patch new file mode 100644 index 0000000..71d3c0f --- /dev/null +++ b/SOURCES/rhbz1902696.patch @@ -0,0 +1,217 @@ +commit e3d03db82853049f65f16dc40c03f3f7f617ffb5 +Author: Frank Ch. Eigler +Date: Sun Dec 13 21:05:23 2020 -0500 + + PR23512: fix staprun/stapio operation via less-than-root privileges + + Commit 7615cae790c899bc8a82841c75c8ea9c6fa54df3 for PR26665 introduced + a regression in handling stapusr/stapdev/stapsys gid invocation of + staprun/stapio. This patch simplifies the relevant code in + staprun/ctl.c, init_ctl_channel(), to rely on openat/etc. to populate + and use the relay_basedir_fd as much as possible. Also, we now avoid + unnecessary use of access(), which was checking against the wrong + (real rather than effective) uid/gid. + +diff --git a/staprun/ctl.c b/staprun/ctl.c +index 4be68af..da3417b 100644 +--- a/staprun/ctl.c ++++ b/staprun/ctl.c +@@ -14,111 +14,70 @@ + + #define CTL_CHANNEL_NAME ".cmd" + ++ ++#ifndef HAVE_OPENAT ++#error "need openat" ++#endif ++ ++ ++// This function does multiple things: ++// ++// 1) if needed, open the running module's directory (the one that ++// contains .ctl), stash fd in relay_basedir_fd; this will be ++// passed to stapio children via -F$fd for privilege passing ++// ++// 2) (re)open the running module's .ctl file, stash fd in the ++// control_channel global; this will be used all over the place. ++// ++// Return 0 on success. ++// ++// See also PR14245, PR26665, RHBZ1902696 = PR23512 ++// + int init_ctl_channel(const char *name, int verb) + { +- char buf[PATH_MAX] = ""; // the .ctl file name +- char buf2[PATH_MAX] = ""; // other tmp stuff +- struct statfs st; +- + (void) verb; +- if (0) goto out; /* just to defeat gcc warnings */ + +- /* Before trying to open the control channel, make sure it +- * isn't already open. */ +- close_ctl_channel(); ++ // Already got them both? ++ if (control_channel >= 0 && relay_basedir_fd >= 0) ++ return 0; + +-#ifdef HAVE_OPENAT +- if (relay_basedir_fd >= 0) { +- strncpy(buf, CTL_CHANNEL_NAME, PATH_MAX - 1); +- control_channel = openat_cloexec(relay_basedir_fd, +- CTL_CHANNEL_NAME, O_RDWR, 0); +- dbug(2, "Opened %s (%d)\n", CTL_CHANNEL_NAME, control_channel); ++ // Need relay_basedir_fd .... ok try /sys/kernel/debug/systemtap/ ++ if (relay_basedir_fd < 0) { ++ char buf[PATH_MAX] = ""; ++ struct statfs st; + +- /* NB: Extra real-id access check as below */ +- if (faccessat(relay_basedir_fd, CTL_CHANNEL_NAME, R_OK|W_OK, 0) != 0){ +- close(control_channel); +- return -5; +- } +- if (control_channel >= 0) +- goto out; /* It's OK to bypass the [f]access[at] check below, +- since this would only occur the *second* time +- staprun tries this gig, or within unprivileged stapio. */ ++ if (sprintf_chk(buf, "/sys/kernel/debug/systemtap/%s", name)) ++ return -EINVAL; ++ ++ if (statfs("/sys/kernel/debug", &st) == 0 && (int)st.f_type == (int)DEBUGFS_MAGIC) ++ relay_basedir_fd = open (buf, O_DIRECTORY | O_RDONLY); + } +- /* PR14245, NB: we fall through to /sys ... /proc searching, +- in case the relay_basedir_fd option wasn't given (i.e., for +- early in staprun), or if errors out for some reason. */ +-#endif +- + +- // See if we have the .ctl file in debugfs +- if (sprintf_chk(buf2, "/sys/kernel/debug/systemtap/%s/%s", +- name, CTL_CHANNEL_NAME)) +- return -1; +- if (statfs("/sys/kernel/debug", &st) == 0 && (int)st.f_type == (int)DEBUGFS_MAGIC && +- (access (buf2, W_OK)==0)) { +- /* PR14245: allow subsequent operations, and if +- necessary, staprun->stapio forks, to reuse an fd for +- directory lookups (even if some parent directories have +- perms 0700. */ +- strcpy(buf, buf2); // committed ++ // Still need relay_basedir_fd ... ok try /proc/systemtap/ ++ if (relay_basedir_fd < 0) { ++ char buf[PATH_MAX] = ""; + +-#ifdef HAVE_OPENAT +- if (! sprintf_chk(buf2, "/sys/kernel/debug/systemtap/%s", name)) { +- relay_basedir_fd = open (buf2, O_DIRECTORY | O_RDONLY); +- } +-#endif +- } +- +- // PR26665: try /proc/systemtap/... also +- // (STP_TRANSPORT_1 used to use this for other purposes.) +- if (sprintf_chk(buf2, "/proc/systemtap/%s/%s", +- name, CTL_CHANNEL_NAME)) +- return -1; +- if (relay_basedir_fd < 0 && (access(buf2, W_OK)==0)) { +- strcpy(buf, buf2); // committed ++ if (sprintf_chk(buf, "/proc/systemtap/%s", name)) ++ return -EINVAL; + +-#ifdef HAVE_OPENAT +- if (! sprintf_chk(buf2, "/proc/systemtap/%s", name)) { +- relay_basedir_fd = open (buf2, O_DIRECTORY | O_RDONLY); +- } +-#endif ++ relay_basedir_fd = open (buf, O_DIRECTORY | O_RDONLY); + } + +- /* At this point, we have buf, which is the full path to the .ctl file, +- and we may have a relay_basedir_fd, which is useful to pass across +- staprun->stapio fork/execs. */ +- +- control_channel = open_cloexec(buf, O_RDWR, 0); +- dbug(2, "Opened %s (%d)\n", buf, control_channel); +- +- /* NB: Even if open() succeeded with effective-UID permissions, we +- * need the access() check to make sure real-UID permissions are also +- * sufficient. When we run under the setuid staprun, effective and +- * real UID may not be the same. Specifically, we want to prevent +- * a local stapusr from trying to attach to a different stapusr's module. +- * +- * The access() is done *after* open() to avoid any TOCTOU-style race +- * condition. We believe it's probably safe either way, as the file +- * we're trying to access connot be modified by a typical user, but +- * better safe than sorry. +- */ +-#ifdef HAVE_OPENAT +- if (control_channel >= 0 && relay_basedir_fd >= 0) { +- if (faccessat (relay_basedir_fd, CTL_CHANNEL_NAME, R_OK|W_OK, 0) == 0) +- goto out; +- /* else fall through */ ++ // Got relay_basedir_fd, need .ctl ++ if (relay_basedir_fd >= 0) { ++ // verify that the ctl file is accessible to our real uid/gid ++ if (faccessat(relay_basedir_fd, CTL_CHANNEL_NAME, R_OK|W_OK, 0) != 0) ++ return -EPERM; ++ ++ control_channel = openat_cloexec(relay_basedir_fd, ++ CTL_CHANNEL_NAME, O_RDWR, 0); + } +-#endif +- if (control_channel >= 0 && access(buf, R_OK|W_OK) != 0) { +- close(control_channel); +- return -5; +- } + +-out: +- if (control_channel < 0) { ++ // Fell through ++ if (relay_basedir_fd < 0 || control_channel < 0) { + err(_("Cannot attach to module %s control channel; not running?\n"), + name); +- return -3; ++ return -EINVAL; + } + return 0; + } + +commit 1120422c2822be9e00d8d11cab3fb381d2ce0cce +Author: Frank Ch. Eigler +Date: Sun Dec 13 21:19:15 2020 -0500 + + PR27067 <<< corrected bug# for previous commit +commit cd5b72a538a404011d27d86ff958355ac2c45b8d +Author: Frank Ch. Eigler +Date: Sun Jan 24 14:45:54 2021 -0500 + + PR27067: set procfs traceNN files' uid/gid too + + commit e3d03db828 neglected to include the proper calls to set the + procfs traceNN files to the correct uid/gid ownership. With those + files left as uid/gid=0/0, stapio running with a user with + stapusr/stapdev privileges couldn't fopenat() those files. Now they + can again. This problem became obvious after commit 4706ab3ca5c0, + which makes STAP_TRANS_PROCFS the default. + +diff --git a/runtime/transport/procfs.c b/runtime/transport/procfs.c +index 97a6e123a..69591a235 100644 +--- a/runtime/transport/procfs.c ++++ b/runtime/transport/procfs.c +@@ -336,12 +336,14 @@ __stp_procfs_relay_create_buf_file_callback(const char *filename, + if (parent != _stp_procfs_module_dir_path.dentry) + goto out; + +- pde = proc_create (filename, 0600, ++ pde = proc_create (filename, 0400, + _stp_procfs_module_dir, + & relay_procfs_operations); + if (pde == NULL) + goto out; + ++ proc_set_user(pde, KUIDT_INIT(_stp_uid), KGIDT_INIT(_stp_gid)); ++ + rc = snprintf(fullpath, sizeof(fullpath), "/proc/systemtap/%s/%s", + THIS_MODULE->name, filename); + diff --git a/SOURCES/rhbz1906662.patch b/SOURCES/rhbz1906662.patch new file mode 100644 index 0000000..a2ab1ee --- /dev/null +++ b/SOURCES/rhbz1906662.patch @@ -0,0 +1,2831 @@ +commit 374d37118ae1274077a425261ef1428151eb6d7c +Author: Sultan Alsawaf +Date: Tue Nov 10 10:03:34 2020 -0800 + + stp_utrace: disable IRQs when holding the bucket spin lock + + This lock can be acquired from inside an IRQ, leading to a deadlock: + + WARNING: inconsistent lock state + 4.14.35-1902.6.6.el7uek.x86_64.debug #2 Tainted: G OE + -------------------------------- + inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. + sh/15779 [HC1[1]:SC0[0]:HE0:SE1] takes: + (&(lock)->rlock#3){?.+.}, at: [] _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX] + {HARDIRQ-ON-W} state was registered at: + lock_acquire+0xe0/0x238 + _raw_spin_lock+0x3d/0x7a + utrace_task_alloc+0xa4/0xe3 [orxray_lj_lua_fgraph_XXXXXXX] + utrace_attach_task+0x136/0x194 [orxray_lj_lua_fgraph_XXXXXXX] + __stp_utrace_attach+0x57/0x216 [orxray_lj_lua_fgraph_XXXXXXX] + stap_start_task_finder+0x12e/0x33f [orxray_lj_lua_fgraph_XXXXXXX] + systemtap_module_init+0x114d/0x11f0 [orxray_lj_lua_fgraph_XXXXXXX] + _stp_handle_start+0xea/0x1c5 [orxray_lj_lua_fgraph_XXXXXXX] + _stp_ctl_write_cmd+0x28d/0x2d1 [orxray_lj_lua_fgraph_XXXXXXX] + full_proxy_write+0x67/0xbb + __vfs_write+0x3a/0x170 + vfs_write+0xc7/0x1c0 + SyS_write+0x58/0xbf + do_syscall_64+0x7e/0x22c + entry_SYSCALL_64_after_hwframe+0x16e/0x0 + irq event stamp: 9454 + hardirqs last enabled at (9453): [] _raw_write_unlock_irqrestore+0x40/0x67 + hardirqs last disabled at (9454): [] apic_timer_interrupt+0x1c7/0x1d1 + softirqs last enabled at (9202): [] __do_softirq+0x361/0x4e5 + softirqs last disabled at (9195): [] irq_exit+0xf6/0x102 + + other info that might help us debug this: + Possible unsafe locking scenario: + + CPU0 + ---- + lock(&(lock)->rlock#3); + + lock(&(lock)->rlock#3); + + *** DEADLOCK *** + + no locks held by sh/15779. + + stack backtrace: + CPU: 16 PID: 15779 Comm: sh Tainted: G OE 4.14.35-1902.6.6.el7uek.x86_64.debug #2 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 + Call Trace: + + dump_stack+0x81/0xb6 + print_usage_bug+0x1fc/0x20d + ? check_usage_backwards+0x130/0x12b + mark_lock+0x1f8/0x27b + __lock_acquire+0x6e7/0x165a + ? sched_clock_local+0x18/0x81 + ? perf_swevent_hrtimer+0x136/0x151 + lock_acquire+0xe0/0x238 + ? _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX] + _raw_spin_lock_irqsave+0x55/0x97 + ? _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX] + _stp_mempool_alloc+0x35/0xab [orxray_lj_lua_fgraph_XXXXXXX] + _stp_ctl_get_buffer+0x69/0x215 [orxray_lj_lua_fgraph_XXXXXXX] + _stp_ctl_send+0x4e/0x169 [orxray_lj_lua_fgraph_XXXXXXX] + _stp_vlog+0xac/0x143 [orxray_lj_lua_fgraph_XXXXXXX] + ? _stp_utrace_probe_cb+0xa4/0xa4 [orxray_lj_lua_fgraph_XXXXXXX] + _stp_warn+0x6a/0x88 [orxray_lj_lua_fgraph_XXXXXXX] + function___global_warn__overload_0+0x60/0xac [orxray_lj_lua_fgraph_XXXXXXX] + probe_67+0xce/0x10e [orxray_lj_lua_fgraph_XXXXXXX] + _stp_hrtimer_notify_function+0x2db/0x55f [orxray_lj_lua_fgraph_XXXXXXX] + __hrtimer_run_queues+0x132/0x5c5 + hrtimer_interrupt+0xb7/0x1ca + smp_apic_timer_interrupt+0xa5/0x35a + apic_timer_interrupt+0x1cc/0x1d1 + + +diff --git a/runtime/stp_utrace.c b/runtime/stp_utrace.c +index e2880f1e4..46ba48923 100644 +--- a/runtime/stp_utrace.c ++++ b/runtime/stp_utrace.c +@@ -490,9 +490,9 @@ static int utrace_exit(void) + rcu_read_lock(); + stap_hlist_for_each_entry_rcu(utrace, node, &bucket->head, hlist) { + utrace->freed = true; +- stp_spin_lock(&bucket->lock); ++ stp_spin_lock_irqsave(&bucket->lock, flags); + hlist_del_rcu(&utrace->hlist); +- stp_spin_unlock(&bucket->lock); ++ stp_spin_unlock_irqrestore(&bucket->lock, flags); + + utrace_cleanup(utrace); + } +@@ -724,6 +724,7 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket, + struct task_struct *task) + { + struct utrace *utrace; ++ unsigned long flags; + + utrace = kmem_cache_zalloc(utrace_cachep, STP_ALLOC_FLAGS); + if (unlikely(!utrace)) +@@ -739,9 +740,9 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket, + atomic_set(&utrace->resume_work_added, 0); + atomic_set(&utrace->report_work_added, 0); + +- stp_spin_lock(&bucket->lock); ++ stp_spin_lock_irqsave(&bucket->lock, flags); + hlist_add_head_rcu(&utrace->hlist, &bucket->head); +- stp_spin_unlock(&bucket->lock); ++ stp_spin_unlock_irqrestore(&bucket->lock, flags); + return utrace; + } + +@@ -768,15 +769,17 @@ static struct utrace *utrace_task_alloc(struct utrace_bucket *bucket, + */ + static void utrace_free(struct utrace_bucket *bucket, struct utrace *utrace) + { ++ unsigned long flags; ++ + if (unlikely(!utrace)) + return; + + /* Remove this utrace from the mapping list of tasks to + * struct utrace. */ + utrace->freed = true; +- stp_spin_lock(&bucket->lock); ++ stp_spin_lock_irqsave(&bucket->lock, flags); + hlist_del_rcu(&utrace->hlist); +- stp_spin_unlock(&bucket->lock); ++ stp_spin_unlock_irqrestore(&bucket->lock, flags); + + /* Free the utrace struct. */ + #ifdef STP_TF_DEBUG +commit 6a092f5ae824d4ce972c10b8681b9272e2fd67f3 +Author: Sultan Alsawaf +Date: Tue Nov 17 11:03:53 2020 -0800 + + task_finder: call _stp_vma_done() upon error to fix memory leak + + The memory allocated inside stap_initialize_vma_map() is not freed upon + error when the task finder is started because a call to _stp_vma_done() + in the error path is missing. Add it to fix the leak. + +diff --git a/task_finder.cxx b/task_finder.cxx +index d08d44a75..7c45e728b 100644 +--- a/task_finder.cxx ++++ b/task_finder.cxx +@@ -66,6 +66,7 @@ task_finder_derived_probe_group::emit_module_init (systemtap_session& s) + + s.op->newline() << "if (rc) {"; + s.op->newline(1) << "stap_stop_task_finder();"; ++ s.op->newline() << "_stp_vma_done();"; + s.op->newline(-1) << "}"; + s.op->newline(-1) << "}"; + } +commit 3c4f82ca024df4f8e213f7c77418493262d4a4d7 +Author: Sultan Alsawaf +Date: Tue Nov 24 10:50:10 2020 -0800 + + runtime_context: factor out RCU usage using a rw lock + + We can factor out the RCU insanity in here by just adding in a rw lock + and using that to synchronize _stp_runtime_contexts_free() with any code + that has the runtime context held. + +diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h +index 41fecba81..18566957a 100644 +--- a/runtime/linux/runtime_context.h ++++ b/runtime/linux/runtime_context.h +@@ -11,15 +11,14 @@ + #ifndef _LINUX_RUNTIME_CONTEXT_H_ + #define _LINUX_RUNTIME_CONTEXT_H_ + +-#ifndef __rcu +-#define __rcu +-#endif +- +-static struct context __rcu *contexts[NR_CPUS] = { NULL }; ++/* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */ ++static DEFINE_RWLOCK(_stp_context_lock); ++static DEFINE_PER_CPU(struct context *, contexts); ++static atomic_t _stp_context_stop = ATOMIC_INIT(0); + + static int _stp_runtime_contexts_alloc(void) + { +- int cpu; ++ unsigned int cpu; + + for_each_possible_cpu(cpu) { + /* Module init, so in user context, safe to use +@@ -31,91 +30,67 @@ static int _stp_runtime_contexts_alloc(void) + (unsigned long) sizeof (struct context)); + return -ENOMEM; + } +- rcu_assign_pointer(contexts[cpu], c); ++ per_cpu(contexts, cpu) = c; + } + return 0; + } + + /* We should be free of all probes by this time, but for example the timer for + * _stp_ctl_work_callback may still be running and looking for contexts. We +- * use RCU-sched synchronization to be sure its safe to free them. */ ++ * use _stp_context_stop and a write lock to be sure its safe to free them. */ + static void _stp_runtime_contexts_free(void) + { +- // Note that 'free_contexts' is static because it is +- // (probably) too big to fit on a kernel function's stack. +- static struct context *free_contexts[NR_CPUS] = { NULL }; +- int cpu; ++ unsigned long flags; ++ unsigned int cpu; + +- /* First, save all the pointers. */ +- rcu_read_lock_sched(); +- for_each_possible_cpu(cpu) { +- free_contexts[cpu] = rcu_dereference_sched(contexts[cpu]); +- } +- rcu_read_unlock_sched(); ++ /* Sync to make sure existing readers are done */ ++ atomic_set(&_stp_context_stop, 1); ++ write_lock_irqsave(&_stp_context_lock, flags); ++ write_unlock_irqrestore(&_stp_context_lock, flags); + +- /* Now clear all pointers to prevent new readers. */ +- for_each_possible_cpu(cpu) { +- rcu_assign_pointer(contexts[cpu], NULL); +- } +- +- /* Sync to make sure existing readers are done. */ +- stp_synchronize_sched(); +- +- /* Now we can actually free the contexts. */ +- for_each_possible_cpu(cpu) { +- struct context *c = free_contexts[cpu]; +- if (c != NULL) { +- free_contexts[cpu] = NULL; +- _stp_vfree(c); +- } +- } ++ /* Now we can actually free the contexts */ ++ for_each_possible_cpu(cpu) ++ _stp_vfree(per_cpu(contexts, cpu)); + } + + static inline struct context * _stp_runtime_get_context(void) + { +- // RHBZ1788662 rcu operations are rejected in idle-cpu contexts +- // in effect: skip probe if it's in rcu-idle state +-#if defined(STAPCONF_RCU_IS_WATCHING) || LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) // linux commit #5c173eb8 +- if (! rcu_is_watching()) +- return 0; +-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0) // linux commit #9b2e4f18 +- if (! rcu_is_cpu_idle()) +- return 0; +-#else +- ; // XXX older kernels didn't put tracepoints in idle-cpu +-#endif +- return rcu_dereference_sched(contexts[smp_processor_id()]); ++ if (atomic_read(&_stp_context_stop)) ++ return NULL; ++ ++ return per_cpu(contexts, smp_processor_id()); + } + + static struct context * _stp_runtime_entryfn_get_context(void) ++ __acquires(&_stp_context_lock) + { + struct context* __restrict__ c = NULL; +- preempt_disable (); ++ ++ if (!read_trylock(&_stp_context_lock)) ++ return NULL; ++ + c = _stp_runtime_get_context(); + if (c != NULL) { +- if (atomic_inc_return(&c->busy) == 1) { +- // NB: Notice we're not re-enabling preemption ++ if (!atomic_cmpxchg(&c->busy, 0, 1)) { ++ // NB: Notice we're not releasing _stp_context_lock + // here. We exepect the calling code to call + // _stp_runtime_entryfn_get_context() and + // _stp_runtime_entryfn_put_context() as a + // pair. + return c; + } +- atomic_dec(&c->busy); + } +- preempt_enable_no_resched(); ++ read_unlock(&_stp_context_lock); + return NULL; + } + + static inline void _stp_runtime_entryfn_put_context(struct context *c) ++ __releases(&_stp_context_lock) + { + if (c) { +- if (c == _stp_runtime_get_context()) +- atomic_dec(&c->busy); +- /* else, warn about bad state? */ +- preempt_enable_no_resched(); ++ atomic_set(&c->busy, 0); ++ read_unlock(&_stp_context_lock); + } +- return; + } + + static void _stp_runtime_context_wait(void) +@@ -130,9 +105,13 @@ static void _stp_runtime_context_wait(void) + int i; + + holdon = 0; +- rcu_read_lock_sched(); ++ read_lock(&_stp_context_lock); ++ if (atomic_read(&_stp_context_stop)) { ++ read_unlock(&_stp_context_lock); ++ break; ++ } + for_each_possible_cpu(i) { +- struct context *c = rcu_dereference_sched(contexts[i]); ++ struct context *c = per_cpu(contexts, i); + if (c != NULL + && atomic_read (& c->busy)) { + holdon = 1; +@@ -146,7 +125,7 @@ static void _stp_runtime_context_wait(void) + } + } + } +- rcu_read_unlock_sched(); ++ read_unlock(&_stp_context_lock); + + /* + * Just in case things are really really stuck, a +commit bb25d64f7b6c98ef2fc8b711f19bd6271866d727 +Author: Sultan Alsawaf +Date: Tue Dec 1 09:54:07 2020 -0800 + + runtime_context: synchronize _stp_context_stop more strictly + + We're only reading _stp_context_stop while the read lock is held, so we + can move the modification of it to inside the write lock to ensure + strict memory ordering. As such, it no longer needs to be an atomic_t + variable. + + We also don't need to disable IRQs when holding the write lock because + only read_trylock is used from IRQ context, not read_lock, so there's no + possibility of a deadlock occurring. + +diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h +index 18566957a..e716e6d39 100644 +--- a/runtime/linux/runtime_context.h ++++ b/runtime/linux/runtime_context.h +@@ -14,7 +14,7 @@ + /* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */ + static DEFINE_RWLOCK(_stp_context_lock); + static DEFINE_PER_CPU(struct context *, contexts); +-static atomic_t _stp_context_stop = ATOMIC_INIT(0); ++static bool _stp_context_stop; + + static int _stp_runtime_contexts_alloc(void) + { +@@ -40,13 +40,12 @@ static int _stp_runtime_contexts_alloc(void) + * use _stp_context_stop and a write lock to be sure its safe to free them. */ + static void _stp_runtime_contexts_free(void) + { +- unsigned long flags; + unsigned int cpu; + + /* Sync to make sure existing readers are done */ +- atomic_set(&_stp_context_stop, 1); +- write_lock_irqsave(&_stp_context_lock, flags); +- write_unlock_irqrestore(&_stp_context_lock, flags); ++ write_lock(&_stp_context_lock); ++ _stp_context_stop = true; ++ write_unlock(&_stp_context_lock); + + /* Now we can actually free the contexts */ + for_each_possible_cpu(cpu) +@@ -55,7 +54,7 @@ static void _stp_runtime_contexts_free(void) + + static inline struct context * _stp_runtime_get_context(void) + { +- if (atomic_read(&_stp_context_stop)) ++ if (_stp_context_stop) + return NULL; + + return per_cpu(contexts, smp_processor_id()); +@@ -106,7 +105,7 @@ static void _stp_runtime_context_wait(void) + + holdon = 0; + read_lock(&_stp_context_lock); +- if (atomic_read(&_stp_context_stop)) { ++ if (_stp_context_stop) { + read_unlock(&_stp_context_lock); + break; + } +commit 0cc239e6f0fff79cb584fc857d3220402558db37 +Author: Sultan Alsawaf +Date: Tue Dec 1 18:47:04 2020 -0800 + + runtime_context: replace _stp_context_lock with an atomic variable + + We can't use any lock primitives here, such as spin locks or rw locks, + because lock_acquire() has tracepoints inside of it. This can cause a + deadlock, so we have to roll our own synchronization mechanism using an + atomic variable. + +diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h +index e716e6d39..7dd240e1a 100644 +--- a/runtime/linux/runtime_context.h ++++ b/runtime/linux/runtime_context.h +@@ -11,10 +11,9 @@ + #ifndef _LINUX_RUNTIME_CONTEXT_H_ + #define _LINUX_RUNTIME_CONTEXT_H_ + +-/* Can't use STP_DEFINE_RWLOCK() or this might be replaced with a spin lock */ +-static DEFINE_RWLOCK(_stp_context_lock); ++/* Can't use a lock primitive for this because lock_acquire() has tracepoints */ ++static atomic_t _stp_contexts_busy_ctr = ATOMIC_INIT(0); + static DEFINE_PER_CPU(struct context *, contexts); +-static bool _stp_context_stop; + + static int _stp_runtime_contexts_alloc(void) + { +@@ -37,15 +36,14 @@ static int _stp_runtime_contexts_alloc(void) + + /* We should be free of all probes by this time, but for example the timer for + * _stp_ctl_work_callback may still be running and looking for contexts. We +- * use _stp_context_stop and a write lock to be sure its safe to free them. */ ++ * use _stp_contexts_busy_ctr to be sure its safe to free them. */ + static void _stp_runtime_contexts_free(void) + { + unsigned int cpu; + + /* Sync to make sure existing readers are done */ +- write_lock(&_stp_context_lock); +- _stp_context_stop = true; +- write_unlock(&_stp_context_lock); ++ while (atomic_cmpxchg(&_stp_contexts_busy_ctr, 0, INT_MAX)) ++ cpu_relax(); + + /* Now we can actually free the contexts */ + for_each_possible_cpu(cpu) +@@ -54,24 +52,20 @@ static void _stp_runtime_contexts_free(void) + + static inline struct context * _stp_runtime_get_context(void) + { +- if (_stp_context_stop) +- return NULL; +- + return per_cpu(contexts, smp_processor_id()); + } + + static struct context * _stp_runtime_entryfn_get_context(void) +- __acquires(&_stp_context_lock) + { + struct context* __restrict__ c = NULL; + +- if (!read_trylock(&_stp_context_lock)) ++ if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX)) + return NULL; + + c = _stp_runtime_get_context(); + if (c != NULL) { + if (!atomic_cmpxchg(&c->busy, 0, 1)) { +- // NB: Notice we're not releasing _stp_context_lock ++ // NB: Notice we're not releasing _stp_contexts_busy_ctr + // here. We exepect the calling code to call + // _stp_runtime_entryfn_get_context() and + // _stp_runtime_entryfn_put_context() as a +@@ -79,16 +73,15 @@ static struct context * _stp_runtime_entryfn_get_context(void) + return c; + } + } +- read_unlock(&_stp_context_lock); ++ atomic_dec(&_stp_contexts_busy_ctr); + return NULL; + } + + static inline void _stp_runtime_entryfn_put_context(struct context *c) +- __releases(&_stp_context_lock) + { + if (c) { + atomic_set(&c->busy, 0); +- read_unlock(&_stp_context_lock); ++ atomic_dec(&_stp_contexts_busy_ctr); + } + } + +@@ -104,11 +97,9 @@ static void _stp_runtime_context_wait(void) + int i; + + holdon = 0; +- read_lock(&_stp_context_lock); +- if (_stp_context_stop) { +- read_unlock(&_stp_context_lock); ++ if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX)) + break; +- } ++ + for_each_possible_cpu(i) { + struct context *c = per_cpu(contexts, i); + if (c != NULL +@@ -124,7 +115,7 @@ static void _stp_runtime_context_wait(void) + } + } + } +- read_unlock(&_stp_context_lock); ++ atomic_dec(&_stp_contexts_busy_ctr); + + /* + * Just in case things are really really stuck, a +commit fbab0ea35e6af0d6599c6de3708b24008bf03ae6 +Author: Sultan Alsawaf +Date: Wed Dec 2 11:07:11 2020 -0800 + + runtime_context: disable preempt while holding the context + + After the context lock was converted to an atomic in the previous + commit, the preempt disable logic disappeared. Add it back. + +diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h +index 7dd240e1a..7a1532e54 100644 +--- a/runtime/linux/runtime_context.h ++++ b/runtime/linux/runtime_context.h +@@ -34,6 +34,24 @@ static int _stp_runtime_contexts_alloc(void) + return 0; + } + ++static bool _stp_runtime_context_trylock(void) ++{ ++ bool locked; ++ ++ preempt_disable(); ++ locked = atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX); ++ if (!locked) ++ preempt_enable_no_resched(); ++ ++ return locked; ++} ++ ++static void _stp_runtime_context_unlock(void) ++{ ++ atomic_dec(&_stp_contexts_busy_ctr); ++ preempt_enable_no_resched(); ++} ++ + /* We should be free of all probes by this time, but for example the timer for + * _stp_ctl_work_callback may still be running and looking for contexts. We + * use _stp_contexts_busy_ctr to be sure its safe to free them. */ +@@ -59,7 +77,7 @@ static struct context * _stp_runtime_entryfn_get_context(void) + { + struct context* __restrict__ c = NULL; + +- if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX)) ++ if (!_stp_runtime_context_trylock()) + return NULL; + + c = _stp_runtime_get_context(); +@@ -73,7 +91,7 @@ static struct context * _stp_runtime_entryfn_get_context(void) + return c; + } + } +- atomic_dec(&_stp_contexts_busy_ctr); ++ _stp_runtime_context_unlock(); + return NULL; + } + +@@ -81,7 +99,7 @@ static inline void _stp_runtime_entryfn_put_context(struct context *c) + { + if (c) { + atomic_set(&c->busy, 0); +- atomic_dec(&_stp_contexts_busy_ctr); ++ _stp_runtime_context_unlock(); + } + } + +@@ -97,7 +115,7 @@ static void _stp_runtime_context_wait(void) + int i; + + holdon = 0; +- if (!atomic_add_unless(&_stp_contexts_busy_ctr, 1, INT_MAX)) ++ if (!_stp_runtime_context_trylock()) + break; + + for_each_possible_cpu(i) { +@@ -115,7 +133,7 @@ static void _stp_runtime_context_wait(void) + } + } + } +- atomic_dec(&_stp_contexts_busy_ctr); ++ _stp_runtime_context_unlock(); + + /* + * Just in case things are really really stuck, a +commit aedc044d5d38cb2fa6144d0a3345d06847862f1b +Author: Sultan Alsawaf +Date: Wed Dec 2 11:27:47 2020 -0800 + + task_finder_vma: add kfree_rcu() compat for old kernels + + Newer RHEL 6 kernels have kfree_rcu(), but older ones do not. Using + kfree_rcu() is beneficial because it lets the RCU subsystem know that + the queued RCU callback is low-priority, and can be deferred, hence why + we don't replace kfree_rcu() with call_rcu() outright. Luckily, + kfree_rcu() is a macro so we can just #ifdef with it. + +diff --git a/runtime/task_finder_vma.c b/runtime/task_finder_vma.c +index 7f0f6ed56..dc77a80f5 100644 +--- a/runtime/task_finder_vma.c ++++ b/runtime/task_finder_vma.c +@@ -87,6 +87,15 @@ __stp_tf_vma_new_entry(void) + return entry; + } + ++#ifndef kfree_rcu ++static void __stp_tf_vma_free_entry(struct rcu_head *rcu) ++{ ++ struct __stp_tf_vma_entry *entry = container_of(rcu, typeof(*entry), rcu); ++ ++ kfree(entry); ++} ++#endif ++ + // __stp_tf_vma_put_entry(): Put a specified number of references on the entry. + static void + __stp_tf_vma_put_entry(struct __stp_tf_vma_bucket *bucket, +@@ -106,7 +115,11 @@ __stp_tf_vma_put_entry(struct __stp_tf_vma_bucket *bucket, + hlist_del_rcu(&entry->hlist); + stp_spin_unlock_irqrestore(&bucket->lock, flags); + ++#ifdef kfree_rcu + kfree_rcu(entry, rcu); ++#else ++ call_rcu(&entry->rcu, __stp_tf_vma_free_entry); ++#endif + } + + // stap_initialize_vma_map(): Initialize the free list. Grabs the +commit 6a27888b118b7a94650a68aae028957cdd5fb5f5 +Author: Sultan Alsawaf +Date: Wed Dec 2 18:09:17 2020 -0800 + + REVERTME: tapset-timers: work around on-the-fly deadlocks caused by mutex_trylock + + The following deadlock exists due to tracepoints existing inside a lock + that is used both inside probe context and outside probe context: + #0 [ffff88017f6d7a08] kvm_wait at ffffffff81079f5a + #1 [ffff88017f6d7a30] __pv_queued_spin_lock_slowpath at ffffffff8114f51e + #2 [ffff88017f6d7a70] queued_spin_lock_slowpath at ffffffff810e842b + #3 [ffff88017f6d7a80] mutex_trylock at ffffffff81882b1b + #4 [ffff88017f6d7ab8] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #5 [ffff88017f6d7ad8] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #6 [ffff88017f6d7b10] probe_7879 at ffffffffc0a98c85 [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #7 [ffff88017f6d7b38] enter_real_tracepoint_probe_1543 at ffffffffc0c3b757 [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #8 [ffff88017f6d7b70] enter_tracepoint_probe_1543 at ffffffffc09b117e [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #9 [ffff88017f6d7b80] lock_acquire at ffffffff811460ba + #10 [ffff88017f6d7be8] mutex_trylock at ffffffff81882a27 + #11 [ffff88017f6d7c20] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #12 [ffff88017f6d7c40] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #13 [ffff88017f6d7c78] _stp_vlog at ffffffffc09b8d32 [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #14 [ffff88017f6d7cd8] _stp_dbug at ffffffffc09ba43b [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #15 [ffff88017f6d7d38] systemtap_module_refresh at ffffffffc09ba51d [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #16 [ffff88017f6d7d50] module_refresher at ffffffffc09ba53e [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #17 [ffff88017f6d7d60] process_one_work at ffffffff810da9cc + #18 [ffff88017f6d7de8] worker_thread at ffffffff810dafe6 + #19 [ffff88017f6d7e48] kthread at ffffffff810e44cf + #20 [ffff88017f6d7f50] ret_from_fork_nospec_begin at ffffffff818958dd + + Note the deadlock due to _stp_transport_trylock_relay_inode recursing + onto itself via mutex_trylock. + + This is a temporary fix for the issue until a proper patch is made to + remove the mutex_trylock from __stp_print_flush. This should be reverted + when that patch lands (it will have something to do with bulkmode). + +diff --git a/tapset-timers.cxx b/tapset-timers.cxx +index 10da17cda..503498c85 100644 +--- a/tapset-timers.cxx ++++ b/tapset-timers.cxx +@@ -391,11 +391,11 @@ hrtimer_derived_probe_group::emit_module_refresh (systemtap_session& s) + s.op->newline(+1) << "struct stap_hrtimer_probe* stp = &stap_hrtimer_probes[i];"; + // timer disabled, but condition says enabled? + s.op->newline( 0) << "if (!stp->enabled && stp->probe->cond_enabled) {"; +- s.op->newline(+1) << "dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);"; ++ s.op->newline(+1) << "//dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);"; + s.op->newline( 0) << "_stp_hrtimer_start(stp);"; + // timer enabled, but condition says disabled? + s.op->newline(-1) << "} else if (stp->enabled && !stp->probe->cond_enabled) {"; +- s.op->newline(+1) << "dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);"; ++ s.op->newline(+1) << "//dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);"; + s.op->newline( 0) << "_stp_hrtimer_cancel(stp);"; + s.op->newline(-1) << "}"; + s.op->newline( 0) << "stp->enabled = stp->probe->cond_enabled;"; +commit 7187dcf39412fcb25c432d318be8e49a6051f055 +Author: Sultan Alsawaf +Date: Thu Dec 3 12:57:34 2020 -0800 + + runtime: fix print races in IRQ context and during print cleanup + + Prints can race when there's a print called from IRQ context or a print + called while print cleanup takes place, which can lead to garbled print + messages, out-of-bounds memory accesses, and memory use-after-free. This + is one example of racy modification of the print buffer len in IRQ + context which caused a panic due to an out-of-bounds memory access: + + BUG: unable to handle kernel paging request at ffffe8ffff621000 + IP: [] _stp_vsprint_memory+0x83/0x950 [stap_2c44636dfda18135ca3012a752599da6_13_533] + PGD 174b90067 PUD 174b8f067 PMD 174b93067 PTE 0 + Oops: 0002 [#1] SMP + CPU: 12 PID: 3468 Comm: cat Kdump: loaded Tainted: G OE ------------ 3.10.0-1127.19.1.el7.x86_64.debug #1 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014 + task: ffff88001f4f0000 ti: ffff88004ea5c000 task.ti: ffff88004ea5c000 + RIP: 0010:[] [] _stp_vsprint_memory+0x83/0x950 [stap_2c44636dfda18135ca3012a752599da6_13_533] + RSP: 0018:ffff88004ea5f9a8 EFLAGS: 00010082 + RAX: ffffe8ffff621001 RBX: ffffe8ffff620ff2 RCX: fffffffffffffffe + RDX: 000000000000006e RSI: ffffffffffffffff RDI: ffffc90002c23730 + RBP: ffff88004ea5fa28 R08: 00000000ffffffff R09: 0000000000000073 + R10: ffffc90002c243d7 R11: 0000000000000001 R12: ffffc90002c2373f + R13: ffffe8ffff621004 R14: 0000000000000012 R15: 00000000fffffffe + FS: 00007f8a9b1d4740(0000) GS:ffff880179e00000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: ffffe8ffff621000 CR3: 00000000b3e3c000 CR4: 0000000000360fe0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + Call Trace: + [] ? sched_clock+0x9/0x10 + [] ? lock_release_holdtime.part.30+0xf/0x1a0 + [] function___global_trace__overload_0+0x5b0/0x1220 [stap_2c44636dfda18135ca3012a752599da6_13_533] + [] ? stp_lock_probe+0x53/0xe0 [stap_2c44636dfda18135ca3012a752599da6_13_533] + [] ? kretprobe_trampoline_holder+0x9/0x9 + [] probe_7118+0x82/0xe0 [stap_2c44636dfda18135ca3012a752599da6_13_533] + [] enter_kretprobe_common+0x256/0x490 [stap_2c44636dfda18135ca3012a752599da6_13_533] + [] ? proc_sys_open+0x51/0x60 + [] enter_kretprobe_probe+0x10/0x20 [stap_2c44636dfda18135ca3012a752599da6_13_533] + [] trampoline_handler+0x148/0x220 + [] ? proc_sys_open+0x51/0x60 + [] kretprobe_trampoline+0x25/0x57 + [] ? proc_sys_open+0x51/0x60 + [] kretprobe_trampoline_holder+0x9/0x9 + [] ? security_inode_permission+0x22/0x30 + [] ? sysctl_head_finish+0x50/0x50 + [] vfs_open+0x5d/0xb0 + [] ? may_open+0x5a/0x120 + [] do_last+0x285/0x15b0 + [] ? link_path_walk+0x27e/0x8c0 + [] path_openat+0xd0/0x5d0 + [] ? kvm_clock_read+0x33/0x40 + [] do_filp_open+0x4d/0xb0 + [] ? _raw_spin_unlock+0x27/0x40 + [] ? __alloc_fd+0xfb/0x270 + [] do_sys_open+0x124/0x220 + [] SyS_open+0x1e/0x20 + [] kretprobe_trampoline_holder+0x9/0x9 + + This patch resolves the IRQ print races by disabling IRQs on the local + CPU when accessing said CPU's print buffer, and resolves the cleanup + races with a lock. We also protect against data corruption and panics + from prints inside NMIs now by checking if the current CPU was accessing + the log buffer when an NMI fired; in this case, the NMI's prints will be + dropped, as there is no way to safely service them without creating a + dedicated log buffer for them. This is achieved by forbidding reentrancy + with respect to _stp_print_trylock_irqsave() when the runtime context + isn't held. Reentrancy is otherwise allowed when the runtime context is + held because the runtime context provides reentrancy protection. + +diff --git a/runtime/linux/io.c b/runtime/linux/io.c +index 74a032c52..122708e2a 100644 +--- a/runtime/linux/io.c ++++ b/runtime/linux/io.c +@@ -20,9 +20,6 @@ + + #define WARN_STRING "WARNING: " + #define ERR_STRING "ERROR: " +-#if (STP_LOG_BUF_LEN < 10) /* sizeof(WARN_STRING) */ +-#error "STP_LOG_BUF_LEN is too short" +-#endif + + enum code { INFO=0, WARN, ERROR, DBUG }; + +@@ -31,25 +28,37 @@ static void _stp_vlog (enum code type, const char *func, int line, const char *f + + static void _stp_vlog (enum code type, const char *func, int line, const char *fmt, va_list args) + { ++ struct _stp_log *log; ++ unsigned long flags; ++ size_t bytes_avail; + int num; +- char *buf = per_cpu_ptr(Stp_lbuf, get_cpu()); ++ char *buf; + int start = 0; + ++ if (!_stp_print_trylock_irqsave(&flags)) ++ return; ++ ++ log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); ++ bytes_avail = STP_BUFFER_SIZE - log->len; ++ if (unlikely(!bytes_avail)) ++ goto err_unlock; ++ ++ buf = &log->buf[log->len]; + if (type == DBUG) { +- start = _stp_snprintf(buf, STP_LOG_BUF_LEN, "%s:%d: ", func, line); ++ start = _stp_snprintf(buf, bytes_avail, "%s:%d: ", func, line); + } else if (type == WARN) { +- /* This strcpy() is OK, since we know STP_LOG_BUF_LEN +- * is > sizeof(WARN_STRING). */ +- strcpy (buf, WARN_STRING); +- start = sizeof(WARN_STRING) - 1; ++ strncpy(buf, WARN_STRING, bytes_avail); ++ start = min(bytes_avail, sizeof(WARN_STRING) - 1); + } else if (type == ERROR) { +- /* This strcpy() is OK, since we know STP_LOG_BUF_LEN +- * is > sizeof(ERR_STRING) (which is < sizeof(WARN_STRING). */ +- strcpy (buf, ERR_STRING); +- start = sizeof(ERR_STRING) - 1; ++ strncpy(buf, ERR_STRING, bytes_avail); ++ start = min(bytes_avail, sizeof(ERR_STRING) - 1); + } + +- num = vscnprintf (buf + start, STP_LOG_BUF_LEN - start - 1, fmt, args); ++ bytes_avail -= start; ++ if (unlikely(!bytes_avail)) ++ goto err_unlock; ++ ++ num = vscnprintf(buf + start, bytes_avail - 1, fmt, args); + if (num + start) { + if (buf[num + start - 1] != '\n') { + buf[num + start] = '\n'; +@@ -66,12 +75,13 @@ static void _stp_vlog (enum code type, const char *func, int line, const char *f + if (type != DBUG) { + _stp_ctl_send(STP_OOB_DATA, buf, start + num + 1); + } else { +- _stp_print(buf); +- _stp_print_flush(); ++ log->len += start + num; ++ __stp_print_flush(log); + } + #endif + } +- put_cpu(); ++err_unlock: ++ _stp_print_unlock_irqrestore(&flags); + } + + /** Prints warning. +diff --git a/runtime/linux/print.c b/runtime/linux/print.c +index 777bca8b0..2aa2f1c8d 100644 +--- a/runtime/linux/print.c ++++ b/runtime/linux/print.c +@@ -35,84 +35,179 @@ + * @{ + */ + +-typedef struct __stp_pbuf { +- uint32_t len; /* bytes used in the buffer */ ++struct _stp_log { ++ unsigned int len; /* Bytes used in the buffer */ + char buf[STP_BUFFER_SIZE]; +-} _stp_pbuf; ++ atomic_t reentrancy_lock; ++}; ++#include "print_flush.c" + +-static void *Stp_pbuf = NULL; ++static struct _stp_log *_stp_log_pcpu; ++ ++/* ++ * An atomic counter is used to synchronize every possible print buffer usage ++ * with the _stp_print_cleanup() function. The cleanup function sets the counter ++ * to INT_MAX after waiting for everything using the print buffer to finish. We ++ * cannot use a lock primitive to implement this because lock_acquire() contains ++ * tracepoints and print statements are used both inside and outside of probes. ++ * If the lock were only used inside probes, the runtime context would protect ++ * us from recursing into the lock_acquire() tracepoints and deadlocking. We ++ * instead use _stp_print_ctr as if it were a read-write lock. ++ */ ++static atomic_t _stp_print_ctr = ATOMIC_INIT(0); + +-/** private buffer for _stp_vlog() */ +-#ifndef STP_LOG_BUF_LEN +-#define STP_LOG_BUF_LEN 256 +-#endif ++/* ++ * This disables IRQs to make per-CPU print buffer accesses atomic. There is a ++ * reentrancy protection mechanism specifically for NMIs, since they can violate ++ * our atomic guarantee. Reentrancy is otherwise allowed within code sections ++ * that have the runtime context held (via _stp_runtime_entryfn_get_context()). ++ */ ++static bool _stp_print_trylock_irqsave(unsigned long *flags) ++{ ++ bool context_held = false; ++ struct _stp_log *log; ++ ++ local_irq_save(*flags); ++ if (!atomic_add_unless(&_stp_print_ctr, 1, INT_MAX)) ++ goto irq_restore; ++ ++ /* ++ * Check the per-CPU reentrancy lock for contention, unless the runtime ++ * context is already held, in which case we already have reentrancy ++ * protection. Otherwise, if the reentrancy lock is contented, that ++ * means we're either inside an NMI that fired while the current CPU was ++ * accessing the log buffer, or something is trying to nest calls to ++ * _stp_print_trylock_irqsave(). Our only choice is to reject the log ++ * access attempt in this case because log buffer corruption and panics ++ * could ensue if we're inside an NMI. ++ */ ++ if (_stp_runtime_context_trylock()) { ++ struct context *c = _stp_runtime_get_context(); ++ context_held = c && atomic_read(&c->busy); ++ _stp_runtime_context_unlock(); ++ } + +-typedef char _stp_lbuf[STP_LOG_BUF_LEN]; +-static void *Stp_lbuf = NULL; ++ /* Fall back onto the reentrancy lock if the context isn't held */ ++ if (!context_held) { ++ log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); ++ if (atomic_cmpxchg(&log->reentrancy_lock, 0, 1)) ++ goto print_unlock; ++ } ++ ++ return true; ++ ++print_unlock: ++ atomic_dec(&_stp_print_ctr); ++irq_restore: ++ local_irq_restore(*flags); ++ return false; ++} ++ ++static void _stp_print_unlock_irqrestore(unsigned long *flags) ++{ ++ bool context_held = false; ++ struct _stp_log *log; ++ ++ if (_stp_runtime_context_trylock()) { ++ struct context *c = _stp_runtime_get_context(); ++ context_held = c && atomic_read(&c->busy); ++ _stp_runtime_context_unlock(); ++ } ++ ++ if (!context_held) { ++ log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); ++ atomic_set(&log->reentrancy_lock, 0); ++ } ++ ++ atomic_dec(&_stp_print_ctr); ++ local_irq_restore(*flags); ++} + + /* create percpu print and io buffers */ + static int _stp_print_init (void) + { +- Stp_pbuf = _stp_alloc_percpu(sizeof(_stp_pbuf)); +- if (unlikely(Stp_pbuf == 0)) +- return -1; +- +- /* now initialize IO buffer used in io.c */ +- Stp_lbuf = _stp_alloc_percpu(sizeof(_stp_lbuf)); +- if (unlikely(Stp_lbuf == 0)) { +- _stp_free_percpu(Stp_pbuf); +- return -1; ++ unsigned int cpu; ++ ++ _stp_log_pcpu = _stp_alloc_percpu(sizeof(*_stp_log_pcpu)); ++ if (!_stp_log_pcpu) ++ return -ENOMEM; ++ ++ for_each_possible_cpu(cpu) { ++ struct _stp_log *log = per_cpu_ptr(_stp_log_pcpu, cpu); ++ ++ log->reentrancy_lock = (atomic_t)ATOMIC_INIT(0); + } + return 0; + } + + static void _stp_print_cleanup (void) + { +- if (Stp_pbuf) +- _stp_free_percpu(Stp_pbuf); +- if (Stp_lbuf) +- _stp_free_percpu(Stp_lbuf); +-} ++ unsigned int cpu; + +-#include "print_flush.c" ++ /* Wait for the loggers to finish modifying the print buffers */ ++ while (atomic_cmpxchg(&_stp_print_ctr, 0, INT_MAX)) ++ cpu_relax(); ++ ++ for_each_possible_cpu(cpu) { ++ struct _stp_log *log = per_cpu_ptr(_stp_log_pcpu, cpu); ++ ++ /* ++ * Flush anything that could be left in the print buffer. It is ++ * safe to do this without any kind of synchronization mechanism ++ * because nothing is using this print buffer anymore. ++ */ ++ __stp_print_flush(log); ++ } ++ ++ _stp_free_percpu(_stp_log_pcpu); ++} + + static inline void _stp_print_flush(void) + { +- stp_print_flush(per_cpu_ptr(Stp_pbuf, smp_processor_id())); ++ struct _stp_log *log; ++ unsigned long flags; ++ ++ if (!_stp_print_trylock_irqsave(&flags)) ++ return; ++ ++ log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); ++ __stp_print_flush(log); ++ _stp_print_unlock_irqrestore(&flags); + } ++ + #ifndef STP_MAXBINARYARGS + #define STP_MAXBINARYARGS 127 + #endif + + +-/** Reserves space in the output buffer for direct I/O. ++/** Reserves space in the output buffer for direct I/O. Must be called with ++ * _stp_print_trylock_irqsave() held. + */ + static void * _stp_reserve_bytes (int numbytes) + { +- _stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id()); +- int size = STP_BUFFER_SIZE - pb->len; +- void * ret; ++ struct _stp_log *log; ++ char *ret; + + if (unlikely(numbytes == 0 || numbytes > STP_BUFFER_SIZE)) + return NULL; + +- if (unlikely(numbytes > size)) +- _stp_print_flush(); ++ log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); ++ if (unlikely(numbytes > (STP_BUFFER_SIZE - log->len))) ++ __stp_print_flush(log); + +- ret = pb->buf + pb->len; +- pb->len += numbytes; ++ ret = &log->buf[log->len]; ++ log->len += numbytes; + return ret; + } + + + static void _stp_unreserve_bytes (int numbytes) + { +- _stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id()); +- +- if (unlikely(numbytes == 0 || numbytes > pb->len)) +- return; ++ struct _stp_log *log; + +- pb->len -= numbytes; ++ log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); ++ if (numbytes <= log->len) ++ log->len -= numbytes; + } + + /** Write 64-bit args directly into the output stream. +@@ -123,22 +218,25 @@ static void _stp_unreserve_bytes (int numbytes) + */ + static void _stp_print_binary (int num, ...) + { ++ unsigned long flags; + va_list vargs; + int i; + int64_t *args; +- ++ + if (unlikely(num > STP_MAXBINARYARGS)) + num = STP_MAXBINARYARGS; + +- args = _stp_reserve_bytes(num * sizeof(int64_t)); ++ if (!_stp_print_trylock_irqsave(&flags)) ++ return; + +- if (likely(args != NULL)) { ++ args = _stp_reserve_bytes(num * sizeof(int64_t)); ++ if (args) { + va_start(vargs, num); +- for (i = 0; i < num; i++) { ++ for (i = 0; i < num; i++) + args[i] = va_arg(vargs, int64_t); +- } + va_end(vargs); + } ++ _stp_print_unlock_irqrestore(&flags); + } + + /** Print into the print buffer. +@@ -149,6 +247,7 @@ static void _stp_print_binary (int num, ...) + static void _stp_printf (const char *fmt, ...) + { + va_list args; ++ + va_start(args, fmt); + _stp_vsnprintf(NULL, 0, fmt, args); + va_end(args); +@@ -160,37 +259,36 @@ static void _stp_printf (const char *fmt, ...) + + static void _stp_print (const char *str) + { +- _stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id()); +- char *end = pb->buf + STP_BUFFER_SIZE; +- char *ptr = pb->buf + pb->len; +- char *instr = (char *)str; +- +- while (ptr < end && *instr) +- *ptr++ = *instr++; +- +- /* Did loop terminate due to lack of buffer space? */ +- if (unlikely(*instr)) { +- /* Don't break strings across subbufs. */ +- /* Restart after flushing. */ +- _stp_print_flush(); +- end = pb->buf + STP_BUFFER_SIZE; +- ptr = pb->buf + pb->len; +- instr = (char *)str; +- while (ptr < end && *instr) +- *ptr++ = *instr++; ++ struct _stp_log *log; ++ unsigned long flags; ++ ++ if (!_stp_print_trylock_irqsave(&flags)) ++ return; ++ ++ log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); ++ while (1) { ++ while (log->len < STP_BUFFER_SIZE && *str) ++ log->buf[log->len++] = *str++; ++ if (likely(!*str)) ++ break; ++ __stp_print_flush(log); + } +- pb->len = ptr - pb->buf; ++ _stp_print_unlock_irqrestore(&flags); + } + + static void _stp_print_char (const char c) + { +- _stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id()); +- int size = STP_BUFFER_SIZE - pb->len; +- if (unlikely(1 >= size)) +- _stp_print_flush(); +- +- pb->buf[pb->len] = c; +- pb->len ++; ++ struct _stp_log *log; ++ unsigned long flags; ++ ++ if (!_stp_print_trylock_irqsave(&flags)) ++ return; ++ ++ log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); ++ if (unlikely(log->len == STP_BUFFER_SIZE)) ++ __stp_print_flush(log); ++ log->buf[log->len++] = c; ++ _stp_print_unlock_irqrestore(&flags); + } + + static void _stp_print_kernel_info(char *sname, char *vstr, int ctx, int num_probes) +diff --git a/runtime/print.h b/runtime/print.h +index ede71f033..ffdea594d 100644 +--- a/runtime/print.h ++++ b/runtime/print.h +@@ -10,6 +10,9 @@ + #ifndef _STP_PRINT_H_ + #define _STP_PRINT_H_ + ++/* The lock must be held with IRQs disabled to do any printing */ ++static bool _stp_print_trylock_irqsave(unsigned long *flags); ++static void _stp_print_unlock_irqrestore(unsigned long *flags); + static int _stp_print_init(void); + static void _stp_print_cleanup(void); + static void *_stp_reserve_bytes(int numbytes); +diff --git a/runtime/print_flush.c b/runtime/print_flush.c +index cf40a2645..acd6a32d9 100644 +--- a/runtime/print_flush.c ++++ b/runtime/print_flush.c +@@ -13,40 +13,31 @@ + * is filled, or this is called. This MUST be called before returning + * from a probe or accumulated output in the print buffer will be lost. + * +- * @note Preemption must be disabled to use this. ++ * @note Interrupts must be disabled to use this. + */ + +-static STP_DEFINE_SPINLOCK(_stp_print_lock); +- +-void stp_print_flush(_stp_pbuf *pb) ++static void __stp_print_flush(struct _stp_log *log) + { +- size_t len = pb->len; ++ size_t len = log->len; + void *entry = NULL; + + /* check to see if there is anything in the buffer */ + if (likely(len == 0)) + return; + +- pb->len = 0; +- +- if (unlikely(_stp_transport_get_state() != STP_TRANSPORT_RUNNING)) +- return; ++ log->len = 0; + + dbug_trans(1, "len = %zu\n", len); + + #ifdef STP_BULKMODE + #ifdef NO_PERCPU_HEADERS + { +- struct context* __restrict__ c = NULL; +- char *bufp = pb->buf; ++ char *bufp = log->buf; + int inode_locked; + +- c = _stp_runtime_entryfn_get_context(); +- + if (!(inode_locked = _stp_transport_trylock_relay_inode())) { + atomic_inc (&_stp_transport_failures); + #ifndef STP_TRANSPORT_RISKY +- _stp_runtime_entryfn_put_context(c); + return; + #endif + } +@@ -70,26 +61,20 @@ void stp_print_flush(_stp_pbuf *pb) + + if (inode_locked) + _stp_transport_unlock_relay_inode(); +- +- _stp_runtime_entryfn_put_context(c); + } + + #else /* !NO_PERCPU_HEADERS */ + + { +- struct context* __restrict__ c = NULL; +- char *bufp = pb->buf; ++ char *bufp = log->buf; + struct _stp_trace t = { .sequence = _stp_seq_inc(), + .pdu_len = len}; + size_t bytes_reserved; + int inode_locked; + +- c = _stp_runtime_entryfn_get_context(); +- + if (!(inode_locked = _stp_transport_trylock_relay_inode())) { + atomic_inc (&_stp_transport_failures); + #ifndef STP_TRANSPORT_RISKY +- _stp_runtime_entryfn_put_context(c); + return; + #endif + } +@@ -124,48 +109,24 @@ void stp_print_flush(_stp_pbuf *pb) + + if (inode_locked) + _stp_transport_unlock_relay_inode(); +- +- _stp_runtime_entryfn_put_context(c); + } + #endif /* !NO_PERCPU_HEADERS */ + + #else /* !STP_BULKMODE */ + + { +- unsigned long flags; +- struct context* __restrict__ c = NULL; +- char *bufp = pb->buf; ++ char *bufp = log->buf; + int inode_locked; + +- /* Prevent probe reentrancy on _stp_print_lock. +- * +- * Since stp_print_flush may be called from probe context, we +- * have to make sure that its lock, _stp_print_lock, can't +- * possibly be held outside probe context too. We ensure this +- * by grabbing the context here, so any probe triggered by this +- * region will appear reentrant and be skipped rather than +- * deadlock. Failure to get_context just means we're already +- * in a probe, which is fine. +- * +- * (see also _stp_ctl_send for a similar situation) +- * +- * A better solution would be to replace this +- * concurrency-control-laden effort with a lockless +- * algorithm. +- */ +- c = _stp_runtime_entryfn_get_context(); +- + if (!(inode_locked = _stp_transport_trylock_relay_inode())) { + atomic_inc (&_stp_transport_failures); + #ifndef STP_TRANSPORT_RISKY + dbug_trans(0, "discarding %zu bytes of data\n", len); +- _stp_runtime_entryfn_put_context(c); + return; + #endif + } + + dbug_trans(1, "calling _stp_data_write...\n"); +- stp_spin_lock_irqsave(&_stp_print_lock, flags); + while (len > 0) { + size_t bytes_reserved; + +@@ -182,12 +143,9 @@ void stp_print_flush(_stp_pbuf *pb) + break; + } + } +- stp_spin_unlock_irqrestore(&_stp_print_lock, flags); + + if (inode_locked) + _stp_transport_unlock_relay_inode(); +- +- _stp_runtime_entryfn_put_context(c); + } + #endif /* !STP_BULKMODE */ + } +diff --git a/runtime/stack.c b/runtime/stack.c +index 241ccf793..da23d4395 100644 +--- a/runtime/stack.c ++++ b/runtime/stack.c +@@ -690,13 +690,20 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c, + * then call _stp_stack_print, + * then copy the result into the output string + * and clear the print buffer. */ +- _stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id()); +- _stp_print_flush(); ++ struct _stp_log *log; ++ unsigned long flags; + +- _stp_stack_kernel_print(c, sym_flags); ++ if (!_stp_print_trylock_irqsave(&flags)) { ++ *str = '\0'; ++ return; ++ } + +- strlcpy(str, pb->buf, size < (int)pb->len ? size : (int)pb->len); +- pb->len = 0; ++ log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); ++ __stp_print_flush(log); ++ _stp_stack_kernel_print(c, sym_flags); ++ strlcpy(str, log->buf, min_t(int, size, log->len)); ++ log->len = 0; ++ _stp_print_unlock_irqrestore(&flags); + } + + static void _stp_stack_user_sprint(char *str, int size, struct context* c, +@@ -707,13 +714,20 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c, + * then call _stp_stack_print, + * then copy the result into the output string + * and clear the print buffer. */ +- _stp_pbuf *pb = per_cpu_ptr(Stp_pbuf, smp_processor_id()); +- _stp_print_flush(); ++ struct _stp_log *log; ++ unsigned long flags; + +- _stp_stack_user_print(c, sym_flags); ++ if (!_stp_print_trylock_irqsave(&flags)) { ++ *str = '\0'; ++ return; ++ } + +- strlcpy(str, pb->buf, size < (int)pb->len ? size : (int)pb->len); +- pb->len = 0; ++ log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); ++ __stp_print_flush(log); ++ _stp_stack_user_print(c, sym_flags); ++ strlcpy(str, log->buf, min_t(int, size, log->len)); ++ log->len = 0; ++ _stp_print_unlock_irqrestore(&flags); + } + + #endif /* _STACK_C_ */ +diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c +index 57955334b..44e69b68c 100644 +--- a/runtime/transport/transport.c ++++ b/runtime/transport/transport.c +@@ -540,8 +540,8 @@ static void _stp_transport_close(void) + current->pid); + _stp_cleanup_and_exit(0); + _stp_unregister_ctl_channel(); ++ _stp_print_cleanup(); /* Requires the transport, so free this first */ + _stp_transport_fs_close(); +- _stp_print_cleanup(); /* free print buffers */ + _stp_mem_debug_done(); + + dbug_trans(1, "---- CLOSED ----\n"); +diff --git a/runtime/vsprintf.c b/runtime/vsprintf.c +index 28fd18f16..417d9f7f3 100644 +--- a/runtime/vsprintf.c ++++ b/runtime/vsprintf.c +@@ -542,6 +542,8 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + number of chars for from string */ + int qualifier; /* 'h', 'l', or 'L' for integer fields */ + int num_bytes = 0; ++ unsigned long irqflags = 0; ++ bool got_print_lock = false; + + /* Reject out-of-range values early */ + if (unlikely((int) size < 0)) +@@ -724,11 +726,14 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + num_bytes = STP_BUFFER_SIZE; + } + ++ if (!_stp_print_trylock_irqsave(&irqflags)) ++ return 0; + str = (char*)_stp_reserve_bytes(num_bytes); + if (str == NULL) { + _stp_error("Couldn't reserve any print buffer space\n"); +- return 0; ++ goto err_unlock; + } ++ got_print_lock = true; + size = num_bytes; + end = str + size - 1; + +@@ -820,8 +825,10 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + field_width, precision, + *fmt, flags); + if (unlikely(str == NULL)) { +- if (num_bytes > 0) ++ if (num_bytes > 0) { + _stp_unreserve_bytes(num_bytes); ++ goto err_unlock; ++ } + return 0; + } + continue; +@@ -923,7 +930,14 @@ _stp_vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + /* don't write out a null byte if the buf size is zero */ + *end = '\0'; + } ++ ++ if (got_print_lock) ++ _stp_print_unlock_irqrestore(&irqflags); + return str-buf; ++ ++err_unlock: ++ _stp_print_unlock_irqrestore(&irqflags); ++ return 0; + } + + #endif /* _VSPRINTF_C_ */ +diff --git a/translate.cxx b/translate.cxx +index 53f1d0725..f0195486c 100644 +--- a/translate.cxx ++++ b/translate.cxx +@@ -1354,6 +1354,9 @@ c_unparser::emit_compiled_printfs () + o->newline() << "unsigned long ptr_value;"; + o->newline() << "int num_bytes;"; + ++ if (print_to_stream) ++ o->newline() << "unsigned long irqflags;"; ++ + o->newline() << "(void) width;"; + o->newline() << "(void) precision;"; + o->newline() << "(void) ptr_value;"; +@@ -1452,7 +1455,9 @@ c_unparser::emit_compiled_printfs () + } + + o->newline() << "num_bytes = clamp(num_bytes, 0, STP_BUFFER_SIZE);"; +- o->newline() << "str = (char*)_stp_reserve_bytes(num_bytes);"; ++ o->newline() << "if (!_stp_print_trylock_irqsave(&irqflags))"; ++ o->newline(1) << "return;"; ++ o->newline(-1) << "str = (char*)_stp_reserve_bytes(num_bytes);"; + o->newline() << "end = str ? str + num_bytes - 1 : 0;"; + } + else // !print_to_stream +@@ -1547,8 +1552,14 @@ c_unparser::emit_compiled_printfs () + o->newline() << "if (unlikely(str == NULL)) {"; + o->indent(1); + if (print_to_stream) ++ { + o->newline() << "_stp_unreserve_bytes(num_bytes);"; +- o->newline() << "return;"; ++ o->newline() << "goto err_unlock;"; ++ } ++ else ++ { ++ o->newline() << "return;"; ++ } + o->newline(-1) << "}"; + break; + +@@ -1575,6 +1586,11 @@ c_unparser::emit_compiled_printfs () + + o->newline(-1) << "}"; + ++ if (print_to_stream) ++ { ++ o->newline(-1) << "err_unlock:"; ++ o->newline(1) << "_stp_print_unlock_irqrestore(&irqflags);"; ++ } + o->newline(-1) << "}"; + } + o->newline() << "#endif // STP_LEGACY_PRINT"; +commit e8c7a2067ec7fc6315ee9bc34a010ec5f0369c5c +Author: Frank Ch. Eigler +Date: Fri Dec 4 19:33:22 2020 -0500 + + testsuite pr14536.stp: toughen + + This test case stresses nesting of heavy duty processing (backtrace + printing) within kernel interrupt processing paths. It seems to + sometimes trigger problems - so let's make the test harder to make + latent problems show up more likely. Instead of quitting after the + first irq_* function hit, stick around for 10 seconds. + +diff --git a/testsuite/systemtap.base/pr14546.stp b/testsuite/systemtap.base/pr14546.stp +index 3e59a6f3a..e4c205a8f 100644 +--- a/testsuite/systemtap.base/pr14546.stp ++++ b/testsuite/systemtap.base/pr14546.stp +@@ -2,5 +2,6 @@ probe kernel.function("irq_*").call { + x = 10; y = 10; z = 10; w = 10 + $1 + assert(!(x != 10 || y != 10 || z != 10 || w != 10), "memory got corrupted by " . @1) +- exit() + } ++ ++probe timer.s(10) { exit () } +commit cd6399e621646856824ea96b11605a0f52011272 +Author: Frank Ch. Eigler +Date: Fri Dec 4 21:33:21 2020 -0500 + + dyninst transport: add _stp_print_*lock_irq* stubs + + Recent code on the transport/linux side needs a few new (stub) + functions and type decls. + +diff --git a/runtime/dyninst/print.c b/runtime/dyninst/print.c +index 9d91224ba..c78def272 100644 +--- a/runtime/dyninst/print.c ++++ b/runtime/dyninst/print.c +@@ -107,4 +107,18 @@ static void _stp_print_char (const char c) + } + } + ++ ++/* no-op stub synchronization */ ++static bool _stp_print_trylock_irqsave(unsigned long *flags) ++{ ++ (void) flags; ++ return true; ++} ++ ++static void _stp_print_unlock_irqrestore(unsigned long *flags) ++{ ++ (void) flags; ++} ++ ++ + #endif /* _STAPDYN_PRINT_C_ */ +diff --git a/runtime/dyninst/runtime_defines.h b/runtime/dyninst/runtime_defines.h +index 5c3dec519..d00c76a21 100644 +--- a/runtime/dyninst/runtime_defines.h ++++ b/runtime/dyninst/runtime_defines.h +@@ -7,3 +7,6 @@ + #define STAPCONF_PAGEFAULT_DISABLE 1 + #define pagefault_disable() + #define pagefault_enable() ++ ++typedef int bool; ++enum { false=0, true=1 }; +commit fd93cf71df80f7bb5aae707ea5a5875727a85770 +Author: Sultan Alsawaf +Date: Wed Dec 9 12:55:10 2020 -0800 + + PR26844: fix off-by-one error when copying printed backtraces + + Since log->buf isn't null-terminated, log->len represents the total + number of bytes present in the log buffer for copying. The use of + strlcpy() here with log->len as its size results in log->len - 1 bytes + being copied, with the log->len'nth byte of the output buffer being set + to zero to terminate the string. Use memcpy() instead to remedy this, + while ensuring that the output buffer has space for null termination, + since the output buffer needs to be terminated. + +diff --git a/runtime/stack.c b/runtime/stack.c +index da23d4395..85883d6c4 100644 +--- a/runtime/stack.c ++++ b/runtime/stack.c +@@ -692,6 +692,7 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c, + * and clear the print buffer. */ + struct _stp_log *log; + unsigned long flags; ++ int bytes; + + if (!_stp_print_trylock_irqsave(&flags)) { + *str = '\0'; +@@ -701,7 +702,9 @@ static void _stp_stack_kernel_sprint(char *str, int size, struct context* c, + log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); + __stp_print_flush(log); + _stp_stack_kernel_print(c, sym_flags); +- strlcpy(str, log->buf, min_t(int, size, log->len)); ++ bytes = min_t(int, size - 1, log->len); ++ memcpy(str, log->buf, bytes); ++ str[bytes] = '\0'; + log->len = 0; + _stp_print_unlock_irqrestore(&flags); + } +@@ -716,6 +719,7 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c, + * and clear the print buffer. */ + struct _stp_log *log; + unsigned long flags; ++ int bytes; + + if (!_stp_print_trylock_irqsave(&flags)) { + *str = '\0'; +@@ -725,7 +729,9 @@ static void _stp_stack_user_sprint(char *str, int size, struct context* c, + log = per_cpu_ptr(_stp_log_pcpu, raw_smp_processor_id()); + __stp_print_flush(log); + _stp_stack_user_print(c, sym_flags); +- strlcpy(str, log->buf, min_t(int, size, log->len)); ++ bytes = min_t(int, size - 1, log->len); ++ memcpy(str, log->buf, bytes); ++ str[bytes] = '\0'; + log->len = 0; + _stp_print_unlock_irqrestore(&flags); + } +commit 8819e2a04596deb2fe427d261bebcaf3d2620dfb +Author: Sultan Alsawaf +Date: Wed Dec 9 17:22:20 2020 -0800 + + always use per-cpu bulkmode relayfs files to communicate with userspace + + Using a mutex_trylock() in __stp_print_flush() leads to a lot of havoc, + for numerous. Firstly, since __stp_print_flush() can be called from IRQ + context, holding the inode mutex from here would make the mutex owner + become nonsense, since mutex locks can only be held in contexts backed + by the scheduler. Secondly, the mutex_trylock implementation has a + spin_lock() inside of it that leads to two issues: IRQs aren't disabled + when acquiring this spin_lock(), so using it from IRQ context can lead + to a deadlock, and since spin locks can have tracepoints via + lock_acquire(), the spin_lock() can recurse on itself inside a stap + probe and deadlock, like so: + + #0 [ffff88017f6d7a08] kvm_wait at ffffffff81079f5a + #1 [ffff88017f6d7a30] __pv_queued_spin_lock_slowpath at ffffffff8114f51e + #2 [ffff88017f6d7a70] queued_spin_lock_slowpath at ffffffff810e842b + #3 [ffff88017f6d7a80] mutex_trylock at ffffffff81882b1b + #4 [ffff88017f6d7ab8] _stp_transport_trylock_relay_inode at ffffffffc0c599df [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #5 [ffff88017f6d7ad8] __stp_print_flush at ffffffffc09b6483 [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #6 [ffff88017f6d7b10] probe_7879 at ffffffffc0a98c85 [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #7 [ffff88017f6d7b38] enter_real_tracepoint_probe_1543 at ffffffffc0c3b757 [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #8 [ffff88017f6d7b70] enter_tracepoint_probe_1543 at ffffffffc09b117e [stap_47650d3377d05db0ab7cbbaa25765809__11657] + #9 [ffff88017f6d7b80] lock_acquire at ffffffff811460ba + + The reason the mutex_trylock() was needed in the first place was because + staprun doesn't properly use the relayfs API when reading buffers in + non-bulk mode. It tries to read all CPUs' buffers from a single thread, + when it should be reading each CPU's buffer from a thread running on + said CPU in order to utilize relayfs' synchronization guarantees, which + are made by disabling IRQs on the local CPU when a buffer is modified. + + This change makes staprun always use per-CPU threads to read print + buffers so that we don't need the mutex_trylock() in the print flush + routine, which resolves a wide variety of serious bugs. + + We also need to adjust the transport sub-buffer count to accommodate for + frequent print flushing. The sub-buffer size is now reduced to match the + log buffer size, which is 8192 by default, and the number of sub-buffers + is increased to 256. This uses exactly the same amount of memory as + before. + +diff --git a/runtime/print_flush.c b/runtime/print_flush.c +index acd6a32d9..f4d72d30f 100644 +--- a/runtime/print_flush.c ++++ b/runtime/print_flush.c +@@ -18,6 +18,7 @@ + + static void __stp_print_flush(struct _stp_log *log) + { ++ char *bufp = log->buf; + size_t len = log->len; + void *entry = NULL; + +@@ -26,126 +27,20 @@ static void __stp_print_flush(struct _stp_log *log) + return; + + log->len = 0; +- + dbug_trans(1, "len = %zu\n", len); +- +-#ifdef STP_BULKMODE +-#ifdef NO_PERCPU_HEADERS +- { +- char *bufp = log->buf; +- int inode_locked; +- +- if (!(inode_locked = _stp_transport_trylock_relay_inode())) { +- atomic_inc (&_stp_transport_failures); +-#ifndef STP_TRANSPORT_RISKY +- return; +-#endif +- } +- +- while (len > 0) { +- size_t bytes_reserved; +- +- bytes_reserved = _stp_data_write_reserve(len, &entry); +- if (likely(entry && bytes_reserved > 0)) { +- memcpy(_stp_data_entry_data(entry), bufp, +- bytes_reserved); +- _stp_data_write_commit(entry); +- bufp += bytes_reserved; +- len -= bytes_reserved; +- } +- else { +- atomic_inc(&_stp_transport_failures); +- break; +- } +- } +- +- if (inode_locked) +- _stp_transport_unlock_relay_inode(); +- } +- +-#else /* !NO_PERCPU_HEADERS */ +- +- { +- char *bufp = log->buf; +- struct _stp_trace t = { .sequence = _stp_seq_inc(), +- .pdu_len = len}; ++ do { + size_t bytes_reserved; +- int inode_locked; + +- if (!(inode_locked = _stp_transport_trylock_relay_inode())) { +- atomic_inc (&_stp_transport_failures); +-#ifndef STP_TRANSPORT_RISKY +- return; +-#endif +- } +- +- bytes_reserved = _stp_data_write_reserve(sizeof(struct _stp_trace), &entry); +- if (likely(entry && bytes_reserved > 0)) { +- /* prevent unaligned access by using memcpy() */ +- memcpy(_stp_data_entry_data(entry), &t, sizeof(t)); ++ bytes_reserved = _stp_data_write_reserve(len, &entry); ++ if (likely(entry && bytes_reserved)) { ++ memcpy(_stp_data_entry_data(entry), bufp, ++ bytes_reserved); + _stp_data_write_commit(entry); +- } +- else { ++ bufp += bytes_reserved; ++ len -= bytes_reserved; ++ } else { + atomic_inc(&_stp_transport_failures); +- goto done; ++ break; + } +- +- while (len > 0) { +- bytes_reserved = _stp_data_write_reserve(len, &entry); +- if (likely(entry && bytes_reserved > 0)) { +- memcpy(_stp_data_entry_data(entry), bufp, +- bytes_reserved); +- _stp_data_write_commit(entry); +- bufp += bytes_reserved; +- len -= bytes_reserved; +- } +- else { +- atomic_inc(&_stp_transport_failures); +- break; +- } +- } +- +- done: +- +- if (inode_locked) +- _stp_transport_unlock_relay_inode(); +- } +-#endif /* !NO_PERCPU_HEADERS */ +- +-#else /* !STP_BULKMODE */ +- +- { +- char *bufp = log->buf; +- int inode_locked; +- +- if (!(inode_locked = _stp_transport_trylock_relay_inode())) { +- atomic_inc (&_stp_transport_failures); +-#ifndef STP_TRANSPORT_RISKY +- dbug_trans(0, "discarding %zu bytes of data\n", len); +- return; +-#endif +- } +- +- dbug_trans(1, "calling _stp_data_write...\n"); +- while (len > 0) { +- size_t bytes_reserved; +- +- bytes_reserved = _stp_data_write_reserve(len, &entry); +- if (likely(entry && bytes_reserved > 0)) { +- memcpy(_stp_data_entry_data(entry), bufp, +- bytes_reserved); +- _stp_data_write_commit(entry); +- bufp += bytes_reserved; +- len -= bytes_reserved; +- } +- else { +- atomic_inc(&_stp_transport_failures); +- break; +- } +- } +- +- if (inode_locked) +- _stp_transport_unlock_relay_inode(); +- } +-#endif /* !STP_BULKMODE */ ++ } while (len > 0); + } +diff --git a/runtime/transport/relay_v2.c b/runtime/transport/relay_v2.c +index ff621f71d..2ba5eea7d 100644 +--- a/runtime/transport/relay_v2.c ++++ b/runtime/transport/relay_v2.c +@@ -67,7 +67,7 @@ static size_t __stp_relay_switch_subbuf(struct rchan_buf *buf, size_t length) + return 0; + + if (unlikely(length > buf->chan->subbuf_size)) +- goto toobig; ++ length = buf->chan->subbuf_size; + + if (buf->offset != buf->chan->subbuf_size + 1) { + buf->prev_padding = buf->chan->subbuf_size - buf->offset; +@@ -98,14 +98,7 @@ static size_t __stp_relay_switch_subbuf(struct rchan_buf *buf, size_t length) + buf->data = new; + buf->padding[new_subbuf] = 0; + +- if (unlikely(length + buf->offset > buf->chan->subbuf_size)) +- goto toobig; +- + return length; +- +-toobig: +- buf->chan->last_toobig = length; +- return 0; + } + + static void __stp_relay_wakeup_readers(struct rchan_buf *buf) +@@ -117,24 +110,17 @@ static void __stp_relay_wakeup_readers(struct rchan_buf *buf) + + static void __stp_relay_wakeup_timer(stp_timer_callback_parameter_t unused) + { +-#ifdef STP_BULKMODE + int i; +-#endif + + if (atomic_read(&_stp_relay_data.wakeup)) { + struct rchan_buf *buf; + + atomic_set(&_stp_relay_data.wakeup, 0); +-#ifdef STP_BULKMODE + for_each_possible_cpu(i) { + buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, + i); + __stp_relay_wakeup_readers(buf); + } +-#else +- buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0); +- __stp_relay_wakeup_readers(buf); +-#endif + } + + if (atomic_read(&_stp_relay_data.transport_state) == STP_TRANSPORT_RUNNING) +@@ -235,55 +221,8 @@ static void _stp_transport_data_fs_stop(void) + atomic_set (&_stp_relay_data.transport_state, STP_TRANSPORT_STOPPED); + del_timer_sync(&_stp_relay_data.timer); + dbug_trans(0, "flushing...\n"); +- if (_stp_relay_data.rchan) { +- struct rchan_buf *buf; +- +- /* NB we cannot call relay_flush() directly here since +- * we need to do inode locking ourselves. +- */ +- +-#ifdef STP_BULKMODE +- unsigned int i; +- struct rchan *rchan = _stp_relay_data.rchan; +- +- for_each_possible_cpu(i) { +- buf = _stp_get_rchan_subbuf(rchan->buf, i); +- if (buf) { +- struct inode *inode = buf->dentry->d_inode; +- +- /* NB we are in the syscall context which +- * allows sleeping. The following inode +- * locking might sleep. See PR26131. */ +- _stp_lock_inode(inode); +- +- /* NB we intentionally avoids calling +- * our own __stp_relay_switch_subbuf() +- * since here we can sleep. */ +- relay_switch_subbuf(buf, 0); +- +- _stp_unlock_inode(inode); +- } +- } +-#else /* !STP_BULKMODE */ +- buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, 0); +- +- if (buf != NULL) { +- struct inode *inode = buf->dentry->d_inode; +- +- /* NB we are in the syscall context which allows +- * sleeping. The following inode locking might +- * sleep. See PR26131. */ +- _stp_lock_inode(inode); +- +- /* NB we intentionally avoids calling +- * our own __stp_relay_switch_subbuf() +- * since here we can sleep. */ +- relay_switch_subbuf(buf, 0); +- +- _stp_unlock_inode(inode); +- } +-#endif +- } ++ if (_stp_relay_data.rchan) ++ relay_flush(_stp_relay_data.rchan); + } + } + +@@ -308,9 +247,7 @@ static int _stp_transport_data_fs_init(void) + + /* Create "trace" file. */ + npages = _stp_subbuf_size * _stp_nsubbufs; +-#ifdef STP_BULKMODE + npages *= num_online_cpus(); +-#endif + npages >>= PAGE_SHIFT; + si_meminfo(&si); + #define MB(i) (unsigned long)((i) >> (20 - PAGE_SHIFT)) +@@ -347,9 +284,7 @@ static int _stp_transport_data_fs_init(void) + { + u64 relay_mem; + relay_mem = _stp_subbuf_size * _stp_nsubbufs; +-#ifdef STP_BULKMODE + relay_mem *= num_online_cpus(); +-#endif + _stp_allocated_net_memory += relay_mem; + _stp_allocated_memory += relay_mem; + } +@@ -386,12 +321,7 @@ _stp_data_write_reserve(size_t size_request, void **entry) + return -EINVAL; + + buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, +-#ifdef STP_BULKMODE +- smp_processor_id() +-#else +- 0 +-#endif +- ); ++ smp_processor_id()); + if (unlikely(buf->offset + size_request > buf->chan->subbuf_size)) { + size_request = __stp_relay_switch_subbuf(buf, size_request); + if (!size_request) +@@ -411,65 +341,10 @@ static unsigned char *_stp_data_entry_data(void *entry) + + static int _stp_data_write_commit(void *entry) + { +- /* Nothing to do here. */ +- return 0; +-} +- +-static noinline int _stp_transport_trylock_relay_inode(void) +-{ +- unsigned i; + struct rchan_buf *buf; +- struct inode *inode; +-#ifdef DEBUG_TRANS +- cycles_t begin; +-#endif + + buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, +-#ifdef STP_BULKMODE +- smp_processor_id() +-#else +- 0 +-#endif +- ); +- if (buf == NULL) +- return 0; +- +- inode = buf->dentry->d_inode; +- +-#ifdef DEBUG_TRANS +- begin = get_cycles(); +-#endif +- +- /* NB this bounded spinlock is needed for stream mode. it is observed +- * that almost all of the iterations needed are less than 50K iterations +- * or about 300K cycles. +- */ +- for (i = 0; i < 50 * 1000; i++) { +- if (_stp_trylock_inode(inode)) { +- dbug_trans(3, "got inode lock: i=%u: cycles: %llu", i, +- get_cycles() - begin); +- return 1; +- } +- } +- +- dbug_trans(0, "failed to get inode lock: i=%u: cycles: %llu", i, +- get_cycles() - begin); ++ smp_processor_id()); ++ __stp_relay_switch_subbuf(buf, 0); + return 0; + } +- +-static void _stp_transport_unlock_relay_inode(void) +-{ +- struct rchan_buf *buf; +- +- buf = _stp_get_rchan_subbuf(_stp_relay_data.rchan->buf, +-#ifdef STP_BULKMODE +- smp_processor_id() +-#else +- 0 +-#endif +- ); +- if (buf == NULL) +- return; +- +- _stp_unlock_inode(buf->dentry->d_inode); +-} +diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c +index 96426eb7b..1be3e9485 100644 +--- a/runtime/transport/transport.c ++++ b/runtime/transport/transport.c +@@ -49,7 +49,6 @@ static int _stp_probes_started = 0; + * transport state flag is atomic. */ + static atomic_t _stp_transport_state = ATOMIC_INIT(_STP_TS_UNINITIALIZED); + +-static inline int _stp_trylock_inode(struct inode *inode); + static inline void _stp_lock_inode(struct inode *inode); + static inline void _stp_unlock_inode(struct inode *inode); + +@@ -70,8 +69,8 @@ static inline void _stp_unlock_inode(struct inode *inode); + #include "procfs.c" + #include "control.c" + +-static unsigned _stp_nsubbufs = 8; +-static unsigned _stp_subbuf_size = 65536*4; ++static unsigned _stp_nsubbufs = 256; ++static unsigned _stp_subbuf_size = STP_BUFFER_SIZE; + + /* module parameters */ + static int _stp_bufsize; +@@ -643,23 +642,6 @@ err0: + return -1; + } + +-/* returns 1 when the lock is successfully acquired, 0 otherwise. */ +-static inline int _stp_trylock_inode(struct inode *inode) +-{ +-#ifdef STAPCONF_INODE_RWSEM +- return inode_trylock(inode); +-#else +-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) +- return mutex_trylock(&inode->i_mutex); +-#else +- /* NB down_trylock() uses a different convention where 0 means +- * the lock is successfully acquired. +- */ +- return !down_trylock(&inode->i_sem); +-#endif +-#endif +-} +- + static inline void _stp_lock_inode(struct inode *inode) + { + #ifdef STAPCONF_INODE_RWSEM +diff --git a/runtime/transport/transport.h b/runtime/transport/transport.h +index 51723b7f5..cc09fc0ae 100644 +--- a/runtime/transport/transport.h ++++ b/runtime/transport/transport.h +@@ -98,24 +98,6 @@ enum _stp_transport_state { + */ + static enum _stp_transport_state _stp_transport_get_state(void); + +-/* +- * _stp_transport_trylock_relay_inode +- * +- * This function locks the relay file inode to protect against relay readers +- * (i.e., staprun/stapio). +- * Returns whether the lock is successfully obtained. +- */ +-static noinline int _stp_transport_trylock_relay_inode(void); +- +-/* +- * _stp_transport_unlock_relay_inode +- * +- * This function releases the lock obtained by +- * _stp_transport_trylock_relay_inode. +- * should only call this when the lock is indeed obtained. +- */ +-static void _stp_transport_unlock_relay_inode(void); +- + /* + * _stp_transport_data_fs_init + * +diff --git a/staprun/relay.c b/staprun/relay.c +index 2f5f2e06a..c76e76719 100644 +--- a/staprun/relay.c ++++ b/staprun/relay.c +@@ -131,6 +131,7 @@ static void *reader_thread(void *data) + sigset_t sigs; + off_t wsize = 0; + int fnum = 0; ++ cpu_set_t cpu_mask; + + sigemptyset(&sigs); + sigaddset(&sigs,SIGUSR2); +@@ -139,21 +140,18 @@ static void *reader_thread(void *data) + sigfillset(&sigs); + sigdelset(&sigs,SIGUSR2); + +- if (bulkmode) { +- cpu_set_t cpu_mask; +- CPU_ZERO(&cpu_mask); +- CPU_SET(cpu, &cpu_mask); +- if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 ) +- _perr("sched_setaffinity"); ++ CPU_ZERO(&cpu_mask); ++ CPU_SET(cpu, &cpu_mask); ++ if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 ) ++ _perr("sched_setaffinity"); + #ifdef NEED_PPOLL +- /* Without a real ppoll, there is a small race condition that could */ +- /* block ppoll(). So use a timeout to prevent that. */ +- timeout->tv_sec = 10; +- timeout->tv_nsec = 0; ++ /* Without a real ppoll, there is a small race condition that could */ ++ /* block ppoll(). So use a timeout to prevent that. */ ++ timeout->tv_sec = 10; ++ timeout->tv_nsec = 0; + #else +- timeout = NULL; ++ timeout = NULL; + #endif +- } + + if (reader_timeout_ms && timeout) { + timeout->tv_sec = reader_timeout_ms / 1000; +@@ -358,11 +356,6 @@ int init_relayfs(void) + _err("couldn't open %s.\n", buf); + return -1; + } +- if (ncpus > 1 && bulkmode == 0) { +- _err("ncpus=%d, bulkmode = %d\n", ncpus, bulkmode); +- _err("This is inconsistent! Please file a bug report. Exiting now.\n"); +- return -1; +- } + + /* PR7097 */ + if (load_only) +commit d86b64029598f69b47d9cf4295f30b7093f38cfc +Author: Sultan Alsawaf +Date: Wed Dec 9 17:22:27 2020 -0800 + + Revert "REVERTME: tapset-timers: work around on-the-fly deadlocks caused by mutex_trylock" + + This reverts commit 6a27888b118b7a94650a68aae028957cdd5fb5f5. + + No longer needed. As promised, we're reverting this. + +diff --git a/tapset-timers.cxx b/tapset-timers.cxx +index 503498c85..10da17cda 100644 +--- a/tapset-timers.cxx ++++ b/tapset-timers.cxx +@@ -391,11 +391,11 @@ hrtimer_derived_probe_group::emit_module_refresh (systemtap_session& s) + s.op->newline(+1) << "struct stap_hrtimer_probe* stp = &stap_hrtimer_probes[i];"; + // timer disabled, but condition says enabled? + s.op->newline( 0) << "if (!stp->enabled && stp->probe->cond_enabled) {"; +- s.op->newline(+1) << "//dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);"; ++ s.op->newline(+1) << "dbug_otf(\"enabling (hrtimer) pidx %zu\\n\", stp->probe->index);"; + s.op->newline( 0) << "_stp_hrtimer_start(stp);"; + // timer enabled, but condition says disabled? + s.op->newline(-1) << "} else if (stp->enabled && !stp->probe->cond_enabled) {"; +- s.op->newline(+1) << "//dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);"; ++ s.op->newline(+1) << "dbug_otf(\"disabling (hrtimer) pidx %zu\\n\", stp->probe->index);"; + s.op->newline( 0) << "_stp_hrtimer_cancel(stp);"; + s.op->newline(-1) << "}"; + s.op->newline( 0) << "stp->enabled = stp->probe->cond_enabled;"; +commit 3abe2c40b2dae499aff2e31beff121fbe43f7654 +Author: Sultan Alsawaf +Date: Fri Dec 11 12:31:25 2020 -0800 + + transport: set is_global to zero even when bulkmode is disabled + + This is needed now that we always want per-cpu logger threads. When + is_global is set to a non-zero value, relay won't create per-cpu log + files. + +diff --git a/runtime/transport/debugfs.c b/runtime/transport/debugfs.c +index 28a5bf89d..bd20281a0 100644 +--- a/runtime/transport/debugfs.c ++++ b/runtime/transport/debugfs.c +@@ -256,13 +256,8 @@ __stp_debugfs_relay_create_buf_file_callback(const char *filename, + * cause relay_open() to create a single global buffer rather + * than the default set of per-cpu buffers. + */ +- if (is_global) { +-#ifdef STP_BULKMODE ++ if (is_global) + *is_global = 0; +-#else +- *is_global = 1; +-#endif +- } + + if (IS_ERR(file)) { + file = NULL; +diff --git a/runtime/transport/procfs.c b/runtime/transport/procfs.c +index 262409356..b0a5d5760 100644 +--- a/runtime/transport/procfs.c ++++ b/runtime/transport/procfs.c +@@ -328,13 +328,8 @@ __stp_procfs_relay_create_buf_file_callback(const char *filename, + unsigned i = 0; + struct inode* in; + +- if (is_global) { +-#ifdef STP_BULKMODE ++ if (is_global) + *is_global = 0; +-#else +- *is_global = 1; +-#endif +- } + + if (parent != _stp_procfs_module_dir_path.dentry) + goto out; +commit a26bf7890196395d73ac193b23e271398731745d +Author: Frank Ch. Eigler +Date: Fri Dec 11 15:39:29 2020 -0500 + + relay transport: comment on STP_BULK message + + While we've eliminated any STP_BULKMODE effects from the way relayfs + files are used ("always bulkmode"), staprun/stapio still need to know + whether the user intended "stap -b" or not, so they can save files + stpd_cpu* files separately. + +diff --git a/runtime/transport/control.c b/runtime/transport/control.c +index 9343b3c28..d123bef2f 100644 +--- a/runtime/transport/control.c ++++ b/runtime/transport/control.c +@@ -88,6 +88,9 @@ static ssize_t _stp_ctl_write_cmd(struct file *file, const char __user *buf, siz + break; + + case STP_BULK: ++ // NB: this signals the runtime to save separate ++ // per-cpu files; our kernel->userspace files are now ++ // always bulkmode (trace$N files). + #ifdef STP_BULKMODE + // no action needed + break; +commit b43eb4ed690bf2421978ed2896667e45e60c3400 +Author: Cosmin Tanislav +Date: Thu Dec 10 16:48:54 2020 -0500 + + bugfix: runtime: transport: handle more error cases in module init + + Signed-off-by: Sultan Alsawaf + +diff --git a/runtime/transport/relay_v2.c b/runtime/transport/relay_v2.c +index 2ba5eea7d..27729f4c8 100644 +--- a/runtime/transport/relay_v2.c ++++ b/runtime/transport/relay_v2.c +@@ -277,6 +277,7 @@ static int _stp_transport_data_fs_init(void) + #endif /* (RELAYFS_CHANNEL_VERSION < 7) */ + if (!_stp_relay_data.rchan) { + rc = -ENOENT; ++ errk("%s: relay_open() failed: %d\n", THIS_MODULE->name, rc); + goto err; + } + /* Increment _stp_allocated_memory and _stp_allocated_net_memory to account for buffers +diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c +index 1be3e9485..f005e14e2 100644 +--- a/runtime/transport/transport.c ++++ b/runtime/transport/transport.c +@@ -552,6 +552,8 @@ static void _stp_transport_close(void) + */ + static int _stp_transport_init(void) + { ++ int ret; ++ + dbug_trans(1, "transport_init\n"); + #ifdef STAPCONF_TASK_UID + _stp_uid = current->uid; +@@ -603,20 +605,28 @@ static int _stp_transport_init(void) + dbug_trans(1, "Using %d subbufs of size %d\n", _stp_nsubbufs, _stp_subbuf_size); + } + +- if (_stp_transport_fs_init(THIS_MODULE->name) != 0) ++ ret = _stp_transport_fs_init(THIS_MODULE->name); ++ if (ret) + goto err0; + + /* create control channel */ +- if (_stp_register_ctl_channel() < 0) ++ ret = _stp_register_ctl_channel(); ++ if (ret < 0) + goto err1; + + /* create print buffers */ +- if (_stp_print_init() < 0) ++ ret = _stp_print_init(); ++ if (ret < 0) { ++ errk("%s: can't create print buffers!", THIS_MODULE->name); + goto err2; ++ } + + /* set _stp_module_self dynamic info */ +- if (_stp_module_update_self() < 0) ++ ret = _stp_module_update_self(); ++ if (ret < 0) { ++ errk("%s: can't update dynamic info!", THIS_MODULE->name); + goto err3; ++ } + + /* start transport */ + _stp_transport_data_fs_start(); +@@ -639,7 +649,7 @@ err2: + err1: + _stp_transport_fs_close(); + err0: +- return -1; ++ return ret; + } + + static inline void _stp_lock_inode(struct inode *inode) +commit 341bf33f14062269c52bcebaa309518d9972ca00 +Author: Frank Ch. Eigler +Date: Fri Dec 11 18:06:36 2020 -0500 + + staprun: handle more and fewer cpus better + + NR_CPUS was a hard-coded minimum and maximum on the number of CPUs + worth of trace$N files staprun/stapio would open at startup. While a + constant is useful for array sizing (and so might as well be really + large), the actual iteration should be informed by get_nprocs_conf(3). + + This patch replaces NR_CPUS with MAX_NR_CPUS (now 1024, why not), and + limits open/thread iterations to the actual number of processors. It + even prints an error if a behemoth >1K-core machine comes into being. + +diff --git a/staprun/relay.c b/staprun/relay.c +index c76e76719..3eb8df34b 100644 +--- a/staprun/relay.c ++++ b/staprun/relay.c +@@ -12,16 +12,16 @@ + + #include "staprun.h" + +-int out_fd[NR_CPUS]; ++int out_fd[MAX_NR_CPUS]; + int monitor_end = 0; +-static pthread_t reader[NR_CPUS]; +-static int relay_fd[NR_CPUS]; +-static int avail_cpus[NR_CPUS]; +-static int switch_file[NR_CPUS]; +-static pthread_mutex_t mutex[NR_CPUS]; ++static pthread_t reader[MAX_NR_CPUS]; ++static int relay_fd[MAX_NR_CPUS]; ++static int avail_cpus[MAX_NR_CPUS]; ++static int switch_file[MAX_NR_CPUS]; ++static pthread_mutex_t mutex[MAX_NR_CPUS]; + static int bulkmode = 0; + static volatile int stop_threads = 0; +-static time_t *time_backlog[NR_CPUS]; ++static time_t *time_backlog[MAX_NR_CPUS]; + static int backlog_order=0; + #define BACKLOG_MASK ((1 << backlog_order) - 1) + #define MONITORLINELENGTH 4096 +@@ -313,12 +313,19 @@ int init_relayfs(void) + if (send_request(STP_BULK, rqbuf, sizeof(rqbuf)) == 0) + bulkmode = 1; + +- /* Try to open a slew of per-cpu trace%d files. Per PR19241, we +- need to go through all potentially present CPUs up to NR_CPUS, that +- we hope is a reasonable limit. For !bulknode, "trace0" will be +- typically used. */ ++ /* Try to open a slew of per-cpu trace%d files. Per PR19241, ++ we need to go through all potentially present CPUs up to ++ get_nprocs_conf(), up to MAX_NR_CPUS (used for array ++ allocations). For !bulknode, "trace0" will be typically ++ used, prior to systemtap 4.5; after, all are used. */ + +- for (i = 0; i < NR_CPUS; i++) { ++ int nprocs = get_nprocs_conf(); ++ if (nprocs > MAX_NR_CPUS) { ++ err("Too many CPUs: get_nprocs_conf()=%d vs MAX_NR_CPUS=%d\n", nprocs, MAX_NR_CPUS); ++ return -1; ++ } ++ ++ for (i = 0; i < nprocs; i++) { + relay_fd[i] = -1; + + #ifdef HAVE_OPENAT +@@ -348,7 +355,8 @@ int init_relayfs(void) + } + } + ncpus = cpui; +- dbug(2, "ncpus=%d, bulkmode = %d\n", ncpus, bulkmode); ++ /* ncpus could be smaller than nprocs if some cpus are offline */ ++ dbug(2, "ncpus=%d, nprocs=%d, bulkmode=%d\n", ncpus, nprocs, bulkmode); + for (i = 0; i < ncpus; i++) + dbug(2, "cpui=%d, relayfd=%d\n", i, avail_cpus[i]); + +diff --git a/staprun/relay_old.c b/staprun/relay_old.c +index f0d2e918f..248e6059d 100644 +--- a/staprun/relay_old.c ++++ b/staprun/relay_old.c +@@ -14,12 +14,12 @@ + #include "staprun.h" + + /* temporary per-cpu output written here for relayfs, filebase0...N */ +-static int relay_fd[NR_CPUS]; +-static int proc_fd[NR_CPUS]; +-static FILE *percpu_tmpfile[NR_CPUS]; +-static char *relay_buffer[NR_CPUS]; +-static pthread_t reader[NR_CPUS]; +-static int switch_file[NR_CPUS]; ++static int relay_fd[MAX_NR_CPUS]; ++static int proc_fd[MAX_NR_CPUS]; ++static FILE *percpu_tmpfile[MAX_NR_CPUS]; ++static char *relay_buffer[MAX_NR_CPUS]; ++static pthread_t reader[MAX_NR_CPUS]; ++static int switch_file[MAX_NR_CPUS]; + static int bulkmode = 0; + unsigned subbuf_size = 0; + unsigned n_subbufs = 0; +@@ -37,7 +37,7 @@ static struct buf_status + { + struct _stp_buf_info info; + unsigned max_backlog; /* max # sub-buffers ready at one time */ +-} status[NR_CPUS]; ++} status[MAX_NR_CPUS]; + + + /** +@@ -461,7 +461,13 @@ int init_oldrelayfs(void) + relay_fd[0] = -1; + out_fd[0] = 0; + +- for (i = 0; i < NR_CPUS; i++) { ++ int nprocs = get_nprocs_conf(); ++ if (nprocs > MAX_NR_CPUS) { ++ err("Too many CPUs: get_nprocs_conf()=%d vs MAX_NR_CPUS=%d\n", nprocs, MAX_NR_CPUS); ++ goto err; ++ } ++ ++ for (i = 0; i < nprocs; i++) { + int ret = open_relayfs_files(i, relay_filebase, proc_filebase); + if (ret == 0) + break; +@@ -472,7 +478,8 @@ int init_oldrelayfs(void) + } + + ncpus = i; +- dbug(2, "ncpus=%d\n", ncpus); ++ /* ncpus could be smaller than nprocs if some cpus are offline */ ++ dbug(2, "ncpus=%d, nprocs=%d\n", ncpus, nprocs); + + if (ncpus == 0) { + err("Couldn't open relayfs files.\n"); +diff --git a/staprun/stap_merge.c b/staprun/stap_merge.c +index 7507f0e3d..87de7d465 100644 +--- a/staprun/stap_merge.c ++++ b/staprun/stap_merge.c +@@ -31,15 +31,15 @@ static void usage (char *prog) + } + + #define TIMESTAMP_SIZE (sizeof(int)) +-#define NR_CPUS 256 ++#define MAX_NR_CPUS 1024 + + int main (int argc, char *argv[]) + { + char *buf, *outfile_name = NULL; + int c, i, j, rc, dropped=0; +- long count=0, min, num[NR_CPUS] = { 0 }; ++ long count=0, min, num[MAX_NR_CPUS] = { 0 }; + FILE *ofp = NULL; +- FILE *fp[NR_CPUS] = { 0 }; ++ FILE *fp[MAX_NR_CPUS] = { 0 }; + int ncpus, len, verbose = 0; + int bufsize = 65536; + +@@ -67,6 +67,10 @@ int main (int argc, char *argv[]) + + i = 0; + while (optind < argc) { ++ if (i >= MAX_NR_CPUS) { ++ fprintf(stderr, "too many files (MAX_NR_CPUS=%d)\n", MAX_NR_CPUS); ++ return -1; ++ } + fp[i] = fopen(argv[optind++], "r"); + if (!fp[i]) { + fprintf(stderr, "error opening file %s.\n", argv[optind - 1]); +diff --git a/staprun/staprun.h b/staprun/staprun.h +index e05dbe5b6..2d68bf527 100644 +--- a/staprun/staprun.h ++++ b/staprun/staprun.h +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + /* Include config.h to pick up dependency for --prefix usage. */ + #include "../config.h" +@@ -285,10 +286,10 @@ extern int optopt; + extern int optind; + + /* maximum number of CPUs we can handle */ +-#define NR_CPUS 256 ++#define MAX_NR_CPUS 1024 + + /* relay*.c uses these */ +-extern int out_fd[NR_CPUS]; ++extern int out_fd[MAX_NR_CPUS]; + + /* relay_old uses these. Set in ctl.c */ + extern unsigned subbuf_size; +commit b4b5a29b51586f75de16cacdb44bdf0b3ad0478e +Author: Sultan Alsawaf +Date: Mon Dec 14 13:20:34 2020 -0800 + + staprun: use the correct out_fd when bulkmode and fsize_max aren't used + + When bulkmode and fsize_max aren't used, there is only one output fd and + it is stored at out_fd[avail_cpus[0]]. + +diff --git a/staprun/relay.c b/staprun/relay.c +index 3eb8df34b..d0202e52f 100644 +--- a/staprun/relay.c ++++ b/staprun/relay.c +@@ -232,10 +232,17 @@ static void *reader_thread(void *data) + wbuf += bytes; + wsize += bytes; + } else { +- rc = write(out_fd[cpu], wbuf, wbytes); ++ int fd; ++ /* Only bulkmode and fsize_max use per-cpu output files. Otherwise, ++ there's just a single output fd stored at out_fd[avail_cpus[0]]. */ ++ if (bulkmode || fsize_max) ++ fd = out_fd[cpu]; ++ else ++ fd = out_fd[avail_cpus[0]]; ++ rc = write(fd, wbuf, wbytes); + if (rc <= 0) { + perr("Couldn't write to output %d for cpu %d, exiting.", +- out_fd[cpu], cpu); ++ fd, cpu); + goto error_out; + } + wbytes -= rc; +commit b26b4e2c257e0bd65134eed5e51d754227a4ed3f +Author: Sultan Alsawaf +Date: Wed Dec 30 14:21:42 2020 -0800 + + task_finder2: fix panics due to broken task work cancellation + + The task_work_cancel() API uses function pointers to uniquely identify + task work structs, so there's no guarantee that a specific task work + struct we want to cancel is the one that will actually get canceled. + This issue would cause task work structs to be freed while they were + still queued up on the task's task-worker list. + + This is an example of one such panic, where the DEBUG_MEM feature + reported that a __stp_tf_task_work struct (56 bytes) wasn't freed, + because that specific task worker got canceled and instead an active + task worker got freed! + + orxray_resty_mem_X_35062: ERROR: Memory ffff8809ed388620 len=56 allocation type: kmalloc. Not freed. + BUG: unable to handle kernel paging request at ffffffffa0570877 + IP: [] 0xffffffffa0570876 + PGD 1abd067 PUD 1abe063 PMD 1028286067 PTE 0 + Oops: 0010 [#1] SMP + CPU: 3 PID: 1338 Comm: nginx Tainted: G OE ------------ 3.10.0-514.10.2.el7.x86_64.debug #1 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 + task: ffff880eae2d0000 ti: ffff880eaf2e4000 task.ti: ffff880eaf2e4000 + RIP: 0010:[] [] 0xffffffffa0570876 + RSP: 0018:ffff880eaf2e7d78 EFLAGS: 00010282 + RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000 + RDX: ffff8809ed388640 RSI: 0000000000000000 RDI: ffff8809ed388640 + RBP: ffff880eaf2e7da0 R08: 0000000000000000 R09: 0000000000000000 + R10: 0000000000000001 R11: ffffffffff90001c R12: ffffffff8248b1c0 + R13: ffff880eae2d0818 R14: ffff880eae2d0000 R15: 00007eff3d2490b0 + FS: 00007eff3dcd2740(0000) GS:ffff881037c00000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: ffffffffa0570877 CR3: 0000000ebce67000 CR4: 00000000003406e0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 + Stack: + ffffffff810c6544 ffff880eaf2e7f58 ffff880eaf2e7e70 ffff880eae2d0000 + 00007eff3dcb3338 ffff880eaf2e7e38 ffffffff810b31ba ffff880eaf2e7dc0 + ffffffff8106c279 ffff880eaf2e7e50 ffff880ef8a792c0 ffff880eaf2e7df8 + Call Trace: + [] ? task_work_run+0xb4/0xe0 + [] get_signal_to_deliver+0x85a/0x960 + [] ? kvm_sched_clock_read+0x9/0x20 + [] ? sched_clock_local+0x1d/0x80 + [] ? sched_clock_cpu+0xb8/0xe0 + [] do_signal+0x57/0x6e0 + [] ? int_very_careful+0x5/0xd + [] do_notify_resume+0x5f/0xb0 + [] int_signal+0x12/0x17 + Code: Bad RIP value. + RIP [] 0xffffffffa0570876 + RSP + CR2: ffffffffa0570877 + ---[ end trace 1cdf8e5b522b246e ]--- + +diff --git a/runtime/linux/task_finder2.c b/runtime/linux/task_finder2.c +index 4e0b68f7c..ecf1f77fd 100644 +--- a/runtime/linux/task_finder2.c ++++ b/runtime/linux/task_finder2.c +@@ -226,10 +226,22 @@ static void __stp_tf_cancel_all_task_work(void) + // Cancel all remaining requests. + stp_spin_lock_irqsave(&__stp_tf_task_work_list_lock, flags); + list_for_each_entry_safe(node, tmp, &__stp_tf_task_work_list, list) { +- if (stp_task_work_cancel(node->task, node->work.func)) { +- list_del(&node->list); +- _stp_kfree(node); +- } ++ struct __stp_tf_task_work *tf_work; ++ struct task_work *work; ++ ++ work = stp_task_work_cancel(node->task, node->work.func); ++ if (!work) ++ continue; ++ ++ /* ++ * There can be multiple queued task workers with the same ++ * worker func, so there's no guarantee that tf_work == node. ++ * Therefore, we can only free what stp_task_work_cancel() just ++ * gave us; freeing 'node' would be unsafe. ++ */ ++ tf_work = container_of(work, typeof(*tf_work), work); ++ list_del(&tf_work->list); ++ _stp_kfree(tf_work); + } + stp_spin_unlock_irqrestore(&__stp_tf_task_work_list_lock, flags); + } +commit 96470399a5a6fba864b90afd15eda43cdc8c8ac4 +Author: Sultan Alsawaf +Date: Wed Dec 30 15:42:11 2020 -0800 + + task_finder2: fix list corruption in __stp_tf_cancel_all_task_work() + + The previous commit (b26b4e2c2 "task_finder2: fix panics due to broken + task work cancellation") made it possible for the next node in the task + work list to be free, which would made list_for_each_entry_safe() not so + safe anymore. Using list_for_each_entry_safe() is still the fastest + approach here, so when the next node in the list happens to be freed, we + should just restart iteration on the list. + +diff --git a/runtime/linux/task_finder2.c b/runtime/linux/task_finder2.c +index ecf1f77fd..83fc17b5e 100644 +--- a/runtime/linux/task_finder2.c ++++ b/runtime/linux/task_finder2.c +@@ -225,6 +225,7 @@ static void __stp_tf_cancel_all_task_work(void) + + // Cancel all remaining requests. + stp_spin_lock_irqsave(&__stp_tf_task_work_list_lock, flags); ++restart: + list_for_each_entry_safe(node, tmp, &__stp_tf_task_work_list, list) { + struct __stp_tf_task_work *tf_work; + struct task_work *work; +@@ -242,6 +243,21 @@ static void __stp_tf_cancel_all_task_work(void) + tf_work = container_of(work, typeof(*tf_work), work); + list_del(&tf_work->list); + _stp_kfree(tf_work); ++ ++ /* ++ * If the tf_work we just freed was the next node in the list, ++ * then we need to restart the list iteration because ++ * list_for_each_entry_safe() can't cope with the next node ++ * being freed. We still need to use list_for_each_entry_safe() ++ * because we need to get through one successful pass through ++ * the entire list, since it's not guaranteed that this list ++ * will be empty when this function exits, as there can still be ++ * active task workers running, which is fine since the ++ * stp_task_work API will wait for all task workers to finish ++ * before allowing the module to unload. ++ */ ++ if (tf_work == tmp) ++ goto restart; + } + stp_spin_unlock_irqrestore(&__stp_tf_task_work_list_lock, flags); + } +commit 6cb54128e005d1220a7b064ee42b9f72561c28e7 +Author: Sultan Alsawaf +Date: Wed Dec 30 15:47:58 2020 -0800 + + task_finder2: fix task worker race on module unload + + Unfortunately, __stp_tf_cancel_all_task_work() does not guarantee that + all of the task finder's task workers will be finished executing when it + returns. In this case, we rely on the stp_task_work API to prevent the + module from being unloaded while there are task workers in-flight, which + works, but the stp_task_work API is notified of a task worker finishing + before it actually finishes. Inside __stp_tf_task_worker_fn(), the + call to the task worker's function (tf_work->func) is where the final + refcount in the stp_task_work API could be put, but there will still be + instructions left in the task worker that will be executing for a short + time after that. In that short time, there can be a race where the + module is unloaded before the task worker finishes executing all of its + instructions, especially if the task worker gets preempted during this + time on a PREEMPT kernel. + + To remedy this, we must ensure that the last instruction in + __stp_tf_task_worker_fn() is where the stp_task_work API is notified of + a task worker finishing. + +diff --git a/runtime/linux/task_finder2.c b/runtime/linux/task_finder2.c +index 83fc17b5e..2bab19295 100644 +--- a/runtime/linux/task_finder2.c ++++ b/runtime/linux/task_finder2.c +@@ -150,6 +150,7 @@ __stp_tf_task_worker_fn(struct task_work *work) + * workers for this task. + */ + __stp_tf_task_work_free(work); ++ stp_task_work_func_done(); + } + + static void +@@ -1066,11 +1067,8 @@ __stp_tf_clone_worker(struct task_work *work) + + might_sleep(); + if (atomic_read(&__stp_task_finder_state) != __STP_TF_RUNNING +- || current->flags & PF_EXITING) { +- /* Remember that this task_work_func is finished. */ +- stp_task_work_func_done(); ++ || current->flags & PF_EXITING) + return; +- } + + __stp_tf_handler_start(); + +@@ -1085,10 +1083,6 @@ __stp_tf_clone_worker(struct task_work *work) + } + + __stp_tf_handler_end(); +- +- /* Remember that this task_work_func is finished. */ +- stp_task_work_func_done(); +- return; + } + + +@@ -1392,11 +1386,8 @@ __stp_tf_quiesce_worker(struct task_work *work) + + might_sleep(); + if (atomic_read(&__stp_task_finder_state) != __STP_TF_RUNNING +- || current->flags & PF_EXITING) { +- /* Remember that this task_work_func is finished. */ +- stp_task_work_func_done(); ++ || current->flags & PF_EXITING) + return; +- } + + /* If we had a build-id based executable probe (so we have a + * tgt->build_id) set, we could not check it back in +@@ -1420,8 +1411,6 @@ __stp_tf_quiesce_worker(struct task_work *work) + (long) current->tgid, ok); + if (!ok) { + // stap_utrace_detach (current, & tgt->ops); +- /* Remember that this task_work_func is finished. */ +- stp_task_work_func_done(); + return; + } + } +@@ -1444,10 +1433,6 @@ __stp_tf_quiesce_worker(struct task_work *work) + __stp_call_callbacks(tgt, current, 1, (current->pid == current->tgid)); + + __stp_tf_handler_end(); +- +- /* Remember that this task_work_func is finished. */ +- stp_task_work_func_done(); +- return; + } + + static u32 +@@ -1614,18 +1599,12 @@ __stp_tf_mmap_worker(struct task_work *work) + + // See if we can find saved syscall info. + entry = __stp_tf_get_map_entry(current); +- if (entry == NULL) { +- /* Remember that this task_work_func is finished. */ +- stp_task_work_func_done(); ++ if (entry == NULL) + return; +- } + + if (atomic_read(&__stp_task_finder_state) != __STP_TF_RUNNING + || current->flags & PF_EXITING) { + __stp_tf_remove_map_entry(entry); +- +- /* Remember that this task_work_func is finished. */ +- stp_task_work_func_done(); + return; + } + +@@ -1650,10 +1629,6 @@ __stp_tf_mmap_worker(struct task_work *work) + __stp_tf_remove_map_entry(entry); + + __stp_tf_handler_end(); +- +- /* Remember that this task_work_func is finished. */ +- stp_task_work_func_done(); +- return; + } + + static u32 diff --git a/SOURCES/rhbz1908904.patch b/SOURCES/rhbz1908904.patch new file mode 100644 index 0000000..7cbe564 --- /dev/null +++ b/SOURCES/rhbz1908904.patch @@ -0,0 +1,44 @@ +commit e3287bddcdc51705bb206479daa82e97fb28071f +Author: Frank Ch. Eigler +Date: Wed Dec 9 22:29:43 2020 -0500 + + PR27044: fix lock loop for conditional probes + + Emit a nested block carefully so that the "goto out;" from a failed + stp_lock_probe() call in that spot near the epilogue of a + probe-handler goes downward, not upward. + +diff --git a/translate.cxx b/translate.cxx +index f0195486c..d7fe1e3b2 100644 +--- a/translate.cxx ++++ b/translate.cxx +@@ -2969,21 +2969,21 @@ c_unparser::emit_probe (derived_probe* v) + { + // PR26296 + // emit all read/write locks for global variables ... if somehow still not done by now ++ // emit a local out: label, for error catching in these condition exprs ++ o->newline() << "{"; ++ o->newline(1) << "__label__ out, deref_fault;"; + if (v->needs_global_locks ()) + emit_lock (); + + for (set::const_iterator +- it = v->probes_with_affected_conditions.begin(); +- it != v->probes_with_affected_conditions.end(); ++it) ++ it = v->probes_with_affected_conditions.begin(); ++ it != v->probes_with_affected_conditions.end(); ++it) + { +- // emit a local out: label, for error catching in these condition exprs +- o->newline() << "{"; +- o->newline(1) << "__label__ out, deref_fault;"; + emit_probe_condition_update(*it); +- o->newline(-1) << "deref_fault: __attribute__((unused));"; +- o->newline() << "out: __attribute__((unused));"; +- o->newline() << "}"; + } ++ o->newline(-1) << "deref_fault: __attribute__((unused));"; ++ o->newline() << "out: __attribute__((unused));"; ++ o->newline() << "}"; + } + + // PR26296 diff --git a/SPECS/systemtap.spec b/SPECS/systemtap.spec index 04e9229..9ca5d64 100644 --- a/SPECS/systemtap.spec +++ b/SPECS/systemtap.spec @@ -54,7 +54,8 @@ %if 0%{?fedora} >= 18 || 0%{?rhel} >= 6 %define initdir %{sysconfdir}/rc.d/init.d # not scl-wrapped _initddir -%else # RHEL5 doesn't know _initddir +%else + # RHEL5 doesn't know _initddir %define initdir %{_initrddir} %endif @@ -64,7 +65,8 @@ %else %if 0%{?rhel} >= 6 %define udevrulesdir /lib/udev/rules.d - %else # RHEL5 + %else + # RHEL5 %define udevrulesdir /etc/udev/rules.d %endif %endif @@ -94,8 +96,8 @@ %define __brp_mangle_shebangs_exclude_from .stp$ Name: %{?scl_prefix}systemtap -Version: 4.3 -Release: 7%{?dist} +Version: 4.4 +Release: 5%{?release_override}%{?dist} # for version, see also configure.ac @@ -131,8 +133,12 @@ License: GPLv2+ URL: http://sourceware.org/systemtap/ Source: ftp://sourceware.org/pub/systemtap/releases/systemtap-%{version}.tar.gz -Patch10: rhbz1847676,1857749.patch -Patch11: rhbz1855264.patch +Patch1: rhbz1873492.patch +Patch2: rhbz1898288.patch +Patch3: rhbz1902696.patch +Patch4: rhbz1908904.patch +Patch5: rhbz1906662.patch +Patch6: rhbz1650594.patch # Build* BuildRequires: gcc-c++ @@ -140,6 +146,7 @@ BuildRequires: cpio BuildRequires: gettext-devel BuildRequires: pkgconfig(nss) BuildRequires: pkgconfig(avahi-client) +BuildRequires: %{?scl_prefix}elfutils-debuginfod-client-devel >= 0.179 %if %{with_dyninst} BuildRequires: %{?scl_prefix}dyninst-devel >= 10.0 BuildRequires: pkgconfig(libselinux) @@ -363,8 +370,8 @@ URL: http://sourceware.org/systemtap/ Requires: %{?scl_prefix}systemtap = %{version}-%{release} Requires: %{?scl_prefix}systemtap-sdt-devel = %{version}-%{release} Requires: %{?scl_prefix}systemtap-server = %{version}-%{release} -Requires: %{?scl_prefix}elfutils -Requires: dejagnu which grep nc +Requires: dejagnu which %{?scl_prefix}elfutils grep nc +Requires: %{?scl_prefix}elfutils-debuginfod Requires: gcc gcc-c++ make glibc-devel # testsuite/systemtap.base/ptrace.exp needs strace Requires: strace @@ -509,9 +516,12 @@ systemtap-runtime-virthost machine to execute systemtap scripts. %prep %setup -q -n systemtap-%{version} - -%patch10 -p1 -%patch11 -p1 +%patch1 -p1 +%patch2 -p1 +%patch3 -p1 +%patch4 -p1 +%patch5 -p1 +%patch6 -p1 %build @@ -1248,6 +1258,20 @@ done # PRERELEASE %changelog +* Wed Feb 10 2021 Frank Ch. Eigler - 4.4-5 +- rhbz1878303 - align with base rhel systemtap-4.4-9.el8 + +* Tue Jan 26 2021 Martin Cermak - 4.4-4 +- rhbz1878303 - align with base rhel systemtap-4.4-8.el8 + +* Sun Nov 22 2020 Frank Ch. Eigler - 4.4-3 +- rhbz1898288: stability for exhausted UTRACE_TASK_WORK_POOL +- rhbz1873492 related: mokutil parser robustness for RH keys +- rhbz1873492 related: rhel8 kernel_is_locked_down detection + +* Mon Nov 09 2020 Frank Ch. Eigler - 4.4-1 +- Upstream release. + * Tue Jul 28 2020 Frank Ch. Eigler - 4.3-7 - Fix rhbz1857749 with rhel8.3 baseos patch - Fix rhbz1852021 with more complete rhel8.3 baseos patch