commit 9b494b7006bcc7f3f0bf3be9e3c6d3a5e703a728
Author: Dave Anderson
Date:   Tue Jun 26 16:00:28 2018 -0400

    Update for the "kmem -V" option to also dump the global entries
    contained in the "vm_numa_stat" array that was introduced in Linux
    4.14.  Also, the command output now separates the "vm_zone_stat",
    "vm_node_stat" and "vm_numa_stat" entries into sections with
    "VM_ZONE_STAT", "VM_NODE_STAT" and "VM_NUMA_STAT" headers.  Without
    the patch, the "vm_zone_stat" and "vm_node_stat" entries are listed
    together under a single "VM_STAT" header.
    (anderson@redhat.com)

diff --git a/help.c b/help.c
index 5a52650..638c6ec 100644
--- a/help.c
+++ b/help.c
@@ -6451,9 +6451,10 @@ char *help_kmem[] = {
 "  -C  same as -c, but also dumps all pages in the page_hash_table.",
 "  -i  displays general memory usage information",
 "  -v  displays the mapped virtual memory regions allocated by vmalloc().",
-"  -V  displays the kernel vm_stat table if it exists, the cumulative",
-"      page_states counter values if they exist, and/or the cumulative",
-"      vm_event_states counter values if they exist.",
+"  -V  displays the kernel vm_stat table if it exists, or in more recent",
+"      kernels, the vm_zone_stat, vm_node_stat and vm_numa_stat tables,",
+"      the cumulative page_states counter values if they exist, and/or",
+"      the cumulative vm_event_states counter values if they exist.",
 "  -n  display memory node data (if supported).",
 "  -z  displays per-zone memory statistics.",
 "  -o  displays each cpu's offset value that is added to per-cpu symbol",
@@ -6761,24 +6762,69 @@ char *help_kmem[] = {
 "    f63d5cc0  f6287b80  f83c2000 - f84c3000  1052672",
 "    ...",
 " ",
-"  Dump the vm_table contents:\n",
+"  Dump the virtual memory statistics:\n",
 "  %s> kmem -V",
-"           NR_ANON_PAGES: 38989",
-"          NR_FILE_MAPPED: 3106",
-"           NR_FILE_PAGES: 169570",
-"                 NR_SLAB: 32439",
-"            NR_PAGETABLE: 1181",
-"           NR_FILE_DIRTY: 4633",
-"            NR_WRITEBACK: 0",
-"         NR_UNSTABLE_NFS: 0",
-"               NR_BOUNCE: 0",
-"                NUMA_HIT: 63545992",
-"               NUMA_MISS: 0",
-"            NUMA_FOREIGN: 0",
-"     NUMA_INTERLEAVE_HIT: 24002",
-"              NUMA_LOCAL: 63545992",
-"              NUMA_OTHER: 0",
-" ",
+"  VM_ZONE_STAT:",
+"           NR_FREE_PAGES: 30085",
+"   NR_ZONE_INACTIVE_ANON: 1985",
+"     NR_ZONE_ACTIVE_ANON: 338275",
+"   NR_ZONE_INACTIVE_FILE: 19760",
+"     NR_ZONE_ACTIVE_FILE: 12018",
+"     NR_ZONE_UNEVICTABLE: 0",
+"   NR_ZONE_WRITE_PENDING: 4",
+"                NR_MLOCK: 0",
+"            NR_PAGETABLE: 1562",
+"      NR_KERNEL_STACK_KB: 1728",
+"               NR_BOUNCE: 0",
+"       NR_FREE_CMA_PAGES: 0",
+" ",
+"  VM_NODE_STAT:",
+"        NR_INACTIVE_ANON: 1985",
+"          NR_ACTIVE_ANON: 338275",
+"        NR_INACTIVE_FILE: 19760",
+"          NR_ACTIVE_FILE: 12018",
+"          NR_UNEVICTABLE: 0",
+"     NR_SLAB_RECLAIMABLE: 3111",
+"   NR_SLAB_UNRECLAIMABLE: 3039",
+"        NR_ISOLATED_ANON: 0",
+"        NR_ISOLATED_FILE: 0",
+"      WORKINGSET_REFAULT: 0",
+"     WORKINGSET_ACTIVATE: 0",
+"  WORKINGSET_NODERECLAIM: 0",
+"          NR_ANON_MAPPED: 338089",
+"          NR_FILE_MAPPED: 8102",
+"           NR_FILE_PAGES: 33949",
+"           NR_FILE_DIRTY: 4",
+"            NR_WRITEBACK: 0",
+"       NR_WRITEBACK_TEMP: 0",
+"                NR_SHMEM: 2171",
+"           NR_SHMEM_THPS: 0",
+"      NR_SHMEM_PMDMAPPED: 0",
+"            NR_ANON_THPS: 86",
+"         NR_UNSTABLE_NFS: 0",
+"         NR_VMSCAN_WRITE: 0",
+"     NR_VMSCAN_IMMEDIATE: 0",
+"              NR_DIRTIED: 155",
+"              NR_WRITTEN: 75",
+" ",
+"  VM_NUMA_STAT:",
+"                NUMA_HIT: 575409",
+"               NUMA_MISS: 0",
+"            NUMA_FOREIGN: 0",
+"     NUMA_INTERLEAVE_HIT: 12930",
+"              NUMA_LOCAL: 575409",
+"              NUMA_OTHER: 0",
+" ",
+"  VM_EVENT_STATES:",
+"                  PGPGIN: 282492",
+"                 PGPGOUT: 6773",
+"                  PSWPIN: 0",
+"                 PSWPOUT: 0",
+"             PGALLOC_DMA: 0",
+"           PGALLOC_DMA32: 693092",
+"          PGALLOC_NORMAL: 0",
+"  ...",
+" ",
 "  Display hugepage hstate information: \n",
 "  %s> kmem -h",
 "  HSTATE    SIZE      FREE     TOTAL  NAME",

diff --git a/memory.c b/memory.c
index 5c0a853..81ed689 100644
--- a/memory.c
+++ b/memory.c
@@ -17422,7 +17422,7 @@ vm_stat_init(void)
     int c ATTRIBUTE_UNUSED;
     struct gnu_request *req;
     char *start;
-    long enum_value, zc = -1;
+    long enum_value, zone_cnt = -1, node_cnt = -1;
     int split_vmstat = 0, ni = 0;
 
     if (vt->flags & VM_STAT)
@@ -17451,11 +17451,21 @@ vm_stat_init(void)
     } else if (symbol_exists("vm_zone_stat") &&
         get_symbol_type("vm_zone_stat",
         NULL, NULL) == TYPE_CODE_ARRAY) {
-        vt->nr_vm_stat_items =
-            get_array_length("vm_zone_stat", NULL, 0)
-            + get_array_length("vm_node_stat", NULL, 0);
-        split_vmstat = 1;
-        enumerator_value("NR_VM_ZONE_STAT_ITEMS", &zc);
+        if (symbol_exists("vm_numa_stat")) {
+            vt->nr_vm_stat_items =
+                get_array_length("vm_zone_stat", NULL, 0)
+                + get_array_length("vm_node_stat", NULL, 0)
+                + get_array_length("vm_numa_stat", NULL, 0);
+            split_vmstat = 2;
+            enumerator_value("NR_VM_ZONE_STAT_ITEMS", &zone_cnt);
+            enumerator_value("NR_VM_NODE_STAT_ITEMS", &node_cnt);
+        } else {
+            vt->nr_vm_stat_items =
+                get_array_length("vm_zone_stat", NULL, 0)
+                + get_array_length("vm_node_stat", NULL, 0);
+            split_vmstat = 1;
+            enumerator_value("NR_VM_ZONE_STAT_ITEMS", &zone_cnt);
+        }
     } else {
         goto bailout;
     }
@@ -17468,13 +17478,20 @@ vm_stat_init(void)
     req->flags = GNU_PRINT_ENUMERATORS;
     gdb_interface(req);
 
-    if (split_vmstat) {
+    if (split_vmstat >= 1) {
         req->command = GNU_GET_DATATYPE;
         req->name = "node_stat_item";
         req->flags = GNU_PRINT_ENUMERATORS;
         gdb_interface(req);
     }
 
+    if (split_vmstat == 2) {
+        req->command = GNU_GET_DATATYPE;
+        req->name = "numa_stat_item";
+        req->flags = GNU_PRINT_ENUMERATORS;
+        gdb_interface(req);
+    }
+
     FREEBUF(req);
 
     stringlen = 1;
@@ -17488,15 +17505,20 @@ vm_stat_init(void)
         c = parse_line(buf, arglist);
         if ((!split_vmstat &&
              STREQ(arglist[0], "NR_VM_ZONE_STAT_ITEMS")) ||
-            (split_vmstat &&
-             STREQ(arglist[0], "NR_VM_NODE_STAT_ITEMS"))) {
+            ((split_vmstat == 1) &&
+             STREQ(arglist[0], "NR_VM_NODE_STAT_ITEMS")) ||
+            ((split_vmstat == 2) &&
+             STREQ(arglist[0], "NR_VM_NUMA_STAT_ITEMS"))) {
             if (LKCD_KERNTYPES())
                 vt->nr_vm_stat_items =
                     MAX(atoi(arglist[2]), count);
             break;
-        } else if (split_vmstat &&
+        } else if ((split_vmstat == 1) &&
             STREQ(arglist[0], "NR_VM_ZONE_STAT_ITEMS")) {
             continue;
+        } else if ((split_vmstat == 2) &&
+            STREQ(arglist[0], "NR_VM_NODE_STAT_ITEMS")) {
+            continue;
         } else {
             stringlen += strlen(arglist[0]) + 1;
             count++;
@@ -17523,8 +17545,11 @@ vm_stat_init(void)
         }
 
         i = ni + enum_value;
-        if (!ni && (enum_value == zc)) {
-            ni = zc;
+        if (!ni && (enum_value == zone_cnt)) {
+            ni = zone_cnt;
+            continue;
+        } else if ((ni == zone_cnt) && (enum_value == node_cnt)) {
+            ni += node_cnt;
             continue;
         }
 
@@ -17556,8 +17581,8 @@ dump_vm_stat(char *item, long *retval, ulong zone)
     char *buf;
     ulong *vp;
     ulong location;
-    int i, maxlen, len;
-    long tc, zc = 0, nc = 0;
+    int i, maxlen, len, node_start = -1, numa_start = -1;
+    long total_cnt, zone_cnt = 0, node_cnt = 0, numa_cnt = 0;
     int split_vmstat = 0;
 
     if (!vm_stat_init()) {
@@ -17570,48 +17595,86 @@ dump_vm_stat(char *item, long *retval, ulong zone)
 
     buf = GETBUF(sizeof(ulong) * vt->nr_vm_stat_items);
 
-    if (symbol_exists("vm_node_stat") && symbol_exists("vm_zone_stat"))
+    if (symbol_exists("vm_node_stat") && symbol_exists("vm_zone_stat") &&
+        symbol_exists("vm_numa_stat"))
+        split_vmstat = 2;
+    else if (symbol_exists("vm_node_stat") && symbol_exists("vm_zone_stat"))
        split_vmstat = 1;
     else
        location = zone ? zone : symbol_value("vm_stat");
 
-    if (split_vmstat) {
-        enumerator_value("NR_VM_ZONE_STAT_ITEMS", &zc);
+    if (split_vmstat == 1) {
+        enumerator_value("NR_VM_ZONE_STAT_ITEMS", &zone_cnt);
         location = zone ? zone : symbol_value("vm_zone_stat");
         readmem(location, KVADDR, buf,
-            sizeof(ulong) * zc,
+            sizeof(ulong) * zone_cnt,
             "vm_zone_stat", FAULT_ON_ERROR);
         if (!zone) {
             location = symbol_value("vm_node_stat");
-            enumerator_value("NR_VM_NODE_STAT_ITEMS", &nc);
-            readmem(location, KVADDR, buf + (sizeof(ulong) * zc),
-                sizeof(ulong) * nc,
+            enumerator_value("NR_VM_NODE_STAT_ITEMS", &node_cnt);
+            readmem(location, KVADDR, buf + (sizeof(ulong) * zone_cnt),
+                sizeof(ulong) * node_cnt,
                 "vm_node_stat", FAULT_ON_ERROR);
         }
-        tc = zc + nc;
+        node_start = zone_cnt;
+        total_cnt = zone_cnt + node_cnt;
+    } else if (split_vmstat == 2) {
+        enumerator_value("NR_VM_ZONE_STAT_ITEMS", &zone_cnt);
+        location = zone ? zone : symbol_value("vm_zone_stat");
+        readmem(location, KVADDR, buf,
+            sizeof(ulong) * zone_cnt,
+            "vm_zone_stat", FAULT_ON_ERROR);
+        if (!zone) {
+            location = symbol_value("vm_node_stat");
+            enumerator_value("NR_VM_NODE_STAT_ITEMS", &node_cnt);
+            readmem(location, KVADDR, buf + (sizeof(ulong) * zone_cnt),
+                sizeof(ulong) * node_cnt,
+                "vm_node_stat", FAULT_ON_ERROR);
+        }
+        node_start = zone_cnt;
+        if (!zone) {
+            location = symbol_value("vm_numa_stat");
+            enumerator_value("NR_VM_NUMA_STAT_ITEMS", &numa_cnt);
+            readmem(location, KVADDR, buf + (sizeof(ulong) * (zone_cnt+node_cnt)),
+                sizeof(ulong) * numa_cnt,
+                "vm_numa_stat", FAULT_ON_ERROR);
+        }
+        numa_start = zone_cnt+node_cnt;
+        total_cnt = zone_cnt + node_cnt + numa_cnt;
     } else {
         readmem(location, KVADDR, buf,
             sizeof(ulong) * vt->nr_vm_stat_items,
             "vm_stat", FAULT_ON_ERROR);
-        tc = vt->nr_vm_stat_items;
+        total_cnt = vt->nr_vm_stat_items;
     }
 
     if (!item) {
-        if (!zone)
-            fprintf(fp, "  VM_STAT:\n");
-        for (i = maxlen = 0; i < tc; i++)
+        if (!zone) {
+            if (symbol_exists("vm_zone_stat"))
+                fprintf(fp, "  VM_ZONE_STAT:\n");
+            else
+                fprintf(fp, "  VM_STAT:\n");
+        }
+        for (i = maxlen = 0; i < total_cnt; i++)
             if ((len = strlen(vt->vm_stat_items[i])) > maxlen)
                 maxlen = len;
         vp = (ulong *)buf;
-        for (i = 0; i < tc; i++)
+        for (i = 0; i < total_cnt; i++) {
+            if (!zone) {
+                if ((i == node_start) && symbol_exists("vm_node_stat"))
+                    fprintf(fp, "\n  VM_NODE_STAT:\n");
+                if ((i == numa_start) && symbol_exists("vm_numa_stat"))
+                    fprintf(fp, "\n  VM_NUMA_STAT:\n");
+            }
             fprintf(fp, "%s%s: %ld\n",
                 space(maxlen - strlen(vt->vm_stat_items[i])),
                 vt->vm_stat_items[i], vp[i]);
+        }
         return TRUE;
     }
 
     vp = (ulong *)buf;
-    for (i = 0; i < tc; i++) {
+    for (i = 0; i < total_cnt; i++) {
         if (STREQ(vt->vm_stat_items[i], item)) {
             *retval = vp[i];
             return TRUE;

commit f294197b5511537e6b14d5e1db324f4fc4fdd3f8
Author: Dave Anderson
Date:   Fri Jul 6 10:57:50 2018 -0400

    Support for the "bpf" command on RHEL 3.10.0-913.el7 and later
    3.10-based RHEL7 kernels, which contain a backport of the upstream
    eBPF code, but still use the older, pre-4.11, IDR facility that
    does not use radix trees for linking the active bpf_prog and
    bpf_map structures.  Without the patch, the command indicates
    "bpf: command not supported or applicable on this architecture
    or kernel".
    (anderson@redhat.com)

diff --git a/bpf.c b/bpf.c
index 427263d..8871b76 100644
--- a/bpf.c
+++ b/bpf.c
@@ -45,6 +45,10 @@ static void bpf_prog_gpl_compatible(char *, ulong);
 static void dump_xlated_plain(void *, unsigned int, int);
 static void print_boot_time(unsigned long long, char *, unsigned int);
 
+static int do_old_idr(int, ulong, struct radix_tree_pair *);
+#define OLD_IDR_INIT   (1)
+#define OLD_IDR_COUNT  (2)
+#define OLD_IDR_GATHER (3)
 
 #define PROG_ID  (0x1)
 #define MAP_ID   (0x2)
@@ -167,7 +171,6 @@ bpf_init(struct bpf_info *bpf)
             !VALID_STRUCT(bpf_map) ||
             !VALID_STRUCT(bpf_insn) ||
             INVALID_MEMBER(bpf_prog_aux) ||
-            INVALID_MEMBER(idr_idr_rt) ||
             INVALID_MEMBER(bpf_prog_type) ||
             INVALID_MEMBER(bpf_prog_tag) ||
             INVALID_MEMBER(bpf_prog_jited_len) ||
@@ -210,6 +213,9 @@ bpf_init(struct bpf_info *bpf)
             mkstring(buf2, VADDR_PRLEN, CENTER|LJUST, "BPF_MAP"),
             mkstring(buf3, bpf->bpf_map_map_type_size,
             CENTER|LJUST, "BPF_MAP_TYPE"));
 
+        if (INVALID_MEMBER(idr_idr_rt))
+            do_old_idr(OLD_IDR_INIT, 0, NULL);
+
         bpf->status = TRUE;
         break;
@@ -220,24 +226,38 @@ bpf_init(struct bpf_info *bpf)
         command_not_supported();
     }
 
-    bpf->progs = do_radix_tree(symbol_value("prog_idr") + OFFSET(idr_idr_rt),
-        RADIX_TREE_COUNT, NULL);
+    if (VALID_MEMBER(idr_idr_rt))
+        bpf->progs = do_radix_tree(symbol_value("prog_idr") + OFFSET(idr_idr_rt),
+            RADIX_TREE_COUNT, NULL);
+    else
+        bpf->progs = do_old_idr(OLD_IDR_COUNT, symbol_value("prog_idr"), NULL);
+
     if (bpf->progs) {
         len = sizeof(struct radix_tree_pair) * (bpf->progs+1);
         bpf->proglist = (struct radix_tree_pair *)GETBUF(len);
         bpf->proglist[0].index = bpf->progs;
-        bpf->progs = do_radix_tree(symbol_value("prog_idr") + OFFSET(idr_idr_rt),
-            RADIX_TREE_GATHER, bpf->proglist);
+        if (VALID_MEMBER(idr_idr_rt))
+            bpf->progs = do_radix_tree(symbol_value("prog_idr") + OFFSET(idr_idr_rt),
+                RADIX_TREE_GATHER, bpf->proglist);
+        else
+            bpf->progs = do_old_idr(OLD_IDR_GATHER, symbol_value("prog_idr"), bpf->proglist);
     }
 
-    bpf->maps = do_radix_tree(symbol_value("map_idr") + OFFSET(idr_idr_rt),
-        RADIX_TREE_COUNT, NULL);
+    if (VALID_MEMBER(idr_idr_rt))
+        bpf->maps = do_radix_tree(symbol_value("map_idr") + OFFSET(idr_idr_rt),
+            RADIX_TREE_COUNT, NULL);
+    else
+        bpf->maps = do_old_idr(OLD_IDR_COUNT, symbol_value("map_idr"), NULL);
+
     if (bpf->maps) {
         len = sizeof(struct radix_tree_pair) * (bpf->maps+1);
         bpf->maplist = (struct radix_tree_pair *)GETBUF(len);
         bpf->maplist[0].index = bpf->maps;
-        bpf->maps = do_radix_tree(symbol_value("map_idr") + OFFSET(idr_idr_rt),
-            RADIX_TREE_GATHER, bpf->maplist);
+        if (VALID_MEMBER(idr_idr_rt))
+            bpf->maps = do_radix_tree(symbol_value("map_idr") + OFFSET(idr_idr_rt),
+                RADIX_TREE_GATHER, bpf->maplist);
+        else
+            bpf->maps = do_old_idr(OLD_IDR_GATHER, symbol_value("map_idr"), bpf->maplist);
     }
 
     bpf->bpf_prog_buf = GETBUF(SIZE(bpf_prog));
@@ -538,8 +558,10 @@ do_map_only:
     }
 
 bailout:
-    FREEBUF(bpf->proglist);
-    FREEBUF(bpf->maplist);
+    if (bpf->proglist)
+        FREEBUF(bpf->proglist);
+    if (bpf->maplist)
+        FREEBUF(bpf->maplist);
     FREEBUF(bpf->bpf_prog_buf);
     FREEBUF(bpf->bpf_prog_aux_buf);
     FREEBUF(bpf->bpf_map_buf);
@@ -1255,3 +1277,50 @@ print_boot_time(unsigned long long nsecs, char *buf, unsigned int size)
     sprintf(buf, "(unknown)");
 #endif
 }
+
+/*
+ *  Borrow the old (pre-radix_tree) IDR facility code used by
+ *  the ipcs command.
+ */
+static int
+do_old_idr(int cmd, ulong idr, struct radix_tree_pair *rtp)
+{
+    int i, max, cur, next_id, total = 0;
+    ulong entry;
+
+    switch (cmd)
+    {
+    case OLD_IDR_INIT:
+        ipcs_init();
+        break;
+
+    case OLD_IDR_COUNT:
+        readmem(idr + OFFSET(idr_cur), KVADDR, &cur,
+            sizeof(int), "idr.cur", FAULT_ON_ERROR);
+        for (total = next_id = 0; next_id < cur; next_id++) {
+            entry = idr_find(idr, next_id);
+            if (entry == 0)
+                continue;
+            total++;
+        }
+        break;
+
+    case OLD_IDR_GATHER:
+        max = rtp[0].index;
+        readmem(idr + OFFSET(idr_cur), KVADDR, &cur,
+            sizeof(int), "idr.cur", FAULT_ON_ERROR);
+        for (i = total = next_id = 0; next_id < cur; next_id++) {
+            entry = idr_find(idr, next_id);
+            if (entry == 0)
+                continue;
+            total++;
+            rtp[i].index = next_id;
+            rtp[i].value = (void *)entry;
+            if (++i == max)
+                break;
+        }
+        break;
+    }
+
+    return total;
+}

diff --git a/defs.h b/defs.h
index e6e3850..b05aecc 100644
--- a/defs.h
+++ b/defs.h
@@ -2031,6 +2031,7 @@ struct offset_table { /* stash of commonly-used offsets */
     long bpf_prog_aux_load_time;
     long bpf_prog_aux_user;
     long user_struct_uid;
+    long idr_cur;
 };
 
 struct size_table { /* stash of commonly-used sizes */
@@ -5591,6 +5592,12 @@ enum {
 void dev_init(void);
 void dump_dev_table(void);
 
+/*
+ *  ipcs.c
+ */
+void ipcs_init(void);
+ulong idr_find(ulong, int);
+
 #ifdef ARM
 void arm_init(int);
 void arm_dump_machdep_table(ulong);

diff --git a/ipcs.c b/ipcs.c
index ef51fdd..80f78e4 100644
--- a/ipcs.c
+++ b/ipcs.c
@@ -79,13 +79,11 @@ struct ipcs_table {
 /*
  * function declaration
  */
-static void ipcs_init(void);
 static int dump_shared_memory(int, ulong, int, ulong);
 static int dump_semaphore_arrays(int, ulong, int, ulong);
 static int dump_message_queues(int, ulong, int, ulong);
 static int ipc_search_idr(ulong, int, ulong, int (*)(ulong, int, ulong, int, int), int);
 static int ipc_search_array(ulong, int, ulong, int (*)(ulong, int, ulong, int, int), int);
-static ulong idr_find(ulong, int);
 static int dump_shm_info(ulong, int, ulong, int, int);
 static int dump_sem_info(ulong, int, ulong, int, int);
 static int dump_msg_info(ulong, int, ulong, int, int);
@@ -101,7 +99,7 @@ static void gather_radix_tree_entries(ulong);
  */
 static struct ipcs_table ipcs_table = { 0 };
 
-static void
+void
 ipcs_init(void)
 {
     if (ipcs_table.init_flags & IPCS_INIT) {
@@ -119,6 +117,7 @@ ipcs_init(void)
     MEMBER_OFFSET_INIT(idr_layer_layer, "idr_layer", "layer");
     MEMBER_OFFSET_INIT(idr_layer_ary, "idr_layer", "ary");
     MEMBER_OFFSET_INIT(idr_top, "idr", "top");
+    MEMBER_OFFSET_INIT(idr_cur, "idr", "cur");
     MEMBER_OFFSET_INIT(ipc_id_ary_p, "ipc_id_ary", "p");
     MEMBER_OFFSET_INIT(ipc_ids_entries, "ipc_ids", "entries");
     MEMBER_OFFSET_INIT(ipc_ids_max_id, "ipc_ids", "max_id");
@@ -188,7 +187,10 @@ ipcs_init(void)
         ipcs_table.shm_f_op_huge_addr = -1;
     }
 
-    if (BITS32())
+    if (VALID_MEMBER(idr_layer_ary) &&
+        get_array_length("idr_layer.ary", NULL, 0) > 64)
+        ipcs_table.idr_bits = 8;
+    else if (BITS32())
         ipcs_table.idr_bits = 5;
     else if (BITS64())
         ipcs_table.idr_bits = 6;
@@ -635,7 +637,7 @@ ipc_search_idr(ulong ipc_ids_p, int specified, ulong specified_value, int (*fn)(
 /*
  * search every idr_layer
  */
-static ulong
+ulong
 idr_find(ulong idp, int id)
 {
     ulong idr_layer_p;

diff --git a/symbols.c b/symbols.c
index bf55319..8ff1430 100644
--- a/symbols.c
+++ b/symbols.c
@@ -10055,6 +10055,8 @@ dump_offset_table(char *spec, ulong makestruct)
         OFFSET(idr_layers));
     fprintf(fp, "                 idr_top: %ld\n",
         OFFSET(idr_top));
+    fprintf(fp, "                 idr_cur: %ld\n",
+        OFFSET(idr_cur));
     fprintf(fp, "            ipc_id_ary_p: %ld\n",
         OFFSET(ipc_id_ary_p));
     fprintf(fp, "         ipc_ids_entries: %ld\n",

commit b21633026a725a10b9394aabc65d8c57d5bbe33a
Author: Dave Anderson
Date:   Tue Jul 10 15:24:38 2018 -0400

    Third phase of support for x86_64 5-level page tables in Linux 4.17
    and later kernels.  With this patch, the usage of 5-level page
    tables is automatically detected on live systems and when running
    against vmcores that contain the new "NUMBER(pgtable_l5_enabled)"
    VMCOREINFO entry.  Without the patch, the "--machdep vm=5level"
    command line option is required.
    (douly.fnst@cn.fujitsu.com, anderson@redhat.com)

diff --git a/x86_64.c b/x86_64.c
index 6d1ae2f..b07d6f2 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -294,25 +294,6 @@ x86_64_init(int when)
             machdep->machspec->pgdir_shift = PGDIR_SHIFT;
             machdep->machspec->ptrs_per_pgd = PTRS_PER_PGD;
             break;
-
-        case VM_5LEVEL:
-            machdep->machspec->userspace_top = USERSPACE_TOP_5LEVEL;
-            machdep->machspec->page_offset = PAGE_OFFSET_5LEVEL;
-            machdep->machspec->vmalloc_start_addr = VMALLOC_START_ADDR_5LEVEL;
-            machdep->machspec->vmalloc_end = VMALLOC_END_5LEVEL;
-            machdep->machspec->modules_vaddr = MODULES_VADDR_5LEVEL;
-            machdep->machspec->modules_end = MODULES_END_5LEVEL;
-            machdep->machspec->vmemmap_vaddr = VMEMMAP_VADDR_5LEVEL;
-            machdep->machspec->vmemmap_end = VMEMMAP_END_5LEVEL;
-            if (symbol_exists("vmemmap_populate"))
-                machdep->flags |= VMEMMAP;
-            machdep->machspec->physical_mask_shift = __PHYSICAL_MASK_SHIFT_5LEVEL;
-            machdep->machspec->pgdir_shift = PGDIR_SHIFT_5LEVEL;
-            machdep->machspec->ptrs_per_pgd = PTRS_PER_PGD_5LEVEL;
-            if ((machdep->machspec->p4d = (char *)malloc(PAGESIZE())) == NULL)
-                error(FATAL, "cannot malloc p4d space.");
-            machdep->machspec->last_p4d_read = 0;
-            machdep->uvtop = x86_64_uvtop_level4;  /* 5-level is optional per-task */
         }
         machdep->kvbase = (ulong)PAGE_OFFSET;
         machdep->identity_map_base = (ulong)PAGE_OFFSET;
@@ -346,6 +327,43 @@ x86_64_init(int when)
         break;
 
     case POST_RELOC:
+        /* Check for 5-level paging */
+        if (!(machdep->flags & VM_5LEVEL)) {
+            int l5_enabled = 0;
+            if ((string = pc->read_vmcoreinfo("NUMBER(pgtable_l5_enabled)"))) {
+                l5_enabled = atoi(string);
+                free(string);
+            } else if (kernel_symbol_exists("__pgtable_l5_enabled"))
+                readmem(symbol_value("__pgtable_l5_enabled"), KVADDR,
+                    &l5_enabled, sizeof(int), "__pgtable_l5_enabled",
+                    FAULT_ON_ERROR);
+
+            if (l5_enabled)
+                machdep->flags |= VM_5LEVEL;
+        }
+        if (machdep->flags & VM_5LEVEL) {
+            machdep->machspec->userspace_top = USERSPACE_TOP_5LEVEL;
+            machdep->machspec->page_offset = PAGE_OFFSET_5LEVEL;
+            machdep->machspec->vmalloc_start_addr = VMALLOC_START_ADDR_5LEVEL;
+            machdep->machspec->vmalloc_end = VMALLOC_END_5LEVEL;
+            machdep->machspec->modules_vaddr = MODULES_VADDR_5LEVEL;
+            machdep->machspec->modules_end = MODULES_END_5LEVEL;
+            machdep->machspec->vmemmap_vaddr = VMEMMAP_VADDR_5LEVEL;
+            machdep->machspec->vmemmap_end = VMEMMAP_END_5LEVEL;
+            if (symbol_exists("vmemmap_populate"))
+                machdep->flags |= VMEMMAP;
+            machdep->machspec->physical_mask_shift = __PHYSICAL_MASK_SHIFT_5LEVEL;
+            machdep->machspec->pgdir_shift = PGDIR_SHIFT_5LEVEL;
+            machdep->machspec->ptrs_per_pgd = PTRS_PER_PGD_5LEVEL;
+            if ((machdep->machspec->p4d = (char *)malloc(PAGESIZE())) == NULL)
+                error(FATAL, "cannot malloc p4d space.");
+            machdep->machspec->last_p4d_read = 0;
+            machdep->uvtop = x86_64_uvtop_level4;  /* 5-level is optional per-task */
+            machdep->kvbase = (ulong)PAGE_OFFSET;
+            machdep->identity_map_base = (ulong)PAGE_OFFSET;
+
+        }
+
         /*
          * Check for CONFIG_RANDOMIZE_MEMORY, and set page_offset here.
          * The remainder of the virtual address range setups will get

commit 6596f1121b89162f96d1e1825c2905b83b59bec1
Author: Dave Anderson
Date:   Wed Jul 11 16:25:59 2018 -0400

    The existing "list" command uses a hash table to detect duplicate
    items as it traverses the list.  The hash table approach has worked
    well for many years.  However, with increasing memory sizes and
    list sizes, the overhead of the hash table can be substantial, and
    commands can run for a very long time.  For large lists, the
    existing hash-based approach may slow the system to a crawl and
    possibly never complete.  The hash can be turned off with "set
    hash off", but then there is no loop detection at all, and loop
    detection must be done manually, for example after dumping the
    list to disk.  This patch implements the cycle detection algorithm
    from R. P. Brent as an alternative for the "list" command.  The
    algorithm avoids the overhead of the hash table but is still able
    to detect a loop, and it also reports further loop characteristics,
    namely the distance to the start of the loop and the loop length.
    An excellent description of the algorithm can be found on the
    crash-utility mailing list:

      https://www.redhat.com/archives/crash-utility/2018-July/msg00019.html

    A new "list -B" option invokes this algorithm instead of the hash
    table.  Besides the low memory usage, the output is slightly
    different when a loop is detected: in addition to the first
    duplicate entry, the length of the loop and the distance to the
    start of the loop are printed.
    (dwysocha@redhat.com)

diff --git a/defs.h b/defs.h
index b05aecc..5af82be 100644
--- a/defs.h
+++ b/defs.h
@@ -2491,6 +2491,7 @@ struct list_data { /* generic structure used by do_list() to walk */
 #define CALLBACK_RETURN   (VERBOSE << 12)
 #define LIST_PARSE_MEMBER (VERBOSE << 13)
 #define LIST_READ_MEMBER  (VERBOSE << 14)
+#define LIST_BRENT_ALGO   (VERBOSE << 15)
 
 struct tree_data {
     ulong flags;
@@ -4944,6 +4945,7 @@ char *shift_string_right(char *, int);
 int bracketed(char *, char *, int);
 void backspace(int);
 int do_list(struct list_data *);
+int do_list_no_hash(struct list_data *);
 struct radix_tree_ops {
     void (*entry)(ulong node, ulong slot, const char *path,
               ulong index, void *private);

diff --git a/help.c b/help.c
index 638c6ec..54bf9b4 100644
--- a/help.c
+++ b/help.c
@@ -5724,7 +5724,7 @@ char *help__list[] = {
 "list",
 "linked list",
 "[[-o] offset][-e end][-[s|S] struct[.member[,member] [-l offset]] -[x|d]]"
-"\n     [-r|-h|-H] start",
+"\n     [-r|-B] [-h|-H] start",
 "  ",
 "  This command dumps the contents of a linked list.  The entries in a linked",
 "  list are typically data structures that are tied together in one of two",
@@ -5822,6 +5822,12 @@ char *help__list[] = {
 "       -r  For a list linked with list_head structures, traverse the list",
 "           in the reverse order by using the \"prev\" pointer instead",
 "           of \"next\".",
+"       -B  Use the algorithm from R. P. Brent to detect loops instead of",
+"           using a hash table.  This algorithm uses a tiny fixed amount of",
+"           memory and so is especially helpful for longer lists.  The output",
+"           is slightly different than the normal list output as it will",
+"           print the length of the loop, the start of the loop, and the",
+"           first duplicate in the list.",
 "  ",
 "  The meaning of the \"start\" argument, which can be expressed symbolically,",
 "  in hexadecimal format, or an expression evaluating to an address, depends",

diff --git a/tools.c b/tools.c
index 1a83643..634aec6 100644
--- a/tools.c
+++ b/tools.c
@@ -3266,9 +3266,12 @@ cmd_list(void)
     BZERO(ld, sizeof(struct list_data));
     struct_list_offset = 0;
 
-    while ((c = getopt(argcnt, args, "Hhrs:S:e:o:xdl:")) != EOF) {
+    while ((c = getopt(argcnt, args, "BHhrs:S:e:o:xdl:")) != EOF) {
         switch(c)
         {
+        case 'B':
+            ld->flags |= LIST_BRENT_ALGO;
+            break;
         case 'H':
             ld->flags |= LIST_HEAD_FORMAT;
             ld->flags |= LIST_HEAD_POINTER;
@@ -3516,9 +3519,13 @@ next_arg:
         ld->flags &= ~(LIST_OFFSET_ENTERED|LIST_START_ENTERED);
         ld->flags |= VERBOSE;
 
-        hq_open();
-        c = do_list(ld);
-        hq_close();
+        if (ld->flags & LIST_BRENT_ALGO)
+            c = do_list_no_hash(ld);
+        else {
+            hq_open();
+            c = do_list(ld);
+            hq_close();
+        }
 
         if (ld->structname_args)
             FREEBUF(ld->structname);
@@ -3862,6 +3869,283 @@ do_list(struct list_data *ld)
     return count;
 }
 
+static void
+do_list_debug_entry(struct list_data *ld)
+{
+    int i, others;
+
+    if (CRASHDEBUG(1)) {
+        others = 0;
+        console("             flags: %lx (", ld->flags);
+        if (ld->flags & VERBOSE)
+            console("%sVERBOSE", others++ ? "|" : "");
+        if (ld->flags & LIST_OFFSET_ENTERED)
+            console("%sLIST_OFFSET_ENTERED", others++ ? "|" : "");
+        if (ld->flags & LIST_START_ENTERED)
+            console("%sLIST_START_ENTERED", others++ ? "|" : "");
+        if (ld->flags & LIST_HEAD_FORMAT)
+            console("%sLIST_HEAD_FORMAT", others++ ? "|" : "");
+        if (ld->flags & LIST_HEAD_POINTER)
+            console("%sLIST_HEAD_POINTER", others++ ? "|" : "");
+        if (ld->flags & RETURN_ON_DUPLICATE)
+            console("%sRETURN_ON_DUPLICATE", others++ ? "|" : "");
+        if (ld->flags & RETURN_ON_LIST_ERROR)
+            console("%sRETURN_ON_LIST_ERROR", others++ ? "|" : "");
+        if (ld->flags & RETURN_ON_LIST_ERROR)
+            console("%sRETURN_ON_LIST_ERROR", others++ ? "|" : "");
+        if (ld->flags & LIST_STRUCT_RADIX_10)
+            console("%sLIST_STRUCT_RADIX_10", others++ ? "|" : "");
+        if (ld->flags & LIST_STRUCT_RADIX_16)
+            console("%sLIST_STRUCT_RADIX_16", others++ ? "|" : "");
+        if (ld->flags & LIST_ALLOCATE)
+            console("%sLIST_ALLOCATE", others++ ? "|" : "");
+        if (ld->flags & LIST_CALLBACK)
+            console("%sLIST_CALLBACK", others++ ? "|" : "");
+        if (ld->flags & CALLBACK_RETURN)
+            console("%sCALLBACK_RETURN", others++ ? "|" : "");
+        console(")\n");
+        console("             start: %lx\n", ld->start);
+        console("     member_offset: %ld\n", ld->member_offset);
+        console("  list_head_offset: %ld\n", ld->list_head_offset);
+        console("               end: %lx\n", ld->end);
+        console("         searchfor: %lx\n", ld->searchfor);
+        console("   structname_args: %lx\n", ld->structname_args);
+        if (!ld->structname_args)
+            console("        structname: (unused)\n");
+        for (i = 0; i < ld->structname_args; i++)
+            console("    structname[%d]: %s\n", i, ld->structname[i]);
+        console("            header: %s\n", ld->header);
+        console("          list_ptr: %lx\n", (ulong)ld->list_ptr);
+        console("     callback_func: %lx\n", (ulong)ld->callback_func);
+        console("     callback_data: %lx\n", (ulong)ld->callback_data);
+        console("struct_list_offset: %lx\n", ld->struct_list_offset);
+    }
+}
+
+static void
+do_list_output_struct(struct list_data *ld, ulong next, ulong offset,
+              unsigned int radix, struct req_entry **e)
+{
+    int i;
+
+    for (i = 0; i < ld->structname_args; i++) {
+        switch (count_chars(ld->structname[i], '.'))
+        {
+        case 0:
+            dump_struct(ld->structname[i],
+                next - offset, radix);
+            break;
+        default:
+            if (ld->flags & LIST_PARSE_MEMBER)
+                dump_struct_members(ld, i, next);
+            else if (ld->flags & LIST_READ_MEMBER)
+                dump_struct_members_fast(e[i],
+                    radix, next - offset);
+            break;
+        }
+    }
+}
+
+static int
+do_list_no_hash_readmem(struct list_data *ld, ulong *next_ptr,
+            ulong readflag)
+{
+    if (!readmem(*next_ptr + ld->member_offset, KVADDR, next_ptr,
+        sizeof(void *), "list entry", readflag)) {
+        error(INFO, "\ninvalid list entry: %lx\n", *next_ptr);
+        return -1;
+    }
+    return 0;
+}
+
+static ulong brent_x; /* tortoise */
+static ulong brent_y; /* hare */
+static ulong brent_r; /* power */
+static ulong brent_lambda; /* loop length */
+static ulong brent_mu; /* distance to start of loop */
+static ulong brent_loop_detect;
+static ulong brent_loop_exit;
+/*
+ * 'ptr': representative of x or y; modified on return
+ */
+static int
+brent_f(ulong *ptr, struct list_data *ld, ulong readflag)
+{
+    return do_list_no_hash_readmem(ld, ptr, readflag);
+}
+
+/*
+ * Similar to do_list() but without the hash_table or LIST_ALLOCATE.
+ * Useful for the 'list' command and other callers needing faster list
+ * enumeration.
+ */
+int
+do_list_no_hash(struct list_data *ld)
+{
+    ulong next, last, first, offset;
+    ulong searchfor, readflag;
+    int i, count, ret;
+    unsigned int radix;
+    struct req_entry **e = NULL;
+
+    do_list_debug_entry(ld);
+
+    count = 0;
+    searchfor = ld->searchfor;
+    ld->searchfor = 0;
+    if (ld->flags & LIST_STRUCT_RADIX_10)
+        radix = 10;
+    else if (ld->flags & LIST_STRUCT_RADIX_16)
+        radix = 16;
+    else
+        radix = 0;
+    next = ld->start;
+
+    readflag = ld->flags & RETURN_ON_LIST_ERROR ?
+        (RETURN_ON_ERROR|QUIET) : FAULT_ON_ERROR;
+
+    if (!readmem(next + ld->member_offset, KVADDR, &first, sizeof(void *),
+        "first list entry", readflag)) {
+        error(INFO, "\ninvalid list entry: %lx\n", next);
+        return -1;
+    }
+
+    if (ld->header)
+        fprintf(fp, "%s", ld->header);
+
+    offset = ld->list_head_offset + ld->struct_list_offset;
+
+    if (ld->structname && (ld->flags & LIST_READ_MEMBER)) {
+        e = (struct req_entry **)GETBUF(sizeof(*e) * ld->structname_args);
+        for (i = 0; i < ld->structname_args; i++)
+            e[i] = fill_member_offsets(ld->structname[i]);
+    }
+
+    brent_loop_detect = brent_loop_exit = 0;
+    brent_lambda = 0;
+    brent_r = 2;
+    brent_x = brent_y = next;
+    ret = brent_f(&brent_y, ld, readflag);
+    if (ret == -1)
+        return -1;
+    while (1) {
+        if (!brent_loop_detect && ld->flags & VERBOSE) {
+            fprintf(fp, "%lx\n", next - ld->list_head_offset);
+            if (ld->structname) {
+                do_list_output_struct(ld, next, offset, radix, e);
+            }
+        }
+
+        if (next && brent_loop_exit) {
+            if (ld->flags &
+                (RETURN_ON_DUPLICATE|RETURN_ON_LIST_ERROR)) {
+                error(INFO, "\nduplicate list entry: %lx\n",
+                    brent_x);
+                return -1;
+            }
+            error(FATAL, "\nduplicate list entry: %lx\n", brent_x);
+        }
+
+        if ((searchfor == next) ||
+            (searchfor == (next - ld->list_head_offset)))
+            ld->searchfor = searchfor;
+
+        count++;
+        last = next;
+
+        if ((ld->flags & LIST_CALLBACK) &&
+            ld->callback_func((void *)(next - ld->list_head_offset),
+            ld->callback_data) && (ld->flags & CALLBACK_RETURN))
+            break;
+
+        ret = do_list_no_hash_readmem(ld, &next, readflag);
+        if (ret == -1)
+            return -1;
+
+        if (!brent_loop_detect) {
+            if (brent_x == brent_y) {
+                brent_loop_detect = 1;
+                error(INFO, "loop detected, loop length: %lx\n",
+                    brent_lambda);
+                /* reset x and y to start; advance y loop length */
+                brent_mu = 0;
+                brent_x = brent_y = ld->start;
+                while (brent_lambda--) {
+                    ret = brent_f(&brent_y, ld, readflag);
+                    if (ret == -1)
+                        return -1;
+                }
+            } else {
+                if (brent_r == brent_lambda) {
+                    brent_x = brent_y;
+                    brent_r *= 2;
+                    brent_lambda = 0;
+                }
+                brent_y = next;
+                brent_lambda++;
+            }
+        } else {
+            if (!brent_loop_exit && brent_x == brent_y) {
+                brent_loop_exit = 1;
+                error(INFO, "length from start to loop: %lx",
+                    brent_mu);
+            } else {
+                ret = brent_f(&brent_x, ld, readflag);
+                if (ret == -1)
+                    return -1;
+                ret = brent_f(&brent_y, ld, readflag);
+                if (ret == -1)
+                    return -1;
+                brent_mu++;
+            }
+        }
+
+        if (next == 0) {
+            if (ld->flags & LIST_HEAD_FORMAT) {
+                error(INFO, "\ninvalid list entry: 0\n");
+                return -1;
+            }
+            if (CRASHDEBUG(1))
+                console("do_list end: next:%lx\n", next);
+
+            break;
+        }
+
+        if (next == ld->end) {
+            if (CRASHDEBUG(1))
+                console("do_list end: next:%lx == end:%lx\n",
+                    next, ld->end);
+            break;
+        }
+
+        if (next == ld->start) {
+            if (CRASHDEBUG(1))
+                console("do_list end: next:%lx == start:%lx\n",
+                    next, ld->start);
+            break;
+        }
+
+        if (next == last) {
+            if (CRASHDEBUG(1))
+                console("do_list end: next:%lx == last:%lx\n",
+                    next, last);
+            break;
+        }
+
+        if ((next == first) && (count != 1)) {
+            if (CRASHDEBUG(1))
+                console("do_list end: next:%lx == first:%lx (count %d)\n",
+                    next, last, count);
+            break;
+        }
+    }
+
+    if (CRASHDEBUG(1))
+        console("do_list count: %d\n", count);
+
+    return count;
+}
+
 /*
  * Issue a dump_struct_member() call for one or more structure
  * members.  Multiple members are passed in a comma-separated

commit 582f8b1ea4bb843f996d5285288e7a12b519ee73
Author: Dave Anderson
Date:   Thu Jul 12 14:06:22 2018 -0400

    Fix for the x86_64 "bt" command to prevent an in-kernel exception
    frame from going unrecognized and therefore not displayed.
    Without the patch, if the RIP in a pt_regs structure on the stack
    is not a kernel text address, such as a NULL pointer, it is not
    recognized as an exception frame and the register set is not
    displayed.
    (anderson@redhat.com)

diff --git a/x86_64.c b/x86_64.c
index b07d6f2..15800fb 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -4662,6 +4662,8 @@ x86_64_eframe_verify(struct bt_info *bt, long kvaddr, long cs, long ss,
             STREQ(sp->name, "page_fault"))
             return TRUE;
 
+        if ((kvaddr + SIZE(pt_regs)) == rsp)
+            return TRUE;
     }
 
     if ((cs == 0x10) && kvaddr) {
@@ -8393,7 +8395,7 @@ x86_64_get_framesize(struct bt_info *bt, ulong textaddr, ulong rsp)
                 *p1 = NULLCHAR;
             p2 = arglist[arg];
             reterror = 0;
-            offset = htol(p2+1, RETURN_ON_ERROR, &reterror);
+            offset = htol(p2+1, RETURN_ON_ERROR|QUIET, &reterror);
             if (reterror)
                 continue;
             framesize += offset;

commit 528849c15a02d9162c2dd17f4551390763711ad5
Author: Dave Anderson
Date:   Fri Jul 13 13:50:44 2018 -0400

    Fix for the "repeat" command when the argument consists of an input
    file construct, for example, "repeat -1 < input_file".  Without the
    patch, only the first command line in the input file is executed
    each time.
    (anderson@redhat.com)

diff --git a/cmdline.c b/cmdline.c
index aab37ce..ee08f06 100644
--- a/cmdline.c
+++ b/cmdline.c
@@ -1,8 +1,8 @@
 /* cmdline.c - core analysis suite
  *
  * Copyright (C) 1999, 2000, 2001, 2002 Mission Critical Linux, Inc.
- * Copyright (C) 2002-2015,2017 David Anderson
- * Copyright (C) 2002-2015,2017 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2002-2015,2018 David Anderson
+ * Copyright (C) 2002-2015,2018 Red Hat, Inc. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -2324,6 +2324,9 @@ is_args_input_file(struct command_table_entry *ct, struct args_input_file *aif)
     if (pc->curcmd_flags & NO_MODIFY)
         return FALSE;
 
+    if (STREQ(ct->name, "repeat"))
+        return FALSE;
+
     BZERO(aif, sizeof(struct args_input_file));
     retval = FALSE;
 

commit 61fcad549faa479e6831d5283387f8f2e4ec9202
Author: Dave Anderson
Date:   Mon Jul 16 09:32:57 2018 -0400

    Fourth phase of support for x86_64 5-level page tables in Linux
    4.17 and later kernels.  This patch adds support for user virtual
    address translation when the kernel is configured with
    CONFIG_X86_5LEVEL.
    (douly.fnst@cn.fujitsu.com)

diff --git a/x86_64.c b/x86_64.c
index 15800fb..4073acb 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -24,7 +24,6 @@ static int x86_64_uvtop(struct task_context *, ulong, physaddr_t *, int);
 static int x86_64_uvtop_level4(struct task_context *, ulong, physaddr_t *, int);
 static int x86_64_uvtop_level4_xen_wpt(struct task_context *, ulong, physaddr_t *, int);
 static int x86_64_uvtop_level4_rhel4_xen_wpt(struct task_context *, ulong, physaddr_t *, int);
-static int x86_64_task_uses_5level(struct task_context *);
 static ulong x86_64_vmalloc_start(void);
 static int x86_64_is_task_addr(ulong);
 static int x86_64_verify_symbol(const char *, ulong, char);
@@ -341,6 +340,7 @@ x86_64_init(int when)
             if (l5_enabled)
                 machdep->flags |= VM_5LEVEL;
         }
+
         if (machdep->flags & VM_5LEVEL) {
             machdep->machspec->userspace_top = USERSPACE_TOP_5LEVEL;
             machdep->machspec->page_offset = PAGE_OFFSET_5LEVEL;
@@ -361,7 +361,6 @@ x86_64_init(int when)
             machdep->uvtop = x86_64_uvtop_level4;  /* 5-level is optional per-task */
             machdep->kvbase = (ulong)PAGE_OFFSET;
             machdep->identity_map_base = (ulong)PAGE_OFFSET;
-
         }
 
         /*
@@ -812,7 +811,7 @@ x86_64_dump_machdep_table(ulong arg)
     else if (machdep->uvtop == x86_64_uvtop_level4) {
         fprintf(fp, "              uvtop: x86_64_uvtop_level4()");
         if (machdep->flags & VM_5LEVEL)
-            fprintf(fp, " or x86_64_uvtop_5level()");
+            fprintf(fp, " (uses 5-level page tables)");
         fprintf(fp, "\n");
     } else if (machdep->uvtop == x86_64_uvtop_level4_xen_wpt)
         fprintf(fp, "              uvtop: x86_64_uvtop_level4_xen_wpt()\n");
@@ -1915,7 +1914,7 @@ x86_64_uvtop_level4(struct task_context *tc, ulong uvaddr, physaddr_t *paddr, in
         goto no_upage;
 
     /* If the VM is in 5-level page table */
-    if (machdep->flags & VM_5LEVEL && x86_64_task_uses_5level(tc)) {
+    if (machdep->flags & VM_5LEVEL) {
         ulong p4d_pte;
         /*
          *  p4d = p4d_offset(pgd, address);
@@ -1987,12 +1986,6 @@ no_upage:
 }
 
 static int
-x86_64_task_uses_5level(struct task_context *tc)
-{
-    return FALSE;
-}
-
-static int
 x86_64_uvtop_level4_xen_wpt(struct task_context *tc, ulong uvaddr, physaddr_t *paddr, int verbose)
 {
     ulong pgd_pte;

commit eb823b79385f61be97411a06dd57b6fc0973d280
Author: Dave Anderson
Date:   Mon Jul 16 10:50:19 2018 -0400

    Fix to prevent an unnecessary "read error" message during session
    initialization on live systems running a kernel that is configured
    with CONFIG_X86_5LEVEL.  Without the patch, a message indicating
    "crash: read error: kernel virtual address: <address>  type:
    __pgtable_l5_enabled" will be displayed if /proc/kcore gets
    selected as the live memory source after /dev/mem is determined
    to be unusable.
    (anderson@redhat.com)

diff --git a/x86_64.c b/x86_64.c
index 4073acb..0574041 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -335,7 +335,7 @@ x86_64_init(int when)
             } else if (kernel_symbol_exists("__pgtable_l5_enabled"))
                 readmem(symbol_value("__pgtable_l5_enabled"), KVADDR,
                     &l5_enabled, sizeof(int), "__pgtable_l5_enabled",
-                    FAULT_ON_ERROR);
+                    QUIET|FAULT_ON_ERROR);
 
             if (l5_enabled)
                 machdep->flags |= VM_5LEVEL;
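
A note on the Brent cycle-detection patch (commit 6596f112 above): in
do_list_no_hash() the algorithm is interleaved with entry printing,
readmem() error handling, and the usual list-termination checks, which
makes the underlying two-phase structure hard to see.  The standalone C
sketch below shows just the algorithm.  It is an editor's illustration,
not crash code: the names "node", "f" and "brent" are invented here, and
the sketch assumes the list really does contain a cycle, whereas the
real command must also handle NULL-, start- and end-terminated lists.

#include <stdio.h>

struct node { struct node *next; };

/* One application of the successor function; in crash this is a
 * readmem() of the embedded "next" pointer. */
static struct node *f(struct node *p) { return p->next; }

static void
brent(struct node *head, unsigned long *lambda, unsigned long *mu)
{
    unsigned long power = 1, lam = 1, m = 0;
    struct node *tortoise = head;
    struct node *hare = f(head);        /* hare starts one step ahead */

    /* Phase 1: park the tortoise and let the hare run inside a
     * power-of-two window; when they meet, lam is the loop length. */
    while (tortoise != hare) {
        if (power == lam) {             /* window exhausted: */
            tortoise = hare;            /* re-park at the hare */
            power *= 2;
            lam = 0;
        }
        hare = f(hare);
        lam++;
    }

    /* Phase 2: restart both from the head with the hare lam steps
     * ahead; they first meet at the loop entry, m steps in. */
    tortoise = hare = head;
    for (unsigned long i = 0; i < lam; i++)
        hare = f(hare);
    while (tortoise != hare) {
        tortoise = f(tortoise);
        hare = f(hare);
        m++;
    }

    *lambda = lam;
    *mu = m;
}

int
main(void)
{
    /* 10-node list whose tail points back to node 4:
     * distance to loop mu = 4, loop length lambda = 6. */
    struct node n[10];
    unsigned long lambda, mu;
    int i;

    for (i = 0; i < 9; i++)
        n[i].next = &n[i + 1];
    n[9].next = &n[4];

    brent(&n[0], &lambda, &mu);
    printf("loop length: %lu, distance to loop: %lu\n", lambda, mu);
    return 0;
}

The point of the power-of-two window is that only two traversal
pointers are ever saved, so loop detection costs O(mu + lambda) pointer
reads and O(1) memory; that is what replaces the per-entry hash-table
insertions of do_list() and motivates the "tiny fixed amount of memory"
wording in the -B help text.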
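
Similarly, for the bpf IDR patch (commit f294197b above): bpf_init()
drives both do_radix_tree() and the new do_old_idr() through the same
two-pass calling convention -- a COUNT pass sizes the result, the
caller allocates count+1 pairs and stashes the capacity in element
[0].index, and a GATHER pass fills the array.  The sketch below is a
hypothetical, self-contained rendering of that convention against a toy
in-memory slot table; none of these names are crash internals, and the
real do_old_idr() probes ids 0..idr.cur with idr_find() via readmem()
instead of reading a local array.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for crash's struct radix_tree_pair: an index/value pair. */
struct pair {
    unsigned long index;
    void *value;
};

#define COUNT  1
#define GATHER 2

/* Toy "IDR": a sparse slot table; NULL means the id is unused. */
static void *slots[16];

static int
old_idr(int cmd, struct pair *rtp)
{
    int i, max, next_id, total = 0;

    switch (cmd) {
    case COUNT:
        for (next_id = 0; next_id < 16; next_id++)
            if (slots[next_id])
                total++;
        break;
    case GATHER:
        max = rtp[0].index;             /* capacity stashed by the caller */
        for (i = total = next_id = 0; next_id < 16; next_id++) {
            if (!slots[next_id])
                continue;
            total++;
            rtp[i].index = next_id;
            rtp[i].value = slots[next_id];
            if (++i == max)
                break;
        }
        break;
    }
    return total;
}

int
main(void)
{
    struct pair *list;
    int n, i;

    slots[3] = "prog A";
    slots[7] = "prog B";
    slots[12] = "prog C";

    /* Pass 1: count.  Then allocate count+1 pairs, stash the capacity
     * in [0].index, and gather -- bpf_init()'s calling convention. */
    n = old_idr(COUNT, NULL);
    list = calloc(n + 1, sizeof(*list));
    list[0].index = n;
    n = old_idr(GATHER, list);

    for (i = 0; i < n; i++)
        printf("id %lu -> %s\n", list[i].index, (char *)list[i].value);

    free(list);
    return 0;
}

The same shape appears twice in bpf_init(), once for prog_idr and once
for map_idr; only the probe differs (a radix-tree walk when idr.idr_rt
exists, idr_find() otherwise), which is why the patch can swap in the
old-IDR traversal without touching the surrounding bookkeeping.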