f57669
Description: Makes trimming work consistently across arenas.
f57669
Author: Mel Gorman <mgorman@suse.de>
f57669
Origin: git://sourceware.org/git/glibc.git
f57669
Bug-RHEL: N/A
f57669
Bug-Fedora: N/A
f57669
Bug-Upstream: #17195
f57669
Upstream status: committed
f57669
f57669
Part of commit 8a35c3fe122d49ba76dff815b3537affb5a50b45 is also included
f57669
to allow the use of ALIGN_UP within malloc/arena.c.
f57669
f57669
commit c26efef9798914e208329c0e8c3c73bb1135d9e3
f57669
Author: Mel Gorman <mgorman@suse.de>
f57669
Date:   Thu Apr 2 12:14:14 2015 +0530
f57669
f57669
    malloc: Consistently apply trim_threshold to all heaps [BZ #17195]
f57669
    
f57669
    Trimming heaps is a balance between saving memory and the system overhead
f57669
    required to update page tables and discard allocated pages. The malloc
f57669
    option M_TRIM_THRESHOLD is a tunable that users are meant to use to decide
f57669
    where this balance point is but it is only applied to the main arena.
f57669
    
f57669
    For scalability reasons, glibc malloc has per-thread heaps but these are
f57669
    shrunk with madvise() if there is one page free at the top of the heap.
f57669
    In some circumstances this can lead to high system overhead if a thread
f57669
    has a control flow like
f57669
    
f57669
        while (data_to_process) {
f57669
            buf = malloc(large_size);
f57669
            do_stuff();
f57669
            free(buf);
f57669
        }
f57669
    
f57669
    For a large size, the free() will call madvise (pagetable teardown, page
f57669
    free and TLB flush) every time followed immediately by a malloc (fault,
f57669
    kernel page alloc, zeroing and charge accounting). The kernel overhead
f57669
    can dominate such a workload.
f57669
    
f57669
    This patch allows the user to tune when madvise gets called by applying
f57669
    the trim threshold to the per-thread heaps and using similar logic to the
f57669
    main arena when deciding whether to shrink. Alternatively if the dynamic
f57669
    brk/mmap threshold gets adjusted then the new values will be obeyed by
f57669
    the per-thread heaps.
f57669
    
f57669
    Bug 17195 was a test case motivated by a problem encountered in scientific
f57669
    applications written in python that performed badly due to high page fault
f57669
    overhead. The basic operation of such a program was posted by Julian Taylor
f57669
    https://sourceware.org/ml/libc-alpha/2015-02/msg00373.html
f57669
    
f57669
    With this patch applied, the overhead is eliminated. All numbers in this
f57669
    report are in seconds and were recorded by running Julian's program 30
f57669
    times.
f57669
    
f57669
    pyarray
f57669
                                     glibc               madvise
f57669
                                      2.21                    v2
f57669
    System  min             1.81 (  0.00%)        0.00 (100.00%)
f57669
    System  mean            1.93 (  0.00%)        0.02 ( 99.20%)
f57669
    System  stddev          0.06 (  0.00%)        0.01 ( 88.99%)
f57669
    System  max             2.06 (  0.00%)        0.03 ( 98.54%)
f57669
    Elapsed min             3.26 (  0.00%)        2.37 ( 27.30%)
f57669
    Elapsed mean            3.39 (  0.00%)        2.41 ( 28.84%)
f57669
    Elapsed stddev          0.14 (  0.00%)        0.02 ( 82.73%)
f57669
    Elapsed max             4.05 (  0.00%)        2.47 ( 39.01%)
f57669
    
f57669
                   glibc     madvise
f57669
                    2.21          v2
f57669
    User          141.86      142.28
f57669
    System         57.94        0.60
f57669
    Elapsed       102.02       72.66
f57669
    
f57669
    Note that almost a minute's worth of system time is eliminated and the
f57669
    program completes 28% faster on average.
f57669
    
f57669
    To illustrate the problem without python this is a basic test-case for
f57669
    the worst case scenario where every free is a madvise followed by an alloc
f57669
    
f57669
    /* gcc bench-free.c -lpthread -o bench-free */
f57669
    static int num = 1024;
f57669
    
f57669
    void __attribute__((noinline,noclone)) dostuff (void *p)
f57669
    {
f57669
    }
f57669
    
f57669
    void *worker (void *data)
f57669
    {
f57669
      int i;
f57669
    
f57669
      for (i = num; i--;)
f57669
        {
f57669
          void *m = malloc (48*4096);
f57669
          dostuff (m);
f57669
          free (m);
f57669
        }
f57669
    
f57669
      return NULL;
f57669
    }
f57669
    
f57669
    int main()
f57669
    {
f57669
      int i;
f57669
      pthread_t t;
f57669
      void *ret;
f57669
      if (pthread_create (&t, NULL, worker, NULL))
f57669
        exit (2);
f57669
      if (pthread_join (t, &ret))
f57669
        exit (3);
f57669
      return 0;
f57669
    }
f57669
    
f57669
    Before the patch, this resulted in 1024 calls to madvise. With the patch applied,
f57669
    madvise is called twice because the default trim threshold is high enough to avoid
f57669
    this.
f57669
    
f57669
    This is a more complex case where there is a mix of frees. It's simply a different worker
f57669
    function for the test case above
f57669
    
f57669
    void *worker (void *data)
f57669
    {
f57669
      int i;
f57669
      int j = 0;
f57669
      void *free_index[num];
f57669
    
f57669
      for (i = num; i--;)
f57669
        {
f57669
          void *m = malloc ((i % 58) *4096);
f57669
          dostuff (m);
f57669
          if (i % 2 == 0) {
f57669
            free (m);
f57669
          } else {
f57669
            free_index[j++] = m;
f57669
          }
f57669
        }
f57669
      for (; j >= 0; j--)
f57669
        {
f57669
          free(free_index[j]);
f57669
        }
f57669
    
f57669
      return NULL;
f57669
    }
f57669
    
f57669
    glibc 2.21 calls malloc 90305 times but with the patch applied, it's
f57669
    called 13438. Increasing the trim threshold will decrease the number of
f57669
    times it's called with the option of eliminating the overhead.
f57669
    
f57669
    ebizzy is meant to generate a workload resembling common web application
f57669
    server workloads. It is threaded with a large working set that at its core
f57669
    has an allocation, do_stuff, free loop that also hits this case. The primary
f57669
    metric of the benchmark is records processed per second. This is running on
f57669
    my desktop which is a single socket machine with an I7-4770 and 8 cores.
f57669
    Each thread count was run for 30 seconds. It was only run once as the
f57669
    performance difference is so high that the variation is insignificant.
f57669
    
f57669
                    glibc 2.21              patch
f57669
    threads 1            10230              44114
f57669
    threads 2            19153              84925
f57669
    threads 4            34295             134569
f57669
    threads 8            51007             183387
f57669
    
f57669
    Note that the saving happens to be a coincidence as the size allocated
f57669
    by ebizzy was less than the default threshold. If a different number of
f57669
    chunks were specified then it may also be necessary to tune the threshold
f57669
    to compensate
f57669
    
f57669
    This is roughly quadrupling the performance of this benchmark. The difference in
f57669
    system CPU usage illustrates why.
f57669
    
f57669
    ebizzy running 1 thread with glibc 2.21
f57669
    10230 records/s 306904
f57669
    real 30.00 s
f57669
    user  7.47 s
f57669
    sys  22.49 s
f57669
    
f57669
    22.49 seconds was spent in the kernel for a workload running 30 seconds. With the
f57669
    patch applied
f57669
    
f57669
    ebizzy running 1 thread with patch applied
f57669
    44126 records/s 1323792
f57669
    real 30.00 s
f57669
    user 29.97 s
f57669
    sys   0.00 s
f57669
    
f57669
    system CPU usage was zero with the patch applied. strace shows that glibc
f57669
    running this workload calls madvise approximately 9000 times a second. With
f57669
    the patch applied madvise was called twice during the workload (or 0.06
f57669
    times per second).
f57669
    
f57669
    2015-02-10  Mel Gorman  <mgorman@suse.de>
f57669
    
f57669
      [BZ #17195]
f57669
      * malloc/arena.c (free): Apply trim threshold to per-thread heaps
f57669
        as well as the main arena.
f57669
f57669
Index: glibc-2.17-c758a686/malloc/arena.c
f57669
===================================================================
f57669
--- glibc-2.17-c758a686.orig/malloc/arena.c
f57669
+++ glibc-2.17-c758a686/malloc/arena.c
f57669
@@ -661,7 +661,7 @@ heap_trim(heap_info *heap, size_t pad)
f57669
   unsigned long pagesz = GLRO(dl_pagesize);
f57669
   mchunkptr top_chunk = top(ar_ptr), p, bck, fwd;
f57669
   heap_info *prev_heap;
f57669
-  long new_size, top_size, extra, prev_size, misalign;
f57669
+  long new_size, top_size, top_area, extra, prev_size, misalign;
f57669
 
f57669
   /* Can this heap go away completely? */
f57669
   while(top_chunk == chunk_at_offset(heap, sizeof(*heap))) {
f57669
@@ -695,9 +695,16 @@ heap_trim(heap_info *heap, size_t pad)
f57669
     set_head(top_chunk, new_size | PREV_INUSE);
f57669
     /*check_chunk(ar_ptr, top_chunk);*/
f57669
   }
f57669
+
f57669
+  /* Uses similar logic for per-thread arenas as the main arena with systrim
f57669
+     by preserving the top pad and at least a page.  */
f57669
   top_size = chunksize(top_chunk);
f57669
-  extra = (top_size - pad - MINSIZE - 1) & ~(pagesz - 1);
f57669
-  if(extra < (long)pagesz)
f57669
+  top_area = top_size - MINSIZE - 1;
f57669
+  if (top_area <= pad)
f57669
+    return 0;
f57669
+
f57669
+  extra = ALIGN_DOWN(top_area - pad, pagesz);
f57669
+  if ((unsigned long) extra < mp_.trim_threshold)
f57669
     return 0;
f57669
   /* Try to shrink. */
f57669
   if(shrink_heap(heap, extra) != 0)
f57669
Index: glibc-2.17-c758a686/malloc/malloc.c
f57669
===================================================================
f57669
--- glibc-2.17-c758a686.orig/malloc/malloc.c
f57669
+++ glibc-2.17-c758a686/malloc/malloc.c
f57669
@@ -236,6 +236,8 @@
f57669
 /* For va_arg, va_start, va_end.  */
f57669
 #include <stdarg.h>
f57669
 
f57669
+/* For ALIGN_UP.  */
f57669
+#include <libc-internal.h>
f57669
 
f57669
 /*
f57669
   Debugging: