mm/slub.c

   1 /*
   2  * SLUB: A slab allocator that limits cache line use instead of queuing
   3  * objects in per cpu and per node lists.
   4  *
   5  * The allocator synchronizes using per slab locks and only
   6  * uses a centralized lock to manage a pool of partial slabs.
   7  *
   8  * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
   9  */
  10
  11 #include <linux/mm.h>
  12 #include <linux/module.h>
  13 #include <linux/bit_spinlock.h>
  14 #include <linux/interrupt.h>
  15 #include <linux/bitops.h>
  16 #include <linux/slab.h>
  17 #include <linux/seq_file.h>
  18 #include <linux/cpu.h>
  19 #include <linux/cpuset.h>
  20 #include <linux/mempolicy.h>
  21 #include <linux/ctype.h>
  22 #include <linux/kallsyms.h>
  23
  24 /*
  25  * Lock order:
  26  *   1. slab_lock(page)
  27  *   2. slab->list_lock
  28  *
  29  *   The slab_lock protects operations on the object of a particular
  30  *   slab and its metadata in the page struct. If the slab lock
  31  *   has been taken then no allocations nor frees can be performed
  32  *   on the objects in the slab nor can the slab be added or removed
  33  *   from the partial or full lists since this would mean modifying
  34  *   the page_struct of the slab.
  35  *
  36  *   The list_lock protects the partial and full list on each node and
  37  *   the partial slab counter. If taken then no new slabs may be added or
  38  *   removed from the lists nor make the number of partial slabs be modified.
  39  *   (Note that the total number of slabs is an atomic value that may be
  40  *   modified without taking the list lock).
  41  *
  42  *   The list_lock is a centralized lock and thus we avoid taking it as
  43  *   much as possible. As long as SLUB does not have to handle partial
  44  *   slabs, operations can continue without any centralized lock. F.e.
  45  *   allocating a long series of objects that fill up slabs does not require
  46  *   the list lock.
  47  *
  48  *   The lock order is sometimes inverted when we are trying to get a slab
  49  *   off a list. We take the list_lock and then look for a page on the list
  50  *   to use. While we do that objects in the slabs may be freed. We can
  51  *   only operate on the slab if we have also taken the slab_lock. So we use
  52  *   a slab_trylock() on the slab. If trylock was successful then no frees
  53  *   can occur anymore and we can use the slab for allocations etc. If the
  54  *   slab_trylock() does not succeed then frees are in progress in the slab and
  55  *   we must stay away from it for a while since we may cause a bouncing
  56  *   cacheline if we try to acquire the lock. So go onto the next slab.
  57  *   If all pages are busy then we may allocate a new slab instead of reusing
   58  *   a partial slab. A new slab has no one operating on it and thus there is
  59  *   no danger of cacheline contention.
  60  *
  61  *   Interrupts are disabled during allocation and deallocation in order to
  62  *   make the slab allocator safe to use in the context of an irq. In addition
  63  *   interrupts are disabled to ensure that the processor does not change
  64  *   while handling per_cpu slabs, due to kernel preemption.
  65  *
  66  * SLUB assigns one slab for allocation to each processor.
  67  * Allocations only occur from these slabs called cpu slabs.
  68  *
  69  * Slabs with free elements are kept on a partial list and during regular
  70  * operations no list for full slabs is used. If an object in a full slab is
  71  * freed then the slab will show up again on the partial lists.
  72  * We track full slabs for debugging purposes though because otherwise we
  73  * cannot scan all objects.
  74  *
  75  * Slabs are freed when they become empty. Teardown and setup is
  76  * minimal so we rely on the page allocators per cpu caches for
  77  * fast frees and allocs.
  78  *
  79  * Overloading of page flags that are otherwise used for LRU management.
  80  *
  81  * PageActive           The slab is used as a cpu cache. Allocations
  82  *                      may be performed from the slab. The slab is not
  83  *                      on any slab list and cannot be moved onto one.
  84  *
  85  * PageError            Slab requires special handling due to debug
  86  *                      options set. This moves slab handling out of
  87  *                      the fast path.
  88  */
  89
  90 /*
  91  * Issues still to be resolved:
  92  *
   93  * - The per cpu array is updated for each new slab and is a remote
  94  *   cacheline for most nodes. This could become a bouncing cacheline given
  95  *   enough frequent updates. There are 16 pointers in a cacheline, so at
  96  *   max 16 cpus could compete for the cacheline which may be okay.
  97  *
  98  * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
  99  *
 100  * - Variable sizing of the per node arrays
 101  */
 102
  103 /* Enable to test recovery from slab corruption on boot */
  104 #undef SLUB_RESILIENCY_TEST
  105
  106 #if PAGE_SHIFT <= 12
  107
  108 /*
  109  * Small page size. Make sure that we do not fragment memory
  110  */
  111 #define DEFAULT_MAX_ORDER 1
  112 #define DEFAULT_MIN_OBJECTS 4
  113
  114 #else
  115
  116 /*
  117  * Large page machines are customarily able to handle larger
  118  * page orders.
  119  */
  120 #define DEFAULT_MAX_ORDER 2
  121 #define DEFAULT_MIN_OBJECTS 8
  122
  123 #endif
  124
  125 /*
  126  * Minimum number of partial slabs. These will be left on the partial
  127  * lists even if they are empty. kmem_cache_shrink may reclaim them.
  128  */
  129 #define MIN_PARTIAL 2
  130
  131 /*
  132  * Maximum number of desirable partial slabs.
  133  * The existence of more partial slabs makes kmem_cache_shrink
  134  * sort the partial list by the number of objects in use.
  135  */
  136 #define MAX_PARTIAL 10
  137
      /*
       * The four debug options combined into one mask.
       * NOTE(review): presumably the set switched on when debugging is
       * requested without naming options - the user is outside this section.
       */
  138 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
  139                                 SLAB_POISON | SLAB_STORE_USER)
  140
  141 /*
  142  * Set of flags that will prevent slab merging
  143  */
  144 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
  145                 SLAB_TRACE | SLAB_DESTROY_BY_RCU)
  146
      /*
       * NOTE(review): going by the name, flags that two caches must agree on
       * before they can be merged - confirm against the merge code.
       */
  147 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
  148                 SLAB_CACHE_DMA)
  149
      /* Fall back to the alignment of unsigned long long if the arch sets none */
  150 #ifndef ARCH_KMALLOC_MINALIGN
  151 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
  152 #endif
  153
  154 #ifndef ARCH_SLAB_MINALIGN
  155 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
  156 #endif
  157
  158 /* Internal SLUB flags */
  159 #define __OBJECT_POISON 0x80000000      /* Poison object */
  160
  161 /* Not all arches define cache_line_size */
  162 #ifndef cache_line_size
  163 #define cache_line_size()       L1_CACHE_BYTES
  164 #endif
 165
      /* Initialised to the full size of struct kmem_cache; users are not in this section */
  166 static int kmem_size = sizeof(struct kmem_cache);
  167
  168 #ifdef CONFIG_SMP
      /* Only exists on SMP builds. NOTE(review): presumably the cpu hotplug notifier - confirm where it is registered */
  169 static struct notifier_block slab_notifier;
  170 #endif
  171
      /* Bring-up stage of the allocator; slab_is_available() tests for UP or later */
  172 static enum {
  173         DOWN,           /* No slab functionality available */
  174         PARTIAL,        /* kmem_cache_open() works but kmalloc does not */
  175         UP,             /* Everything works but does not show up in sysfs */
  176         SYSFS           /* Sysfs up */
  177 } slab_state = DOWN;
  178
  179 /* A list of all slab caches on the system */
      /* NOTE(review): slub_lock presumably guards slab_caches - confirm against the users of both */
  180 static DECLARE_RWSEM(slub_lock);
  181 LIST_HEAD(slab_caches);
  182
      /*
       * sysfs hooks. Without CONFIG_SYSFS they collapse into stubs that do
       * nothing; the two int variants then always return 0.
       */
  183 #ifdef CONFIG_SYSFS
  184 static int sysfs_slab_add(struct kmem_cache *);
  185 static int sysfs_slab_alias(struct kmem_cache *, const char *);
  186 static void sysfs_slab_remove(struct kmem_cache *);
  187 #else
  188 static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
  189 static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; }
  190 static void sysfs_slab_remove(struct kmem_cache *s) {}
  191 #endif
 192
 193 /********************************************************************
 194  *                      Core slab cache functions
 195  *******************************************************************/
 196
 197 int slab_is_available(void)
 198 {
 199         return slab_state >= UP;
 200 }
 201
 202 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 203 {
 204 #ifdef CONFIG_NUMA
 205         return s->node[node];
 206 #else
 207         return &s->local_node;
 208 #endif
 209 }
 210
 211 /*
 212  * Slow version of get and set free pointer.
 213  *
 214  * This version requires touching the cache lines of kmem_cache which
 215  * we avoid to do in the fast alloc free paths. There we obtain the offset
 216  * from the page struct.
 217  */
 218 static inline void *get_freepointer(struct kmem_cache *s, void *object)
 219 {
 220         return *(void **)(object + s->offset);
 221 }
 222
 223 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 224 {
 225         *(void **)(object + s->offset) = fp;
 226 }
 227
 228 /* Loop over all objects in a slab */
 229 #define for_each_object(__p, __s, __addr) \
 230         for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
 231                         __p += (__s)->size)
 232
 233 /* Scan freelist */
 234 #define for_each_free_object(__p, __s, __free) \
 235         for (__p = (__free); __p; __p = get_freepointer((__s), __p))
 236
 237 /* Determine object index from a given position */
 238 static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
 239 {
 240         return (p - addr) / s->size;
 241 }
 242
 243 /*
 244  * Object debugging
 245  */
 246 static void print_section(char *text, u8 *addr, unsigned int length)
 247 {
 248         int i, offset;
 249         int newline = 1;
 250         char ascii[17];
 251
 252         ascii[16] = 0;
 253
 254         for (i = 0; i < length; i++) {
 255                 if (newline) {
 256                         printk(KERN_ERR "%10s 0x%p: ", text, addr + i);
 257                         newline = 0;
 258                 }
 259                 printk(" %02x", addr[i]);
 260                 offset = i % 16;
 261                 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
 262                 if (offset == 15) {
 263                         printk(" %s\n",ascii);
 264                         newline = 1;
 265                 }
 266         }
 267         if (!newline) {
 268                 i %= 16;
 269                 while (i < 16) {
 270                         printk("   ");
 271                         ascii[i] = ' ';
 272                         i++;
 273                 }
 274                 printk(" %s\n", ascii);
 275         }
 276 }
 277
  278 /*
  279  * Tracking user of a slab: one record describing an operation on an
       * object. set_track() fills in the fields.
  280  */
  281 struct track {
  282         void *addr;             /* Called from address */
  283         int cpu;                /* Was running on cpu */
  284         int pid;                /* Pid context */
  285         unsigned long when;     /* When did the operation occur (jiffies) */
  286 };
  287
      /* Index of a record behind the object: the alloc record comes first, the free record second */
  288 enum track_item { TRACK_ALLOC, TRACK_FREE };
 289
 290 static struct track *get_track(struct kmem_cache *s, void *object,
 291         enum track_item alloc)
 292 {
 293         struct track *p;
 294
 295         if (s->offset)
 296                 p = object + s->offset + sizeof(void *);
 297         else
 298                 p = object + s->inuse;
 299
 300         return p + alloc;
 301 }
 302
 303 static void set_track(struct kmem_cache *s, void *object,
 304                                 enum track_item alloc, void *addr)
 305 {
 306         struct track *p;
 307
 308         if (s->offset)
 309                 p = object + s->offset + sizeof(void *);
 310         else
 311                 p = object + s->inuse;
 312
 313         p += alloc;
 314         if (addr) {
 315                 p->addr = addr;
 316                 p->cpu = smp_processor_id();
 317                 p->pid = current ? current->pid : -1;
 318                 p->when = jiffies;
 319         } else
 320                 memset(p, 0, sizeof(struct track));
 321 }
 322
 323 static void init_tracking(struct kmem_cache *s, void *object)
 324 {
 325         if (s->flags & SLAB_STORE_USER) {
 326                 set_track(s, object, TRACK_FREE, NULL);
 327                 set_track(s, object, TRACK_ALLOC, NULL);
 328         }
 329 }
 330
 331 static void print_track(const char *s, struct track *t)
 332 {
 333         if (!t->addr)
 334                 return;
 335
 336         printk(KERN_ERR "%s: ", s);
 337         __print_symbol("%s", (unsigned long)t->addr);
 338         printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
 339 }
 340
 341 static void print_trailer(struct kmem_cache *s, u8 *p)
 342 {
 343         unsigned int off;       /* Offset of last byte */
 344
 345         if (s->flags & SLAB_RED_ZONE)
 346                 print_section("Redzone", p + s->objsize,
 347                         s->inuse - s->objsize);
 348
 349         printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n",
 350                         p + s->offset,
 351                         get_freepointer(s, p));
 352
 353         if (s->offset)
 354                 off = s->offset + sizeof(void *);
 355         else
 356                 off = s->inuse;
 357
 358         if (s->flags & SLAB_STORE_USER) {
 359                 print_track("Last alloc", get_track(s, p, TRACK_ALLOC));
 360                 print_track("Last free ", get_track(s, p, TRACK_FREE));
 361                 off += 2 * sizeof(struct track);
 362         }
 363
 364         if (off != s->size)
 365                 /* Beginning of the filler is the free pointer */
 366                 print_section("Filler", p + off, s->size - off);
 367 }
 368
 369 static void object_err(struct kmem_cache *s, struct page *page,
 370                         u8 *object, char *reason)
 371 {
 372         u8 *addr = page_address(page);
 373
 374         printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n",
 375                         s->name, reason, object, page);
 376         printk(KERN_ERR "    offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n",
 377                 object - addr, page->flags, page->inuse, page->freelist);
 378         if (object > addr + 16)
 379                 print_section("Bytes b4", object - 16, 16);
 380         print_section("Object", object, min(s->objsize, 128));
 381         print_trailer(s, object);
 382         dump_stack();
 383 }
 384
 385 static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...)
 386 {
 387         va_list args;
 388         char buf[100];
 389
 390         va_start(args, reason);
 391         vsnprintf(buf, sizeof(buf), reason, args);
 392         va_end(args);
 393         printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf,
 394                 page);
 395         dump_stack();
 396 }
 397
 398 static void init_object(struct kmem_cache *s, void *object, int active)
 399 {
 400         u8 *p = object;
 401
 402         if (s->flags & __OBJECT_POISON) {
 403                 memset(p, POISON_FREE, s->objsize - 1);
 404                 p[s->objsize -1] = POISON_END;
 405         }
 406
 407         if (s->flags & SLAB_RED_ZONE)
 408                 memset(p + s->objsize,
 409                         active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
 410                         s->inuse - s->objsize);
 411 }
 412
 413 static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
 414 {
 415         while (bytes) {
 416                 if (*start != (u8)value)
 417                         return 0;
 418                 start++;
 419                 bytes--;
 420         }
 421         return 1;
 422 }
 423
 424 static inline int check_valid_pointer(struct kmem_cache *s,
 425                                 struct page *page, const void *object)
 426 {
 427         void *base;
 428
 429         if (!object)
 430                 return 1;
 431
 432         base = page_address(page);
 433         if (object < base || object >= base + s->objects * s->size ||
 434                 (object - base) % s->size) {
 435                 return 0;
 436         }
 437
 438         return 1;
 439 }
 440
 441 /*
 442  * Object layout:
 443  *
 444  * object address
 445  *      Bytes of the object to be managed.
 446  *      If the freepointer may overlay the object then the free
 447  *      pointer is the first word of the object.
 448  *
 449  *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
 450  *      0xa5 (POISON_END)
 451  *
 452  * object + s->objsize
 453  *      Padding to reach word boundary. This is also used for Redzoning.
 454  *      Padding is extended by another word if Redzoning is enabled and
 455  *      objsize == inuse.
 456  *
 457  *      We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 458  *      0xcc (RED_ACTIVE) for objects in use.
 459  *
 460  * object + s->inuse
 461  *      Meta data starts here.
 462  *
 463  *      A. Free pointer (if we cannot overwrite object on free)
 464  *      B. Tracking data for SLAB_STORE_USER
  465  *      C. Padding to reach required alignment boundary or at minimum
  466  *              one word if debugging is on to be able to detect writes
 467  *              before the word boundary.
 468  *
 469  *      Padding is done using 0x5a (POISON_INUSE)
 470  *
 471  * object + s->size
 472  *      Nothing is used beyond s->size.
 473  *
 474  * If slabcaches are merged then the objsize and inuse boundaries are mostly
 475  * ignored. And therefore no slab options that rely on these boundaries
 476  * may be used with merged slabcaches.
 477  */
 478
 479 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 480                                                 void *from, void *to)
 481 {
 482         printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n",
 483                 s->name, message, data, from, to - 1);
 484         memset(from, data, to - from);
 485 }
 486
 487 static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 488 {
 489         unsigned long off = s->inuse;   /* The end of info */
 490
 491         if (s->offset)
 492                 /* Freepointer is placed after the object. */
 493                 off += sizeof(void *);
 494
 495         if (s->flags & SLAB_STORE_USER)
 496                 /* We also have user information there */
 497                 off += 2 * sizeof(struct track);
 498
 499         if (s->size == off)
 500                 return 1;
 501
 502         if (check_bytes(p + off, POISON_INUSE, s->size - off))
 503                 return 1;
 504
 505         object_err(s, page, p, "Object padding check fails");
 506
 507         /*
 508          * Restore padding
 509          */
 510         restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size);
 511         return 0;
 512 }
 513
 514 static int slab_pad_check(struct kmem_cache *s, struct page *page)
 515 {
 516         u8 *p;
 517         int length, remainder;
 518
 519         if (!(s->flags & SLAB_POISON))
 520                 return 1;
 521
 522         p = page_address(page);
 523         length = s->objects * s->size;
 524         remainder = (PAGE_SIZE << s->order) - length;
 525         if (!remainder)
 526                 return 1;
 527
 528         if (!check_bytes(p + length, POISON_INUSE, remainder)) {
 529                 slab_err(s, page, "Padding check failed");
 530                 restore_bytes(s, "slab padding", POISON_INUSE, p + length,
 531                         p + length + remainder);
 532                 return 0;
 533         }
 534         return 1;
 535 }
 536
 537 static int check_object(struct kmem_cache *s, struct page *page,
 538                                         void *object, int active)
 539 {
 540         u8 *p = object;
 541         u8 *endobject = object + s->objsize;
 542
 543         if (s->flags & SLAB_RED_ZONE) {
 544                 unsigned int red =
 545                         active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
 546
 547                 if (!check_bytes(endobject, red, s->inuse - s->objsize)) {
 548                         object_err(s, page, object,
 549                         active ? "Redzone Active" : "Redzone Inactive");
 550                         restore_bytes(s, "redzone", red,
 551                                 endobject, object + s->inuse);
 552                         return 0;
 553                 }
 554         } else {
 555                 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse &&
 556                         !check_bytes(endobject, POISON_INUSE,
 557                                         s->inuse - s->objsize)) {
 558                 object_err(s, page, p, "Alignment padding check fails");
 559                 /*
 560                  * Fix it so that there will not be another report.
 561                  *
 562                  * Hmmm... We may be corrupting an object that now expects
 563                  * to be longer than allowed.
 564                  */
 565                 restore_bytes(s, "alignment padding", POISON_INUSE,
 566                         endobject, object + s->inuse);
 567                 }
 568         }
 569
 570         if (s->flags & SLAB_POISON) {
 571                 if (!active && (s->flags & __OBJECT_POISON) &&
 572                         (!check_bytes(p, POISON_FREE, s->objsize - 1) ||
 573                                 p[s->objsize - 1] != POISON_END)) {
 574
 575                         object_err(s, page, p, "Poison check failed");
 576                         restore_bytes(s, "Poison", POISON_FREE,
 577                                                 p, p + s->objsize -1);
 578                         restore_bytes(s, "Poison", POISON_END,
 579                                         p + s->objsize - 1, p + s->objsize);
 580                         return 0;
 581                 }
 582                 /*
 583                  * check_pad_bytes cleans up on its own.
 584                  */
 585                 check_pad_bytes(s, page, p);
 586         }
 587
 588         if (!s->offset && active)
 589                 /*
 590                  * Object and freepointer overlap. Cannot check
 591                  * freepointer while object is allocated.
 592                  */
 593                 return 1;
 594
 595         /* Check free pointer validity */
 596         if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
 597                 object_err(s, page, p, "Freepointer corrupt");
 598                 /*
 599                  * No choice but to zap it and thus loose the remainder
 600                  * of the free objects in this slab. May cause
 601                  * another error because the object count is now wrong.
 602                  */
 603                 set_freepointer(s, p, NULL);
 604                 return 0;
 605         }
 606         return 1;
 607 }
 608
 609 static int check_slab(struct kmem_cache *s, struct page *page)
 610 {
 611         VM_BUG_ON(!irqs_disabled());
 612
 613         if (!PageSlab(page)) {
 614                 slab_err(s, page, "Not a valid slab page flags=%lx "
 615                         "mapping=0x%p count=%d", page->flags, page->mapping,
 616                         page_count(page));
 617                 return 0;
 618         }
 619         if (page->offset * sizeof(void *) != s->offset) {
 620                 slab_err(s, page, "Corrupted offset %lu flags=0x%lx "
 621                         "mapping=0x%p count=%d",
 622                         (unsigned long)(page->offset * sizeof(void *)),
 623                         page->flags,
 624                         page->mapping,
 625                         page_count(page));
 626                 return 0;
 627         }
 628         if (page->inuse > s->objects) {
 629                 slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx "
 630                         "mapping=0x%p count=%d",
 631                         s->name, page->inuse, s->objects, page->flags,
 632                         page->mapping, page_count(page));
 633                 return 0;
 634         }
 635         /* Slab_pad_check fixes things up after itself */
 636         slab_pad_check(s, page);
 637         return 1;
 638 }
 639
 640 /*
 641  * Determine if a certain object on a page is on the freelist. Must hold the
 642  * slab lock to guarantee that the chains are in a consistent state.
 643  */
 644 static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 645 {
 646         int nr = 0;
 647         void *fp = page->freelist;
 648         void *object = NULL;
 649
 650         while (fp && nr <= s->objects) {
 651                 if (fp == search)
 652                         return 1;
 653                 if (!check_valid_pointer(s, page, fp)) {
 654                         if (object) {
 655                                 object_err(s, page, object,
 656                                         "Freechain corrupt");
 657                                 set_freepointer(s, object, NULL);
 658                                 break;
 659                         } else {
 660                                 slab_err(s, page, "Freepointer 0x%p corrupt",
 661                                                                         fp);
 662                                 page->freelist = NULL;
 663                                 page->inuse = s->objects;
 664                                 printk(KERN_ERR "@@@ SLUB %s: Freelist "
 665                                         "cleared. Slab 0x%p\n",
 666                                         s->name, page);
 667                                 return 0;
 668                         }
 669                         break;
 670                 }
 671                 object = fp;
 672                 fp = get_freepointer(s, object);
 673                 nr++;
 674         }
 675
 676         if (page->inuse != s->objects - nr) {
 677                 slab_err(s, page, "Wrong object count. Counter is %d but "
 678                         "counted were %d", s, page, page->inuse,
 679                                                         s->objects - nr);
 680                 page->inuse = s->objects - nr;
 681                 printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. "
 682                         "Slab @0x%p\n", s->name, page);
 683         }
 684         return search == NULL;
 685 }
 686
 687 /*
 688  * Tracking of fully allocated slabs for debugging purposes.
 689  */
 690 static void add_full(struct kmem_cache_node *n, struct page *page)
 691 {
 692         spin_lock(&n->list_lock);
 693         list_add(&page->lru, &n->full);
 694         spin_unlock(&n->list_lock);
 695 }
 696
 697 static void remove_full(struct kmem_cache *s, struct page *page)
 698 {
 699         struct kmem_cache_node *n;
 700
 701         if (!(s->flags & SLAB_STORE_USER))
 702                 return;
 703
 704         n = get_node(s, page_to_nid(page));
 705
 706         spin_lock(&n->list_lock);
 707         list_del(&page->lru);
 708         spin_unlock(&n->list_lock);
 709 }
 710
 711 static int alloc_object_checks(struct kmem_cache *s, struct page *page,
 712                                                         void *object)
 713 {
 714         if (!check_slab(s, page))
 715                 goto bad;
 716
 717         if (object && !on_freelist(s, page, object)) {
 718                 slab_err(s, page, "Object 0x%p already allocated", object);
 719                 goto bad;
 720         }
 721
 722         if (!check_valid_pointer(s, page, object)) {
 723                 object_err(s, page, object, "Freelist Pointer check fails");
 724                 goto bad;
 725         }
 726
 727         if (!object)
 728                 return 1;
 729
 730         if (!check_object(s, page, object, 0))
 731                 goto bad;
 732
 733         return 1;
 734 bad:
 735         if (PageSlab(page)) {
 736                 /*
 737                  * If this is a slab page then lets do the best we can
 738                  * to avoid issues in the future. Marking all objects
 739                  * as used avoids touching the remaining objects.
 740                  */
 741                 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
 742                         s->name, page);
 743                 page->inuse = s->objects;
 744                 page->freelist = NULL;
 745                 /* Fix up fields that may be corrupted */
 746                 page->offset = s->offset / sizeof(void *);
 747         }
 748         return 0;
 749 }
 750
 751 static int free_object_checks(struct kmem_cache *s, struct page *page,
 752                                                         void *object)
 753 {
 754         if (!check_slab(s, page))
 755                 goto fail;
 756
 757         if (!check_valid_pointer(s, page, object)) {
 758                 slab_err(s, page, "Invalid object pointer 0x%p", object);
 759                 goto fail;
 760         }
 761
 762         if (on_freelist(s, page, object)) {
 763                 slab_err(s, page, "Object 0x%p already free", object);
 764                 goto fail;
 765         }
 766
 767         if (!check_object(s, page, object, 1))
 768                 return 0;
 769
 770         if (unlikely(s != page->slab)) {
 771                 if (!PageSlab(page))
 772                         slab_err(s, page, "Attempt to free object(0x%p) "
 773                                 "outside of slab", object);
 774                 else
 775                 if (!page->slab) {
 776                         printk(KERN_ERR
 777                                 "SLUB <none>: no slab for object 0x%p.\n",
 778                                                 object);
 779                         dump_stack();
 780                 }
 781                 else
 782                         slab_err(s, page, "object at 0x%p belongs "
 783                                 "to slab %s", object, page->slab->name);
 784                 goto fail;
 785         }
 786         return 1;
 787 fail:
 788         printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
 789                 s->name, page, object);
 790         return 0;
 791 }
 792
 793 /*
 794  * Slab allocation and freeing
 795  */
 796 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 797 {
 798         struct page * page;
 799         int pages = 1 << s->order;
 800
 801         if (s->order)
 802                 flags |= __GFP_COMP;
 803
 804         if (s->flags & SLAB_CACHE_DMA)
 805                 flags |= SLUB_DMA;
 806
 807         if (node == -1)
 808                 page = alloc_pages(flags, s->order);
 809         else
 810                 page = alloc_pages_node(node, flags, s->order);
 811
 812         if (!page)
 813                 return NULL;
 814
 815         mod_zone_page_state(page_zone(page),
 816                 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
 817                 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 818                 pages);
 819
 820         return page;
 821 }
 822
 823 static void setup_object(struct kmem_cache *s, struct page *page,
 824                                 void *object)
 825 {
 826         if (PageError(page)) {
 827                 init_object(s, object, 0);
 828                 init_tracking(s, object);
 829         }
 830
 831         if (unlikely(s->ctor))
 832                 s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR);
 833 }
 834
/*
 * Allocate a new slab for cache @s and thread all of its objects onto the
 * slab's freelist.
 *
 * @flags may only contain bits from GFP_DMA | GFP_LEVEL_MASK (enforced with
 * BUG_ON). If __GFP_WAIT is set, interrupts are enabled around the page
 * allocation and the slab setup and disabled again before returning, so
 * such callers get control back with interrupts disabled.
 *
 * Returns the new page with page->inuse == 0 and page->freelist pointing
 * at the first object, or NULL if no pages could be allocated. The slab
 * lock is not taken here.
 */
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	struct kmem_cache_node *n;
	void *start;
	void *end;
	void *last;
	void *p;

	BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));

	if (flags & __GFP_WAIT)
		local_irq_enable();

	page = allocate_slab(s, flags & GFP_LEVEL_MASK, node);
	if (!page)
		goto out;

	/* Count the slab on the node it actually landed on, if that node
	 * already has a kmem_cache_node. */
	n = get_node(s, page_to_nid(page));
	if (n)
		atomic_long_inc(&n->nr_slabs);
	/* page->offset is kept in units of words, s->offset in bytes. */
	page->offset = s->offset / sizeof(void *);
	page->slab = s;
	page->flags |= 1 << PG_slab;
	/* Any debug option marks the slab with PG_error; the allocation and
	 * free paths test PageError() to leave their fast path. */
	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
			SLAB_STORE_USER | SLAB_TRACE))
		page->flags |= 1 << PG_error;

	start = page_address(page);
	/* NOTE(review): `end` is not referenced again in the text of this
	 * function; confirm whether for_each_object() relies on it. */
	end = start + s->objects * s->size;

	if (unlikely(s->flags & SLAB_POISON))
		memset(start, POISON_INUSE, PAGE_SIZE << s->order);

	/* Link each object to its successor; `last` trails `p` by one step. */
	last = start;
	for_each_object(p, s, start) {
		setup_object(s, page, last);
		set_freepointer(s, last, p);
		last = p;
	}
	/* The final object terminates the freelist. */
	setup_object(s, page, last);
	set_freepointer(s, last, NULL);

	page->freelist = start;
	page->inuse = 0;
out:
	if (flags & __GFP_WAIT)
		local_irq_disable();
	return page;
}
 885
 886 static void __free_slab(struct kmem_cache *s, struct page *page)
 887 {
 888         int pages = 1 << s->order;
 889
 890         if (unlikely(PageError(page) || s->dtor)) {
 891                 void *p;
 892
 893                 slab_pad_check(s, page);
 894                 for_each_object(p, s, page_address(page)) {
 895                         if (s->dtor)
 896                                 s->dtor(p, s, 0);
 897                         check_object(s, page, p, 0);
 898                 }
 899         }
 900
 901         mod_zone_page_state(page_zone(page),
 902                 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
 903                 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 904                 - pages);
 905
 906         page->mapping = NULL;
 907         __free_pages(page, s->order);
 908 }
 909
 910 static void rcu_free_slab(struct rcu_head *h)
 911 {
 912         struct page *page;
 913
 914         page = container_of((struct list_head *)h, struct page, lru);
 915         __free_slab(page->slab, page);
 916 }
 917
 918 static void free_slab(struct kmem_cache *s, struct page *page)
 919 {
 920         if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
 921                 /*
 922                  * RCU free overloads the RCU head over the LRU
 923                  */
 924                 struct rcu_head *head = (void *)&page->lru;
 925
 926                 call_rcu(head, rcu_free_slab);
 927         } else
 928                 __free_slab(s, page);
 929 }
 930
 931 static void discard_slab(struct kmem_cache *s, struct page *page)
 932 {
 933         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
 934
 935         atomic_long_dec(&n->nr_slabs);
 936         reset_page_mapcount(page);
 937         page->flags &= ~(1 << PG_slab | 1 << PG_error);
 938         free_slab(s, page);
 939 }
 940
 941 /*
 942  * Per slab locking using the pagelock
 943  */
 944 static __always_inline void slab_lock(struct page *page)
 945 {
 946         bit_spin_lock(PG_locked, &page->flags);
 947 }
 948
 949 static __always_inline void slab_unlock(struct page *page)
 950 {
 951         bit_spin_unlock(PG_locked, &page->flags);
 952 }
 953
 954 static __always_inline int slab_trylock(struct page *page)
 955 {
 956         int rc = 1;
 957
 958         rc = bit_spin_trylock(PG_locked, &page->flags);
 959         return rc;
 960 }
 961
 962 /*
 963  * Management of partially allocated slabs
 964  */
 965 static void add_partial_tail(struct kmem_cache_node *n, struct page *page)
 966 {
 967         spin_lock(&n->list_lock);
 968         n->nr_partial++;
 969         list_add_tail(&page->lru, &n->partial);
 970         spin_unlock(&n->list_lock);
 971 }
 972
 973 static void add_partial(struct kmem_cache_node *n, struct page *page)
 974 {
 975         spin_lock(&n->list_lock);
 976         n->nr_partial++;
 977         list_add(&page->lru, &n->partial);
 978         spin_unlock(&n->list_lock);
 979 }
 980
 981 static void remove_partial(struct kmem_cache *s,
 982                                                 struct page *page)
 983 {
 984         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
 985
 986         spin_lock(&n->list_lock);
 987         list_del(&page->lru);
 988         n->nr_partial--;
 989         spin_unlock(&n->list_lock);
 990 }
 991
 992 /*
 993  * Lock slab and remove from the partial list.
 994  *
 995  * Must hold list_lock.
 996  */
 997 static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page)
 998 {
 999         if (slab_trylock(page)) {
1000                 list_del(&page->lru);
1001                 n->nr_partial--;
1002                 return 1;
1003         }
1004         return 0;
1005 }
1006
1007 /*
1008  * Try to allocate a partial slab from a specific node.
1009  */
1010 static struct page *get_partial_node(struct kmem_cache_node *n)
1011 {
1012         struct page *page;
1013
1014         /*
1015          * Racy check. If we mistakenly see no partial slabs then we
1016          * just allocate an empty slab. If we mistakenly try to get a
1017          * partial slab and there is none available then get_partials()
1018          * will return NULL.
1019          */
1020         if (!n || !n->nr_partial)
1021                 return NULL;
1022
1023         spin_lock(&n->list_lock);
1024         list_for_each_entry(page, &n->partial, lru)
1025                 if (lock_and_del_slab(n, page))
1026                         goto out;
1027         page = NULL;
1028 out:
1029         spin_unlock(&n->list_lock);
1030         return page;
1031 }
1032
1033 /*
1034  * Get a page from somewhere. Search in increasing NUMA distances.
1035  */
1036 static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1037 {
1038 #ifdef CONFIG_NUMA
1039         struct zonelist *zonelist;
1040         struct zone **z;
1041         struct page *page;
1042
1043         /*
1044          * The defrag ratio allows a configuration of the tradeoffs between
1045          * inter node defragmentation and node local allocations. A lower
1046          * defrag_ratio increases the tendency to do local allocations
1047          * instead of attempting to obtain partial slabs from other nodes.
1048          *
1049          * If the defrag_ratio is set to 0 then kmalloc() always
1050          * returns node local objects. If the ratio is higher then kmalloc()
1051          * may return off node objects because partial slabs are obtained
1052          * from other nodes and filled up.
1053          *
1054          * If /sys/slab/xx/defrag_ratio is set to 100 (which makes
1055          * defrag_ratio = 1000) then every (well almost) allocation will
1056          * first attempt to defrag slab caches on other nodes. This means
1057          * scanning over all nodes to look for partial slabs which may be
1058          * expensive if we do it every time we are trying to find a slab
1059          * with available objects.
1060          */
1061         if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
1062                 return NULL;
1063
1064         zonelist = &NODE_DATA(slab_node(current->mempolicy))
1065                                         ->node_zonelists[gfp_zone(flags)];
1066         for (z = zonelist->zones; *z; z++) {
1067                 struct kmem_cache_node *n;
1068
1069                 n = get_node(s, zone_to_nid(*z));
1070
1071                 if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
1072                                 n->nr_partial > MIN_PARTIAL) {
1073                         page = get_partial_node(n);
1074                         if (page)
1075                                 return page;
1076                 }
1077         }
1078 #endif
1079         return NULL;
1080 }
1081
1082 /*
1083  * Get a partial page, lock it and return it.
1084  */
1085 static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1086 {
1087         struct page *page;
1088         int searchnode = (node == -1) ? numa_node_id() : node;
1089
1090         page = get_partial_node(get_node(s, searchnode));
1091         if (page || (flags & __GFP_THISNODE))
1092                 return page;
1093
1094         return get_any_partial(s, flags);
1095 }
1096
1097 /*
1098  * Move a page back to the lists.
1099  *
1100  * Must be called with the slab lock held.
1101  *
1102  * On exit the slab lock will have been dropped.
1103  */
1104 static void putback_slab(struct kmem_cache *s, struct page *page)
1105 {
1106         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1107
1108         if (page->inuse) {
1109
1110                 if (page->freelist)
1111                         add_partial(n, page);
1112                 else if (PageError(page) && (s->flags & SLAB_STORE_USER))
1113                         add_full(n, page);
1114                 slab_unlock(page);
1115
1116         } else {
1117                 if (n->nr_partial < MIN_PARTIAL) {
1118                         /*
1119                          * Adding an empty slab to the partial slabs in order
1120                          * to avoid page allocator overhead. This slab needs
1121                          * to come after the other slabs with objects in
1122                          * order to fill them up. That way the size of the
1123                          * partial list stays small. kmem_cache_shrink can
1124                          * reclaim empty slabs from the partial list.
1125                          */
1126                         add_partial_tail(n, page);
1127                         slab_unlock(page);
1128                 } else {
1129                         slab_unlock(page);
1130                         discard_slab(s, page);
1131                 }
1132         }
1133 }
1134
1135 /*
1136  * Remove the cpu slab
1137  */
1138 static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
1139 {
1140         s->cpu_slab[cpu] = NULL;
1141         ClearPageActive(page);
1142
1143         putback_slab(s, page);
1144 }
1145
/*
 * Lock @page and deactivate it as the cpu slab of @cpu. The slab lock is
 * released again inside deactivate_slab().
 */
static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
{
	slab_lock(page);
	deactivate_slab(s, page, cpu);
}
1151
1152 /*
1153  * Flush cpu slab.
1154  * Called from IPI handler with interrupts disabled.
1155  */
1156 static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1157 {
1158         struct page *page = s->cpu_slab[cpu];
1159
1160         if (likely(page))
1161                 flush_slab(s, page, cpu);
1162 }
1163
/*
 * Callback used by flush_all(): flush the cpu slab of the cache passed in
 * @d on the processor this runs on.
 */
static void flush_cpu_slab(void *d)
{
	struct kmem_cache *s = d;

	__flush_cpu_slab(s, smp_processor_id());
}
1171
/*
 * Flush the cpu slabs of cache @s. On SMP flush_cpu_slab() is run through
 * on_each_cpu(); on UP it is called directly with interrupts disabled.
 */
static void flush_all(struct kmem_cache *s)
{
#ifndef CONFIG_SMP
	unsigned long flags;

	local_irq_save(flags);
	flush_cpu_slab(s);
	local_irq_restore(flags);
#else
	on_each_cpu(flush_cpu_slab, s, 1, 1);
#endif
}
1184
/*
 * slab_alloc is optimized to only modify two cachelines on the fast path
 * (aside from the stack):
 *
 * 1. The page struct
 * 2. The first cacheline of the object to be allocated.
 *
 * The only other cache lines that are read (apart from code) is the
 * per cpu array in the kmem_cache struct.
 *
 * Fastpath is not possible if we need to get a new slab or have
 * debugging enabled (which means all slabs are marked with PageError)
 *
 * @node is the node the object must come from, or -1 for any node.
 * @addr is the caller address recorded by set_track() when the cache has
 * SLAB_STORE_USER set.
 *
 * Returns the object, or NULL when neither a partial slab nor a new slab
 * could be obtained. Interrupts are saved on entry and restored on every
 * return path.
 */
static void *slab_alloc(struct kmem_cache *s,
				gfp_t gfpflags, int node, void *addr)
{
	struct page *page;
	void **object;
	unsigned long flags;
	int cpu;

	local_irq_save(flags);
	cpu = smp_processor_id();
	page = s->cpu_slab[cpu];
	if (!page)
		goto new_slab;

	slab_lock(page);
	/* A cpu slab on the wrong node is of no use for a node request. */
	if (unlikely(node != -1 && page_to_nid(page) != node))
		goto another_slab;
redo:
	/* Reached with `page` locked and installed as the cpu slab. */
	object = page->freelist;
	if (unlikely(!object))
		goto another_slab;
	if (unlikely(PageError(page)))
		goto debug;

have_object:
	/* Pop the object: its free pointer lives page->offset words in. */
	page->inuse++;
	page->freelist = object[page->offset];
	slab_unlock(page);
	local_irq_restore(flags);
	return object;

another_slab:
	/* Gives up the cpu slab; putback_slab() drops the slab lock. */
	deactivate_slab(s, page, cpu);

new_slab:
	/* A page returned by get_partial() is already locked. */
	page = get_partial(s, gfpflags, node);
	if (likely(page)) {
have_slab:
		s->cpu_slab[cpu] = page;
		SetPageActive(page);
		goto redo;
	}

	page = new_slab(s, gfpflags, node);
	if (page) {
		/* new_slab() may have enabled interrupts; re-read the cpu. */
		cpu = smp_processor_id();
		if (s->cpu_slab[cpu]) {
			/*
			 * Someone else populated the cpu_slab while we
			 * enabled interrupts, or we have gotten scheduled
			 * on another cpu. The page may not be on the
			 * requested node even if __GFP_THISNODE was
			 * specified. So we need to recheck.
			 */
			if (node == -1 ||
				page_to_nid(s->cpu_slab[cpu]) == node) {
				/*
				 * Current cpuslab is acceptable and we
				 * want the current one since its cache hot
				 */
				discard_slab(s, page);
				page = s->cpu_slab[cpu];
				slab_lock(page);
				goto redo;
			}
			/* New slab does not fit our expectations */
			flush_slab(s, s->cpu_slab[cpu], cpu);
		}
		/* The fresh slab is not locked yet; lock before installing. */
		slab_lock(page);
		goto have_slab;
	}
	local_irq_restore(flags);
	return NULL;
debug:
	/* Slow path for debug slabs; a failed check abandons this slab. */
	if (!alloc_object_checks(s, page, object))
		goto another_slab;
	if (s->flags & SLAB_STORE_USER)
		set_track(s, object, TRACK_ALLOC, addr);
	if (s->flags & SLAB_TRACE) {
		printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n",
			s->name, object, page->inuse,
			page->freelist);
		dump_stack();
	}
	init_object(s, object, 1);
	goto have_object;
}
1285
1286 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1287 {
1288         return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
1289 }
1290 EXPORT_SYMBOL(kmem_cache_alloc);
1291
1292 #ifdef CONFIG_NUMA
1293 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1294 {
1295         return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
1296 }
1297 EXPORT_SYMBOL(kmem_cache_alloc_node);
1298 #endif
1299
/*
 * The fastpath only writes the cacheline of the page struct and the first
 * cacheline of the object.
 *
 * We read the cpu_slab cacheline to check if the slab is the per cpu
 * slab for this processor.
 *
 * @page is the slab page that contains object @x. @addr is the caller
 * address recorded by set_track() when the cache has SLAB_STORE_USER set.
 * Runs with interrupts disabled and the slab lock held while the freelist
 * is modified.
 */
static void slab_free(struct kmem_cache *s, struct page *page,
					void *x, void *addr)
{
	void *prior;
	void **object = (void *)x;
	unsigned long flags;

	local_irq_save(flags);
	slab_lock(page);

	if (unlikely(PageError(page)))
		goto debug;
checks_ok:
	/* Push the object; `prior` remembers the old freelist head. */
	prior = object[page->offset] = page->freelist;
	page->freelist = object;
	page->inuse--;

	if (unlikely(PageActive(page)))
		/*
		 * Cpu slabs are never on partial lists and are
		 * never freed.
		 */
		goto out_unlock;

	if (unlikely(!page->inuse))
		goto slab_empty;

	/*
	 * Objects left in the slab. If it
	 * was not on the partial list before
	 * then add it.
	 */
	if (unlikely(!prior))
		add_partial(get_node(s, page_to_nid(page)), page);

out_unlock:
	slab_unlock(page);
	local_irq_restore(flags);
	return;

slab_empty:
	/* A NULL `prior` means the slab had no free objects before. */
	if (prior)
		/*
		 * Slab still on the partial list.
		 */
		remove_partial(s, page);

	slab_unlock(page);
	discard_slab(s, page);
	local_irq_restore(flags);
	return;

debug:
	/* Slow path for debug slabs; a failed check leaves the object alone. */
	if (!free_object_checks(s, page, x))
		goto out_unlock;
	/* No free objects and not a cpu slab: take it off the full list. */
	if (!PageActive(page) && !page->freelist)
		remove_full(s, page);
	if (s->flags & SLAB_STORE_USER)
		set_track(s, x, TRACK_FREE, addr);
	if (s->flags & SLAB_TRACE) {
		printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
			s->name, object, page->inuse,
			page->freelist);
		print_section("Object", (void *)object, s->objsize);
		dump_stack();
	}
	init_object(s, object, 0);
	goto checks_ok;
}
1376
/*
 * Free object @x back to cache @s. The slab is located through the head
 * page of the object's address; the caller's return address is passed
 * along for free tracking.
 */
void kmem_cache_free(struct kmem_cache *s, void *x)
{
	struct page *slab = virt_to_head_page(x);

	slab_free(s, slab, x, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_free);
1386
1387 /* Figure out on which slab object the object resides */
1388 static struct page *get_object_page(const void *x)
1389 {
1390         struct page *page = virt_to_head_page(x);
1391
1392         if (!PageSlab(page))
1393                 return NULL;
1394
1395         return page;
1396 }
1397
1398 /*
1399  * Object placement in a slab is made very easy because we always start at
1400  * offset 0. If we tune the size of the object to the alignment then we can
1401  * get the required alignment by putting one properly sized object after
1402  * another.
1403  *
1404  * Notice that the allocation order determines the sizes of the per cpu
1405  * caches. Each processor has always one slab available for allocations.
1406  * Increasing the allocation order reduces the number of times that slabs
1407  * must be moved on and off the partial lists and is therefore a factor in
1408  * locking overhead.
1409  */
1410
/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 *
 * slub_min_objects is the number of objects a slab should hold at least
 * while the order is still below slub_max_order (see calculate_order()).
 */
static int slub_min_order;
static int slub_max_order = DEFAULT_MAX_ORDER;
static int slub_min_objects = DEFAULT_MIN_OBJECTS;

/*
 * Merge control. If this is set then no merging of slab caches will occur.
 * (Could be removed. This was introduced to pacify the merge skeptics.)
 */
static int slub_nomerge;

/*
 * Debug settings:
 *
 * slub_debug holds the flag bits that kmem_cache_open() ORs into the
 * flags of a new cache.
 */
static int slub_debug;

/*
 * If set, slub_debug is only applied to caches whose name starts with
 * this string (kmem_cache_open() compares with strncmp()).
 */
static char *slub_debug_slabs;
1433
1434 /*
1435  * Calculate the order of allocation given an slab object size.
1436  *
1437  * The order of allocation has significant impact on performance and other
1438  * system components. Generally order 0 allocations should be preferred since
1439  * order 0 does not cause fragmentation in the page allocator. Larger objects
1440  * be problematic to put into order 0 slabs because there may be too much
1441  * unused space left. We go to a higher order if more than 1/8th of the slab
1442  * would be wasted.
1443  *
1444  * In order to reach satisfactory performance we must ensure that a minimum
1445  * number of objects is in one slab. Otherwise we may generate too much
1446  * activity on the partial lists which requires taking the list_lock. This is
1447  * less a concern for large slabs though which are rarely used.
1448  *
1449  * slub_max_order specifies the order where we begin to stop considering the
1450  * number of objects in a slab as critical. If we reach slub_max_order then
1451  * we try to keep the page order as low as possible. So we accept more waste
1452  * of space in favor of a small page order.
1453  *
1454  * Higher order allocations also allow the placement of more objects in a
1455  * slab and thereby reduce object handling overhead. If the user has
1456  * requested a higher mininum order then we start with that one instead of
1457  * the smallest order which will fit the object.
1458  */
1459 static int calculate_order(int size)
1460 {
1461         int order;
1462         int rem;
1463
1464         for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT);
1465                         order < MAX_ORDER; order++) {
1466                 unsigned long slab_size = PAGE_SIZE << order;
1467
1468                 if (slub_max_order > order &&
1469                                 slab_size < slub_min_objects * size)
1470                         continue;
1471
1472                 if (slab_size < size)
1473                         continue;
1474
1475                 rem = slab_size % size;
1476
1477                 if (rem <= slab_size / 8)
1478                         break;
1479
1480         }
1481         if (order >= MAX_ORDER)
1482                 return -E2BIG;
1483
1484         return order;
1485 }
1486
1487 /*
1488  * Figure out what the alignment of the objects will be.
1489  */
1490 static unsigned long calculate_alignment(unsigned long flags,
1491                 unsigned long align, unsigned long size)
1492 {
1493         /*
1494          * If the user wants hardware cache aligned objects then
1495          * follow that suggestion if the object is sufficiently
1496          * large.
1497          *
1498          * The hardware cache alignment cannot override the
1499          * specified alignment though. If that is greater
1500          * then use it.
1501          */
1502         if ((flags & SLAB_HWCACHE_ALIGN) &&
1503                         size > cache_line_size() / 2)
1504                 return max_t(unsigned long, align, cache_line_size());
1505
1506         if (align < ARCH_SLAB_MINALIGN)
1507                 return ARCH_SLAB_MINALIGN;
1508
1509         return ALIGN(align, sizeof(void *));
1510 }
1511
1512 static void init_kmem_cache_node(struct kmem_cache_node *n)
1513 {
1514         n->nr_partial = 0;
1515         atomic_long_set(&n->nr_slabs, 0);
1516         spin_lock_init(&n->list_lock);
1517         INIT_LIST_HEAD(&n->partial);
1518         INIT_LIST_HEAD(&n->full);
1519 }
1520
1521 #ifdef CONFIG_NUMA
/*
 * No kmalloc_node yet so do it by hand. We know that this is the first
 * slab on the node for this slabcache. There are no concurrent accesses
 * possible.
 *
 * Note that this function only works on the kmalloc_node_cache
 * when allocating for the kmalloc_node_cache.
 *
 * A slab is allocated on @node for kmalloc_caches, its first object is
 * taken off the freelist by hand and used as the node structure, which is
 * stored in kmalloc_caches->node[node]. Any failure is fatal (BUG_ON).
 * Returns the new node structure.
 */
static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags,
								int node)
{
	struct page *page;
	struct kmem_cache_node *n;

	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));

	page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
	/*
	 * new_slab() may return with interrupts disabled (it calls
	 * local_irq_disable() when __GFP_WAIT is set), so enable them again.
	 */
	local_irq_enable();

	BUG_ON(!page);
	/* Take the first free object of the new slab by hand. */
	n = page->freelist;
	BUG_ON(!n);
	page->freelist = get_freepointer(kmalloc_caches, n);
	page->inuse++;
	kmalloc_caches->node[node] = n;
	init_object(kmalloc_caches, n, 1);
	init_kmem_cache_node(n);
	/* Count the slab here: init_kmem_cache_node() just zeroed nr_slabs. */
	atomic_long_inc(&n->nr_slabs);
	add_partial(n, page);
	return n;
}
1554
1555 static void free_kmem_cache_nodes(struct kmem_cache *s)
1556 {
1557         int node;
1558
1559         for_each_online_node(node) {
1560                 struct kmem_cache_node *n = s->node[node];
1561                 if (n && n != &s->local_node)
1562                         kmem_cache_free(kmalloc_caches, n);
1563                 s->node[node] = NULL;
1564         }
1565 }
1566
1567 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
1568 {
1569         int node;
1570         int local_node;
1571
1572         if (slab_state >= UP)
1573                 local_node = page_to_nid(virt_to_page(s));
1574         else
1575                 local_node = 0;
1576
1577         for_each_online_node(node) {
1578                 struct kmem_cache_node *n;
1579
1580                 if (local_node == node)
1581                         n = &s->local_node;
1582                 else {
1583                         if (slab_state == DOWN) {
1584                                 n = early_kmem_cache_node_alloc(gfpflags,
1585                                                                 node);
1586                                 continue;
1587                         }
1588                         n = kmem_cache_alloc_node(kmalloc_caches,
1589                                                         gfpflags, node);
1590
1591                         if (!n) {
1592                                 free_kmem_cache_nodes(s);
1593                                 return 0;
1594                         }
1595
1596                 }
1597                 s->node[node] = n;
1598                 init_kmem_cache_node(n);
1599         }
1600         return 1;
1601 }
1602 #else
/* Without NUMA the only node structure is embedded in @s: nothing to free. */
static void free_kmem_cache_nodes(struct kmem_cache *s)
{
}
1606
1607 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
1608 {
1609         init_kmem_cache_node(&s->local_node);
1610         return 1;
1611 }
1612 #endif
1613
1614 /*
1615  * calculate_sizes() determines the order and the distribution of data within
1616  * a slab object.
1617  */
1618 static int calculate_sizes(struct kmem_cache *s)
1619 {
1620         unsigned long flags = s->flags;
1621         unsigned long size = s->objsize;
1622         unsigned long align = s->align;
1623
1624         /*
1625          * Determine if we can poison the object itself. If the user of
1626          * the slab may touch the object after free or before allocation
1627          * then we should never poison the object itself.
1628          */
1629         if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
1630                         !s->ctor && !s->dtor)
1631                 s->flags |= __OBJECT_POISON;
1632         else
1633                 s->flags &= ~__OBJECT_POISON;
1634
1635         /*
1636          * Round up object size to the next word boundary. We can only
1637          * place the free pointer at word boundaries and this determines
1638          * the possible location of the free pointer.
1639          */
1640         size = ALIGN(size, sizeof(void *));
1641
1642         /*
1643          * If we are Redzoning then check if there is some space between the
1644          * end of the object and the free pointer. If not then add an
1645          * additional word to have some bytes to store Redzone information.
1646          */
1647         if ((flags & SLAB_RED_ZONE) && size == s->objsize)
1648                 size += sizeof(void *);
1649
1650         /*
1651          * With that we have determined the number of bytes in actual use
1652          * by the object. This is the potential offset to the free pointer.
1653          */
1654         s->inuse = size;
1655
1656         if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
1657                 s->ctor || s->dtor)) {
1658                 /*
1659                  * Relocate free pointer after the object if it is not
1660                  * permitted to overwrite the first word of the object on
1661                  * kmem_cache_free.
1662                  *
1663                  * This is the case if we do RCU, have a constructor or
1664                  * destructor or are poisoning the objects.
1665                  */
1666                 s->offset = size;
1667                 size += sizeof(void *);
1668         }
1669
1670         if (flags & SLAB_STORE_USER)
1671                 /*
1672                  * Need to store information about allocs and frees after
1673                  * the object.
1674                  */
1675                 size += 2 * sizeof(struct track);
1676
1677         if (flags & SLAB_RED_ZONE)
1678                 /*
1679                  * Add some empty padding so that we can catch
1680                  * overwrites from earlier objects rather than let
1681                  * tracking information or the free pointer be
1682                  * corrupted if an user writes before the start
1683                  * of the object.
1684                  */
1685                 size += sizeof(void *);
1686
1687         /*
1688          * Determine the alignment based on various parameters that the
1689          * user specified and the dynamic determination of cache line size
1690          * on bootup.
1691          */
1692         align = calculate_alignment(flags, align, s->objsize);
1693
1694         /*
1695          * SLUB stores one object immediately after another beginning from
1696          * offset 0. In order to align the objects we have to simply size
1697          * each object to conform to the alignment.
1698          */
1699         size = ALIGN(size, align);
1700         s->size = size;
1701
1702         s->order = calculate_order(size);
1703         if (s->order < 0)
1704                 return 0;
1705
1706         /*
1707          * Determine the number of objects per slab
1708          */
1709         s->objects = (PAGE_SIZE << s->order) / size;
1710
1711         /*
1712          * Verify that the number of objects is within permitted limits.
1713          * The page->inuse field is only 16 bit wide! So we cannot have
1714          * more than 64k objects per slab.
1715          */
1716         if (!s->objects || s->objects > 65535)
1717                 return 0;
1718         return 1;
1719
1720 }
1721
1722 static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
1723                 const char *name, size_t size,
1724                 size_t align, unsigned long flags,
1725                 void (*ctor)(void *, struct kmem_cache *, unsigned long),
1726                 void (*dtor)(void *, struct kmem_cache *, unsigned long))
1727 {
1728         memset(s, 0, kmem_size);
1729         s->name = name;
1730         s->ctor = ctor;
1731         s->dtor = dtor;
1732         s->objsize = size;
1733         s->flags = flags;
1734         s->align = align;
1735
1736         /*
1737          * The page->offset field is only 16 bit wide. This is an offset
1738          * in units of words from the beginning of an object. If the slab
1739          * size is bigger then we cannot move the free pointer behind the
1740          * object anymore.
1741          *
1742          * On 32 bit platforms the limit is 256k. On 64bit platforms
1743          * the limit is 512k.
1744          *
1745          * Debugging or ctor/dtors may create a need to move the free
1746          * pointer. Fail if this happens.
1747          */
1748         if (s->size >= 65535 * sizeof(void *)) {
1749                 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
1750                                 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
1751                 BUG_ON(ctor || dtor);
1752         }
1753         else
1754                 /*
1755                  * Enable debugging if selected on the kernel commandline.
1756                  */
1757                 if (slub_debug && (!slub_debug_slabs ||
1758                     strncmp(slub_debug_slabs, name,
1759                         strlen(slub_debug_slabs)) == 0))
1760                                 s->flags |= slub_debug;
1761
1762         if (!calculate_sizes(s))
1763                 goto error;
1764
1765         s->refcount = 1;
1766 #ifdef CONFIG_NUMA
1767         s->defrag_ratio = 100;
1768 #endif
1769
1770         if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
1771                 return 1;
1772 error:
1773         if (flags & SLAB_PANIC)
1774                 panic("Cannot create slab %s size=%lu realsize=%u "
1775                         "order=%u offset=%u flags=%lx\n",
1776                         s->name, (unsigned long)size, s->size, s->order,
1777                         s->offset, flags);
1778         return 0;
1779 }
1780 EXPORT_SYMBOL(kmem_cache_open);
1781
1782 /*
1783  * Check if a given pointer is valid
1784  */
1785 int kmem_ptr_validate(struct kmem_cache *s, const void *object)
1786 {
1787         struct page * page;
1788
1789         page = get_object_page(object);
1790
1791         if (!page || s != page->slab)
1792                 /* No slab or wrong slab */
1793                 return 0;
1794
1795         if (!check_valid_pointer(s, page, object))
1796                 return 0;
1797
1798         /*
1799          * We could also check if the object is on the slabs freelist.
1800          * But this would be too expensive and it seems that the main
1801          * purpose of kmem_ptr_valid is to check if the object belongs
1802          * to a certain slab.
1803          */
1804         return 1;
1805 }
1806 EXPORT_SYMBOL(kmem_ptr_validate);
1807
1808 /*
1809  * Determine the size of a slab object
1810  */
1811 unsigned int kmem_cache_size(struct kmem_cache *s)
1812 {
1813         return s->objsize;
1814 }
1815 EXPORT_SYMBOL(kmem_cache_size);
1816
1817 const char *kmem_cache_name(struct kmem_cache *s)
1818 {
1819         return s->name;
1820 }
1821 EXPORT_SYMBOL(kmem_cache_name);
1822
1823 /*
1824  * Attempt to free all slabs on a node. Return the number of slabs we
1825  * were unable to free.
1826  */
1827 static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
1828                         struct list_head *list)
1829 {
1830         int slabs_inuse = 0;
1831         unsigned long flags;
1832         struct page *page, *h;
1833
1834         spin_lock_irqsave(&n->list_lock, flags);
1835         list_for_each_entry_safe(page, h, list, lru)
1836                 if (!page->inuse) {
1837                         list_del(&page->lru);
1838                         discard_slab(s, page);
1839                 } else
1840                         slabs_inuse++;
1841         spin_unlock_irqrestore(&n->list_lock, flags);
1842         return slabs_inuse;
1843 }
1844
1845 /*
1846  * Release all resources used by a slab cache.
1847  */
1848 static int kmem_cache_close(struct kmem_cache *s)
1849 {
1850         int node;
1851
1852         flush_all(s);
1853
1854         /* Attempt to free all objects */
1855         for_each_online_node(node) {
1856                 struct kmem_cache_node *n = get_node(s, node);
1857
1858                 n->nr_partial -= free_list(s, n, &n->partial);
1859                 if (atomic_long_read(&n->nr_slabs))
1860                         return 1;
1861         }
1862         free_kmem_cache_nodes(s);
1863         return 0;
1864 }
1865
1866 /*
1867  * Close a cache and release the kmem_cache structure
1868  * (must be used for caches created using kmem_cache_create)
1869  */
1870 void kmem_cache_destroy(struct kmem_cache *s)
1871 {
1872         down_write(&slub_lock);
1873         s->refcount--;
1874         if (!s->refcount) {
1875                 list_del(&s->list);
1876                 if (kmem_cache_close(s))
1877                         WARN_ON(1);
1878                 sysfs_slab_remove(s);
1879                 kfree(s);
1880         }
1881         up_write(&slub_lock);
1882 }
1883 EXPORT_SYMBOL(kmem_cache_destroy);
1884
1885 /********************************************************************
1886  *              Kmalloc subsystem
1887  *******************************************************************/
1888
1889 struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned;
1890 EXPORT_SYMBOL(kmalloc_caches);
1891
1892 #ifdef CONFIG_ZONE_DMA
1893 static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1];
1894 #endif
1895
1896 static int __init setup_slub_min_order(char *str)
1897 {
1898         get_option (&str, &slub_min_order);
1899
1900         return 1;
1901 }
1902
1903 __setup("slub_min_order=", setup_slub_min_order);
1904
1905 static int __init setup_slub_max_order(char *str)
1906 {
1907         get_option (&str, &slub_max_order);
1908
1909         return 1;
1910 }
1911
1912 __setup("slub_max_order=", setup_slub_max_order);
1913
1914 static int __init setup_slub_min_objects(char *str)
1915 {
1916         get_option (&str, &slub_min_objects);
1917
1918         return 1;
1919 }
1920
1921 __setup("slub_min_objects=", setup_slub_min_objects);
1922
1923 static int __init setup_slub_nomerge(char *str)
1924 {
1925         slub_nomerge = 1;
1926         return 1;
1927 }
1928
1929 __setup("slub_nomerge", setup_slub_nomerge);
1930
1931 static int __init setup_slub_debug(char *str)
1932 {
1933         if (!str || *str != '=')
1934                 slub_debug = DEBUG_DEFAULT_FLAGS;
1935         else {
1936                 str++;
1937                 if (*str == 0 || *str == ',')
1938                         slub_debug = DEBUG_DEFAULT_FLAGS;
1939                 else
1940                 for( ;*str && *str != ','; str++)
1941                         switch (*str) {
1942                         case 'f' : case 'F' :
1943                                 slub_debug |= SLAB_DEBUG_FREE;
1944                                 break;
1945                         case 'z' : case 'Z' :
1946                                 slub_debug |= SLAB_RED_ZONE;
1947                                 break;
1948                         case 'p' : case 'P' :
1949                                 slub_debug |= SLAB_POISON;
1950                                 break;
1951                         case 'u' : case 'U' :
1952                                 slub_debug |= SLAB_STORE_USER;
1953                                 break;
1954                         case 't' : case 'T' :
1955                                 slub_debug |= SLAB_TRACE;
1956                                 break;
1957                         default:
1958                                 printk(KERN_ERR "slub_debug option '%c' "
1959                                         "unknown. skipped\n",*str);
1960                         }
1961         }
1962
1963         if (*str == ',')
1964                 slub_debug_slabs = str + 1;
1965         return 1;
1966 }
1967
1968 __setup("slub_debug", setup_slub_debug);
1969
1970 static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
1971                 const char *name, int size, gfp_t gfp_flags)
1972 {
1973         unsigned int flags = 0;
1974
1975         if (gfp_flags & SLUB_DMA)
1976                 flags = SLAB_CACHE_DMA;
1977
1978         down_write(&slub_lock);
1979         if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
1980                         flags, NULL, NULL))
1981                 goto panic;
1982
1983         list_add(&s->list, &slab_caches);
1984         up_write(&slub_lock);
1985         if (sysfs_slab_add(s))
1986                 goto panic;
1987         return s;
1988
1989 panic:
1990         panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
1991 }
1992
1993 static struct kmem_cache *get_slab(size_t size, gfp_t flags)
1994 {
1995         int index = kmalloc_index(size);
1996
1997         if (!index)
1998                 return NULL;
1999
2000         /* Allocation too large? */
2001         BUG_ON(index < 0);
2002
2003 #ifdef CONFIG_ZONE_DMA
2004         if ((flags & SLUB_DMA)) {
2005                 struct kmem_cache *s;
2006                 struct kmem_cache *x;
2007                 char *text;
2008                 size_t realsize;
2009
2010                 s = kmalloc_caches_dma[index];
2011                 if (s)
2012                         return s;
2013
2014                 /* Dynamically create dma cache */
2015                 x = kmalloc(kmem_size, flags & ~SLUB_DMA);
2016                 if (!x)
2017                         panic("Unable to allocate memory for dma cache\n");
2018
2019                 if (index <= KMALLOC_SHIFT_HIGH)
2020                         realsize = 1 << index;
2021                 else {
2022                         if (index == 1)
2023                                 realsize = 96;
2024                         else
2025                                 realsize = 192;
2026                 }
2027
2028                 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2029                                 (unsigned int)realsize);
2030                 s = create_kmalloc_cache(x, text, realsize, flags);
2031                 kmalloc_caches_dma[index] = s;
2032                 return s;
2033         }
2034 #endif
2035         return &kmalloc_caches[index];
2036 }
2037
2038 void *__kmalloc(size_t size, gfp_t flags)
2039 {
2040         struct kmem_cache *s = get_slab(size, flags);
2041
2042         if (s)
2043                 return slab_alloc(s, flags, -1, __builtin_return_address(0));
2044         return NULL;
2045 }
2046 EXPORT_SYMBOL(__kmalloc);
2047
2048 #ifdef CONFIG_NUMA
2049 void *__kmalloc_node(size_t size, gfp_t flags, int node)
2050 {
2051         struct kmem_cache *s = get_slab(size, flags);
2052
2053         if (s)
2054                 return slab_alloc(s, flags, node, __builtin_return_address(0));
2055         return NULL;
2056 }
2057 EXPORT_SYMBOL(__kmalloc_node);
2058 #endif
2059
2060 size_t ksize(const void *object)
2061 {
2062         struct page *page = get_object_page(object);
2063         struct kmem_cache *s;
2064
2065         BUG_ON(!page);
2066         s = page->slab;
2067         BUG_ON(!s);
2068
2069         /*
2070          * Debugging requires use of the padding between object
2071          * and whatever may come after it.
2072          */
2073         if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2074                 return s->objsize;
2075
2076         /*
2077          * If we have the need to store the freelist pointer
2078          * back there or track user information then we can
2079          * only use the space before that information.
2080          */
2081         if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2082                 return s->inuse;
2083
2084         /*
2085          * Else we can use all the padding etc for the allocation
2086          */
2087         return s->size;
2088 }
2089 EXPORT_SYMBOL(ksize);
2090
2091 void kfree(const void *x)
2092 {
2093         struct kmem_cache *s;
2094         struct page *page;
2095
2096         if (!x)
2097                 return;
2098
2099         page = virt_to_head_page(x);
2100         s = page->slab;
2101
2102         slab_free(s, page, (void *)x, __builtin_return_address(0));
2103 }
2104 EXPORT_SYMBOL(kfree);
2105
2106 /*
2107  * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2108  * the remaining slabs by the number of items in use. The slabs with the
2109  * most items in use come first. New allocations will then fill those up
2110  * and thus they can be removed from the partial lists.
2111  *
2112  * The slabs with the least items are placed last. This results in them
2113  * being allocated from last increasing the chance that the last objects
2114  * are freed in them.
2115  */
2116 int kmem_cache_shrink(struct kmem_cache *s)
2117 {
2118         int node;
2119         int i;
2120         struct kmem_cache_node *n;
2121         struct page *page;
2122         struct page *t;
2123         struct list_head *slabs_by_inuse =
2124                 kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
2125         unsigned long flags;
2126
2127         if (!slabs_by_inuse)
2128                 return -ENOMEM;
2129
2130         flush_all(s);
2131         for_each_online_node(node) {
2132                 n = get_node(s, node);
2133
2134                 if (!n->nr_partial)
2135                         continue;
2136
2137                 for (i = 0; i < s->objects; i++)
2138                         INIT_LIST_HEAD(slabs_by_inuse + i);
2139
2140                 spin_lock_irqsave(&n->list_lock, flags);
2141
2142                 /*
2143                  * Build lists indexed by the items in use in each slab.
2144                  *
2145                  * Note that concurrent frees may occur while we hold the
2146                  * list_lock. page->inuse here is the upper limit.
2147                  */
2148                 list_for_each_entry_safe(page, t, &n->partial, lru) {
2149                         if (!page->inuse && slab_trylock(page)) {
2150                                 /*
2151                                  * Must hold slab lock here because slab_free
2152                                  * may have freed the last object and be
2153                                  * waiting to release the slab.
2154                                  */
2155                                 list_del(&page->lru);
2156                                 n->nr_partial--;
2157                                 slab_unlock(page);
2158                                 discard_slab(s, page);
2159                         } else {
2160                                 if (n->nr_partial > MAX_PARTIAL)
2161                                         list_move(&page->lru,
2162                                         slabs_by_inuse + page->inuse);
2163                         }
2164                 }
2165
2166                 if (n->nr_partial <= MAX_PARTIAL)
2167                         goto out;
2168
2169                 /*
2170                  * Rebuild the partial list with the slabs filled up most
2171                  * first and the least used slabs at the end.
2172                  */
2173                 for (i = s->objects - 1; i >= 0; i--)
2174                         list_splice(slabs_by_inuse + i, n->partial.prev);
2175
2176         out:
2177                 spin_unlock_irqrestore(&n->list_lock, flags);
2178         }
2179
2180         kfree(slabs_by_inuse);
2181         return 0;
2182 }
2183 EXPORT_SYMBOL(kmem_cache_shrink);
2184
2185 /**
2186  * krealloc - reallocate memory. The contents will remain unchanged.
2187  *
2188  * @p: object to reallocate memory for.
2189  * @new_size: how many bytes of memory are required.
2190  * @flags: the type of memory to allocate.
2191  *
2192  * The contents of the object pointed to are preserved up to the
2193  * lesser of the new and old sizes.  If @p is %NULL, krealloc()
2194  * behaves exactly like kmalloc().  If @size is 0 and @p is not a
2195  * %NULL pointer, the object pointed to is freed.
2196  */
2197 void *krealloc(const void *p, size_t new_size, gfp_t flags)
2198 {
2199         void *ret;
2200         size_t ks;
2201
2202         if (unlikely(!p))
2203                 return kmalloc(new_size, flags);
2204
2205         if (unlikely(!new_size)) {
2206                 kfree(p);
2207                 return NULL;
2208         }
2209
2210         ks = ksize(p);
2211         if (ks >= new_size)
2212                 return (void *)p;
2213
2214         ret = kmalloc(new_size, flags);
2215         if (ret) {
2216                 memcpy(ret, p, min(new_size, ks));
2217                 kfree(p);
2218         }
2219         return ret;
2220 }
2221 EXPORT_SYMBOL(krealloc);
2222
2223 /********************************************************************
2224  *                      Basic setup of slabs
2225  *******************************************************************/
2226
2227 void __init kmem_cache_init(void)
2228 {
2229         int i;
2230
2231 #ifdef CONFIG_NUMA
2232         /*
2233          * Must first have the slab cache available for the allocations of the
2234          * struct kmem_cache_node's. There is special bootstrap code in
2235          * kmem_cache_open for slab_state == DOWN.
2236          */
2237         create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2238                 sizeof(struct kmem_cache_node), GFP_KERNEL);
2239 #endif
2240
2241         /* Able to allocate the per node structures */
2242         slab_state = PARTIAL;
2243
2244         /* Caches that are not of the two-to-the-power-of size */
2245         create_kmalloc_cache(&kmalloc_caches[1],
2246                                 "kmalloc-96", 96, GFP_KERNEL);
2247         create_kmalloc_cache(&kmalloc_caches[2],
2248                                 "kmalloc-192", 192, GFP_KERNEL);
2249
2250         for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
2251                 create_kmalloc_cache(&kmalloc_caches[i],
2252                         "kmalloc", 1 << i, GFP_KERNEL);
2253
2254         slab_state = UP;
2255
2256         /* Provide the correct kmalloc names now that the caches are up */
2257         for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
2258                 kmalloc_caches[i]. name =
2259                         kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2260
2261 #ifdef CONFIG_SMP
2262         register_cpu_notifier(&slab_notifier);
2263 #endif
2264
2265         if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */
2266                 kmem_size = offsetof(struct kmem_cache, cpu_slab)
2267                          + nr_cpu_ids * sizeof(struct page *);
2268
2269         printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2270                 " Processors=%d, Nodes=%d\n",
2271                 KMALLOC_SHIFT_HIGH, cache_line_size(),
2272                 slub_min_order, slub_max_order, slub_min_objects,
2273                 nr_cpu_ids, nr_node_ids);
2274 }
2275
2276 /*
2277  * Find a mergeable slab cache
2278  */
2279 static int slab_unmergeable(struct kmem_cache *s)
2280 {
2281         if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
2282                 return 1;
2283
2284         if (s->ctor || s->dtor)
2285                 return 1;
2286
2287         return 0;
2288 }
2289
2290 static struct kmem_cache *find_mergeable(size_t size,
2291                 size_t align, unsigned long flags,
2292                 void (*ctor)(void *, struct kmem_cache *, unsigned long),
2293                 void (*dtor)(void *, struct kmem_cache *, unsigned long))
2294 {
2295         struct list_head *h;
2296
2297         if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
2298                 return NULL;
2299
2300         if (ctor || dtor)
2301                 return NULL;
2302
2303         size = ALIGN(size, sizeof(void *));
2304         align = calculate_alignment(flags, align, size);
2305         size = ALIGN(size, align);
2306
2307         list_for_each(h, &slab_caches) {
2308                 struct kmem_cache *s =
2309                         container_of(h, struct kmem_cache, list);
2310
2311                 if (slab_unmergeable(s))
2312                         continue;
2313
2314                 if (size > s->size)
2315                         continue;
2316
2317                 if (((flags | slub_debug) & SLUB_MERGE_SAME) !=
2318                         (s->flags & SLUB_MERGE_SAME))
2319                                 continue;
2320                 /*
2321                  * Check if alignment is compatible.
2322                  * Courtesy of Adrian Drzewiecki
2323                  */
2324                 if ((s->size & ~(align -1)) != s->size)
2325                         continue;
2326
2327                 if (s->size - size >= sizeof(void *))
2328                         continue;
2329
2330                 return s;
2331         }
2332         return NULL;
2333 }
2334
2335 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
2336                 size_t align, unsigned long flags,
2337                 void (*ctor)(void *, struct kmem_cache *, unsigned long),
2338                 void (*dtor)(void *, struct kmem_cache *, unsigned long))
2339 {
2340         struct kmem_cache *s;
2341
2342         down_write(&slub_lock);
2343         s = find_mergeable(size, align, flags, dtor, ctor);
2344         if (s) {
2345                 s->refcount++;
2346                 /*
2347                  * Adjust the object sizes so that we clear
2348                  * the complete object on kzalloc.
2349                  */
2350                 s->objsize = max(s->objsize, (int)size);
2351                 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
2352                 if (sysfs_slab_alias(s, name))
2353                         goto err;
2354         } else {
2355                 s = kmalloc(kmem_size, GFP_KERNEL);
2356                 if (s && kmem_cache_open(s, GFP_KERNEL, name,
2357                                 size, align, flags, ctor, dtor)) {
2358                         if (sysfs_slab_add(s)) {
2359                                 kfree(s);
2360                                 goto err;
2361                         }
2362                         list_add(&s->list, &slab_caches);
2363                 } else
2364                         kfree(s);
2365         }
2366         up_write(&slub_lock);
2367         return s;
2368
2369 err:
2370         up_write(&slub_lock);
2371         if (flags & SLAB_PANIC)
2372                 panic("Cannot create slabcache %s\n", name);
2373         else
2374                 s = NULL;
2375         return s;
2376 }
2377 EXPORT_SYMBOL(kmem_cache_create);
2378
2379 void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags)
2380 {
2381         void *x;
2382
2383         x = slab_alloc(s, flags, -1, __builtin_return_address(0));
2384         if (x)
2385                 memset(x, 0, s->objsize);
2386         return x;
2387 }
2388 EXPORT_SYMBOL(kmem_cache_zalloc);
2389
2390 #ifdef CONFIG_SMP
2391 static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu)
2392 {
2393         struct list_head *h;
2394
2395         down_read(&slub_lock);
2396         list_for_each(h, &slab_caches) {
2397                 struct kmem_cache *s =
2398                         container_of(h, struct kmem_cache, list);
2399
2400                 func(s, cpu);
2401         }
2402         up_read(&slub_lock);
2403 }
2404
2405 /*
2406  * Use the cpu notifier to insure that the cpu slabs are flushed when
2407  * necessary.
2408  */
2409 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2410                 unsigned long action, void *hcpu)
2411 {
2412         long cpu = (long)hcpu;
2413
2414         switch (action) {
2415         case CPU_UP_CANCELED:
2416         case CPU_DEAD:
2417                 for_all_slabs(__flush_cpu_slab, cpu);
2418                 break;
2419         default:
2420                 break;
2421         }
2422         return NOTIFY_OK;
2423 }
2424
2425 static struct notifier_block __cpuinitdata slab_notifier =
2426         { &slab_cpuup_callback, NULL, 0 };
2427
2428 #endif
2429
2430 #ifdef CONFIG_NUMA
2431
2432 /*****************************************************************
2433  * Generic reaper used to support the page allocator
2434  * (the cpu slabs are reaped by a per slab workqueue).
2435  *
2436  * Maybe move this to the page allocator?
2437  ****************************************************************/
2438
2439 static DEFINE_PER_CPU(unsigned long, reap_node);
2440
2441 static void init_reap_node(int cpu)
2442 {
2443         int node;
2444
2445         node = next_node(cpu_to_node(cpu), node_online_map);
2446         if (node == MAX_NUMNODES)
2447                 node = first_node(node_online_map);
2448
2449         __get_cpu_var(reap_node) = node;
2450 }
2451
2452 static void next_reap_node(void)
2453 {
2454         int node = __get_cpu_var(reap_node);
2455
2456         /*
2457          * Also drain per cpu pages on remote zones
2458          */
2459         if (node != numa_node_id())
2460                 drain_node_pages(node);
2461
2462         node = next_node(node, node_online_map);
2463         if (unlikely(node >= MAX_NUMNODES))
2464                 node = first_node(node_online_map);
2465         __get_cpu_var(reap_node) = node;
2466 }
2467 #else
/* Without CONFIG_NUMA there are no remote nodes to reap: empty stubs */
#define init_reap_node(cpu) do { } while (0)
#define next_reap_node(void) do { } while (0)
2470 #endif
2471
/* Delay, in jiffies (two seconds), between two runs of cache_reap() */
#define REAPTIMEOUT_CPUC	(2 * HZ)
2473
2474 #ifdef CONFIG_SMP
2475 static DEFINE_PER_CPU(struct delayed_work, reap_work);
2476
2477 static void cache_reap(struct work_struct *unused)
2478 {
2479         next_reap_node();
2480         refresh_cpu_vm_stats(smp_processor_id());
2481         schedule_delayed_work(&__get_cpu_var(reap_work),
2482                                       REAPTIMEOUT_CPUC);
2483 }
2484
2485 static void __devinit start_cpu_timer(int cpu)
2486 {
2487         struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
2488
2489         /*
2490          * When this gets called from do_initcalls via cpucache_init(),
2491          * init_workqueues() has already run, so keventd will be setup
2492          * at that time.
2493          */
2494         if (keventd_up() && reap_work->work.func == NULL) {
2495                 init_reap_node(cpu);
2496                 INIT_DELAYED_WORK(reap_work, cache_reap);
2497                 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
2498         }
2499 }
2500
2501 static int __init cpucache_init(void)
2502 {
2503         int cpu;
2504
2505         /*
2506          * Register the timers that drain pcp pages and update vm statistics
2507          */
2508         for_each_online_cpu(cpu)
2509                 start_cpu_timer(cpu);
2510         return 0;
2511 }
2512 __initcall(cpucache_init);
2513 #endif
2514
2515 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2516 {
2517         struct kmem_cache *s = get_slab(size, gfpflags);
2518
2519         if (!s)
2520                 return NULL;
2521
2522         return slab_alloc(s, gfpflags, -1, caller);
2523 }
2524
2525 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2526                                         int node, void *caller)
2527 {
2528         struct kmem_cache *s = get_slab(size, gfpflags);
2529
2530         if (!s)
2531                 return NULL;
2532
2533         return slab_alloc(s, gfpflags, node, caller);
2534 }
2535
2536 #ifdef CONFIG_SYSFS
2537
2538 static int validate_slab(struct kmem_cache *s, struct page *page)
2539 {
2540         void *p;
2541         void *addr = page_address(page);
2542         DECLARE_BITMAP(map, s->objects);
2543
2544         if (!check_slab(s, page) ||
2545                         !on_freelist(s, page, NULL))
2546                 return 0;
2547
2548         /* Now we know that a valid freelist exists */
2549         bitmap_zero(map, s->objects);
2550
2551         for_each_free_object(p, s, page->freelist) {
2552                 set_bit(slab_index(p, s, addr), map);
2553                 if (!check_object(s, page, p, 0))
2554                         return 0;
2555         }
2556
2557         for_each_object(p, s, addr)
2558                 if (!test_bit(slab_index(p, s, addr), map))
2559                         if (!check_object(s, page, p, 1))
2560                                 return 0;
2561         return 1;
2562 }
2563
2564 static void validate_slab_slab(struct kmem_cache *s, struct page *page)
2565 {
2566         if (slab_trylock(page)) {
2567                 validate_slab(s, page);
2568                 slab_unlock(page);
2569         } else
2570                 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
2571                         s->name, page);
2572
2573         if (s->flags & DEBUG_DEFAULT_FLAGS) {
2574                 if (!PageError(page))
2575                         printk(KERN_ERR "SLUB %s: PageError not set "
2576                                 "on slab 0x%p\n", s->name, page);
2577         } else {
2578                 if (PageError(page))
2579                         printk(KERN_ERR "SLUB %s: PageError set on "
2580                                 "slab 0x%p\n", s->name, page);
2581         }
2582 }
2583
2584 static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
2585 {
2586         unsigned long count = 0;
2587         struct page *page;
2588         unsigned long flags;
2589
2590         spin_lock_irqsave(&n->list_lock, flags);
2591
2592         list_for_each_entry(page, &n->partial, lru) {
2593                 validate_slab_slab(s, page);
2594                 count++;
2595         }
2596         if (count != n->nr_partial)
2597                 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
2598                         "counter=%ld\n", s->name, count, n->nr_partial);
2599
2600         if (!(s->flags & SLAB_STORE_USER))
2601                 goto out;
2602
2603         list_for_each_entry(page, &n->full, lru) {
2604                 validate_slab_slab(s, page);
2605                 count++;
2606         }
2607         if (count != atomic_long_read(&n->nr_slabs))
2608                 printk(KERN_ERR "SLUB: %s %ld slabs counted but "
2609                         "counter=%ld\n", s->name, count,
2610                         atomic_long_read(&n->nr_slabs));
2611
2612 out:
2613         spin_unlock_irqrestore(&n->list_lock, flags);
2614         return count;
2615 }
2616
2617 static unsigned long validate_slab_cache(struct kmem_cache *s)
2618 {
2619         int node;
2620         unsigned long count = 0;
2621
2622         flush_all(s);
2623         for_each_online_node(node) {
2624                 struct kmem_cache_node *n = get_node(s, node);
2625
2626                 count += validate_slab_node(s, n);
2627         }
2628         return count;
2629 }
2630
#ifdef SLUB_RESILIENCY_TEST
/*
 * Deliberately corrupt kmalloc objects, both while allocated and after
 * they were freed, and run the validator on the affected cache after
 * each step. The out of bounds and use-after-free writes below are the
 * whole point of the test; the allocations are intentionally neither
 * checked nor released.
 */
static void resiliency_test(void)
{
	u8 *p;

	printk(KERN_ERR "SLUB resiliency testing\n");
	printk(KERN_ERR "-----------------------\n");
	printk(KERN_ERR "A. Corruption after allocation\n");

	/* A1: write one byte past the end of a 16 byte object */
	p = kzalloc(16, GFP_KERNEL);
	p[16] = 0x12;
	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
			" 0x12->0x%p\n\n", p + 16);
	validate_slab_cache(&kmalloc_caches[4]);

	/* Hmmm... The next two are dangerous */

	/* A2: write further behind a 32 byte object */
	p = kzalloc(32, GFP_KERNEL);
	p[32 + sizeof(void *)] = 0x34;
	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
			" 0x34 -> -0x%p\n", p);
	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
	validate_slab_cache(&kmalloc_caches[5]);

	/* A3: write at a varying distance behind a 64 byte object */
	p = kzalloc(64, GFP_KERNEL);
	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
	*p = 0x56;
	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
			p);
	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
	validate_slab_cache(&kmalloc_caches[6]);

	printk(KERN_ERR "\nB. Corruption after free\n");

	/* B1: overwrite the first byte of a freed 128 byte object */
	p = kzalloc(128, GFP_KERNEL);
	kfree(p);
	*p = 0x78;
	printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
	validate_slab_cache(&kmalloc_caches[7]);

	/* B2: overwrite a byte inside a freed 256 byte object */
	p = kzalloc(256, GFP_KERNEL);
	kfree(p);
	p[50] = 0x9a;
	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
	validate_slab_cache(&kmalloc_caches[8]);

	/* B3: write one byte past the end of a freed 512 byte object */
	p = kzalloc(512, GFP_KERNEL);
	kfree(p);
	p[512] = 0xab;
	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
	validate_slab_cache(&kmalloc_caches[9]);
}
#else
/* Resiliency testing not compiled in: nothing to do */
static void resiliency_test(void)
{
}
#endif
2685
/*
 * Generate lists of code addresses where slabcache objects are allocated
 * and freed.
 */

/* One distinct code address and how often it was recorded. */
struct location {
	unsigned long count;	/* number of objects recorded for addr */
	void *addr;		/* code address taken from the object's track */
};

/* Growable array of locations, kept ordered by addr (see add_location). */
struct loc_track {
	unsigned long max;	/* capacity of loc[] in entries, 0 = none yet */
	unsigned long count;	/* entries of loc[] currently in use */
	struct location *loc;	/* page allocated array of max entries */
};
2701
2702 static void free_loc_track(struct loc_track *t)
2703 {
2704         if (t->max)
2705                 free_pages((unsigned long)t->loc,
2706                         get_order(sizeof(struct location) * t->max));
2707 }
2708
2709 static int alloc_loc_track(struct loc_track *t, unsigned long max)
2710 {
2711         struct location *l;
2712         int order;
2713
2714         if (!max)
2715                 max = PAGE_SIZE / sizeof(struct location);
2716
2717         order = get_order(sizeof(struct location) * max);
2718
2719         l = (void *)__get_free_pages(GFP_KERNEL, order);
2720
2721         if (!l)
2722                 return 0;
2723
2724         if (t->count) {
2725                 memcpy(l, t->loc, sizeof(struct location) * t->count);
2726                 free_loc_track(t);
2727         }
2728         t->max = max;
2729         t->loc = l;
2730         return 1;
2731 }
2732
2733 static int add_location(struct loc_track *t, struct kmem_cache *s,
2734                                                 void *addr)
2735 {
2736         long start, end, pos;
2737         struct location *l;
2738         void *caddr;
2739
2740         start = -1;
2741         end = t->count;
2742
2743         for ( ; ; ) {
2744                 pos = start + (end - start + 1) / 2;
2745
2746                 /*
2747                  * There is nothing at "end". If we end up there
2748                  * we need to add something to before end.
2749                  */
2750                 if (pos == end)
2751                         break;
2752
2753                 caddr = t->loc[pos].addr;
2754                 if (addr == caddr) {
2755                         t->loc[pos].count++;
2756                         return 1;
2757                 }
2758
2759                 if (addr < caddr)
2760                         end = pos;
2761                 else
2762                         start = pos;
2763         }
2764
2765         /*
2766          * Not found. Insert new tracking element.
2767          */
2768         if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
2769                 return 0;
2770
2771         l = t->loc + pos;
2772         if (pos < t->count)
2773                 memmove(l + 1, l,
2774                         (t->count - pos) * sizeof(struct location));
2775         t->count++;
2776         l->count = 1;
2777         l->addr = addr;
2778         return 1;
2779 }
2780
2781 static void process_slab(struct loc_track *t, struct kmem_cache *s,
2782                 struct page *page, enum track_item alloc)
2783 {
2784         void *addr = page_address(page);
2785         DECLARE_BITMAP(map, s->objects);
2786         void *p;
2787
2788         bitmap_zero(map, s->objects);
2789         for_each_free_object(p, s, page->freelist)
2790                 set_bit(slab_index(p, s, addr), map);
2791
2792         for_each_object(p, s, addr)
2793                 if (!test_bit(slab_index(p, s, addr), map)) {
2794                         void *addr = get_track(s, p, alloc)->addr;
2795
2796                         add_location(t, s, addr);
2797                 }
2798 }
2799
2800 static int list_locations(struct kmem_cache *s, char *buf,
2801                                         enum track_item alloc)
2802 {
2803         int n = 0;
2804         unsigned long i;
2805         struct loc_track t;
2806         int node;
2807
2808         t.count = 0;
2809         t.max = 0;
2810
2811         /* Push back cpu slabs */
2812         flush_all(s);
2813
2814         for_each_online_node(node) {
2815                 struct kmem_cache_node *n = get_node(s, node);
2816                 unsigned long flags;
2817                 struct page *page;
2818
2819                 if (!atomic_read(&n->nr_slabs))
2820                         continue;
2821
2822                 spin_lock_irqsave(&n->list_lock, flags);
2823                 list_for_each_entry(page, &n->partial, lru)
2824                         process_slab(&t, s, page, alloc);
2825                 list_for_each_entry(page, &n->full, lru)
2826                         process_slab(&t, s, page, alloc);
2827                 spin_unlock_irqrestore(&n->list_lock, flags);
2828         }
2829
2830         for (i = 0; i < t.count; i++) {
2831                 void *addr = t.loc[i].addr;
2832
2833                 if (n > PAGE_SIZE - 100)
2834                         break;
2835                 n += sprintf(buf + n, "%7ld ", t.loc[i].count);
2836                 if (addr)
2837                         n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr);
2838                 else
2839                         n += sprintf(buf + n, "<not-available>");
2840                 n += sprintf(buf + n, "\n");
2841         }
2842
2843         free_loc_track(&t);
2844         if (!t.count)
2845                 n += sprintf(buf, "No data\n");
2846         return n;
2847 }
2848
2849 static unsigned long count_partial(struct kmem_cache_node *n)
2850 {
2851         unsigned long flags;
2852         unsigned long x = 0;
2853         struct page *page;
2854
2855         spin_lock_irqsave(&n->list_lock, flags);
2856         list_for_each_entry(page, &n->partial, lru)
2857                 x += page->inuse;
2858         spin_unlock_irqrestore(&n->list_lock, flags);
2859         return x;
2860 }
2861
/* Bit numbers selecting what slab_objects() includes in its result. */
enum slab_stat_type {
	SL_FULL,	/* full slabs */
	SL_PARTIAL,	/* slabs on the partial lists */
	SL_CPU,		/* per cpu slabs */
	SL_OBJECTS	/* report objects instead of slabs */
};

/* Flag masks derived from the bit numbers above. */
#define SO_FULL		(1 << SL_FULL)
#define SO_PARTIAL	(1 << SL_PARTIAL)
#define SO_CPU		(1 << SL_CPU)
#define SO_OBJECTS	(1 << SL_OBJECTS)
2873
2874 static unsigned long slab_objects(struct kmem_cache *s,
2875                         char *buf, unsigned long flags)
2876 {
2877         unsigned long total = 0;
2878         int cpu;
2879         int node;
2880         int x;
2881         unsigned long *nodes;
2882         unsigned long *per_cpu;
2883
2884         nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
2885         per_cpu = nodes + nr_node_ids;
2886
2887         for_each_possible_cpu(cpu) {
2888                 struct page *page = s->cpu_slab[cpu];
2889                 int node;
2890
2891                 if (page) {
2892                         node = page_to_nid(page);
2893                         if (flags & SO_CPU) {
2894                                 int x = 0;
2895
2896                                 if (flags & SO_OBJECTS)
2897                                         x = page->inuse;
2898                                 else
2899                                         x = 1;
2900                                 total += x;
2901                                 nodes[node] += x;
2902                         }
2903                         per_cpu[node]++;
2904                 }
2905         }
2906
2907         for_each_online_node(node) {
2908                 struct kmem_cache_node *n = get_node(s, node);
2909
2910                 if (flags & SO_PARTIAL) {
2911                         if (flags & SO_OBJECTS)
2912                                 x = count_partial(n);
2913                         else
2914                                 x = n->nr_partial;
2915                         total += x;
2916                         nodes[node] += x;
2917                 }
2918
2919                 if (flags & SO_FULL) {
2920                         int full_slabs = atomic_read(&n->nr_slabs)
2921                                         - per_cpu[node]
2922                                         - n->nr_partial;
2923
2924                         if (flags & SO_OBJECTS)
2925                                 x = full_slabs * s->objects;
2926                         else
2927                                 x = full_slabs;
2928                         total += x;
2929                         nodes[node] += x;
2930                 }
2931         }
2932
2933         x = sprintf(buf, "%lu", total);
2934 #ifdef CONFIG_NUMA
2935         for_each_online_node(node)
2936                 if (nodes[node])
2937                         x += sprintf(buf + x, " N%d=%lu",
2938                                         node, nodes[node]);
2939 #endif
2940         kfree(nodes);
2941         return x + sprintf(buf + x, "\n");
2942 }
2943
2944 static int any_slab_objects(struct kmem_cache *s)
2945 {
2946         int node;
2947         int cpu;
2948
2949         for_each_possible_cpu(cpu)
2950                 if (s->cpu_slab[cpu])
2951                         return 1;
2952
2953         for_each_node(node) {
2954                 struct kmem_cache_node *n = get_node(s, node);
2955
2956                 if (n->nr_partial || atomic_read(&n->nr_slabs))
2957                         return 1;
2958         }
2959         return 0;
2960 }
2961
2962 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
2963 #define to_slab(n) container_of(n, struct kmem_cache, kobj);
2964
2965 struct slab_attribute {
2966         struct attribute attr;
2967         ssize_t (*show)(struct kmem_cache *s, char *buf);
2968         ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
2969 };
2970
2971 #define SLAB_ATTR_RO(_name) \
2972         static struct slab_attribute _name##_attr = __ATTR_RO(_name)
2973
2974 #define SLAB_ATTR(_name) \
2975         static struct slab_attribute _name##_attr =  \
2976         __ATTR(_name, 0644, _name##_show, _name##_store)
2977
2978 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
2979 {
2980         return sprintf(buf, "%d\n", s->size);
2981 }
2982 SLAB_ATTR_RO(slab_size);
2983
2984 static ssize_t align_show(struct kmem_cache *s, char *buf)
2985 {
2986         return sprintf(buf, "%d\n", s->align);
2987 }
2988 SLAB_ATTR_RO(align);
2989
2990 static ssize_t object_size_show(struct kmem_cache *s, char *buf)
2991 {
2992         return sprintf(buf, "%d\n", s->objsize);
2993 }
2994 SLAB_ATTR_RO(object_size);
2995
2996 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
2997 {
2998         return sprintf(buf, "%d\n", s->objects);
2999 }
3000 SLAB_ATTR_RO(objs_per_slab);
3001
3002 static ssize_t order_show(struct kmem_cache *s, char *buf)
3003 {
3004         return sprintf(buf, "%d\n", s->order);
3005 }
3006 SLAB_ATTR_RO(order);
3007
3008 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3009 {
3010         if (s->ctor) {
3011                 int n = sprint_symbol(buf, (unsigned long)s->ctor);
3012
3013                 return n + sprintf(buf + n, "\n");
3014         }
3015         return 0;
3016 }
3017 SLAB_ATTR_RO(ctor);
3018
3019 static ssize_t dtor_show(struct kmem_cache *s, char *buf)
3020 {
3021         if (s->dtor) {
3022                 int n = sprint_symbol(buf, (unsigned long)s->dtor);
3023
3024                 return n + sprintf(buf + n, "\n");
3025         }
3026         return 0;
3027 }
3028 SLAB_ATTR_RO(dtor);
3029
3030 static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3031 {
3032         return sprintf(buf, "%d\n", s->refcount - 1);
3033 }
3034 SLAB_ATTR_RO(aliases);
3035
3036 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3037 {
3038         return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
3039 }
3040 SLAB_ATTR_RO(slabs);
3041
3042 static ssize_t partial_show(struct kmem_cache *s, char *buf)
3043 {
3044         return slab_objects(s, buf, SO_PARTIAL);
3045 }
3046 SLAB_ATTR_RO(partial);
3047
3048 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
3049 {
3050         return slab_objects(s, buf, SO_CPU);
3051 }
3052 SLAB_ATTR_RO(cpu_slabs);
3053
3054 static ssize_t objects_show(struct kmem_cache *s, char *buf)
3055 {
3056         return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
3057 }
3058 SLAB_ATTR_RO(objects);
3059
3060 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3061 {
3062         return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
3063 }
3064
3065 static ssize_t sanity_checks_store(struct kmem_cache *s,
3066                                 const char *buf, size_t length)
3067 {
3068         s->flags &= ~SLAB_DEBUG_FREE;
3069         if (buf[0] == '1')
3070                 s->flags |= SLAB_DEBUG_FREE;
3071         return length;
3072 }
3073 SLAB_ATTR(sanity_checks);
3074
3075 static ssize_t trace_show(struct kmem_cache *s, char *buf)
3076 {
3077         return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
3078 }
3079
3080 static ssize_t trace_store(struct kmem_cache *s, const char *buf,
3081                                                         size_t length)
3082 {
3083         s->flags &= ~SLAB_TRACE;
3084         if (buf[0] == '1')
3085                 s->flags |= SLAB_TRACE;
3086         return length;
3087 }
3088 SLAB_ATTR(trace);
3089
3090 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3091 {
3092         return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3093 }
3094
3095 static ssize_t reclaim_account_store(struct kmem_cache *s,
3096                                 const char *buf, size_t length)
3097 {
3098         s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3099         if (buf[0] == '1')
3100                 s->flags |= SLAB_RECLAIM_ACCOUNT;
3101         return length;
3102 }
3103 SLAB_ATTR(reclaim_account);
3104
3105 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3106 {
3107         return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
3108 }
3109 SLAB_ATTR_RO(hwcache_align);
3110
#ifdef CONFIG_ZONE_DMA
/* sysfs "cache_dma" (read-only): 1 if SLAB_CACHE_DMA is set, else 0. */
static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
	int enabled = (s->flags & SLAB_CACHE_DMA) ? 1 : 0;

	return sprintf(buf, "%d\n", enabled);
}
SLAB_ATTR_RO(cache_dma);
#endif
3118
3119 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3120 {
3121         return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3122 }
3123 SLAB_ATTR_RO(destroy_by_rcu);
3124
3125 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3126 {
3127         return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3128 }
3129
3130 static ssize_t red_zone_store(struct kmem_cache *s,
3131                                 const char *buf, size_t length)
3132 {
3133         if (any_slab_objects(s))
3134                 return -EBUSY;
3135
3136         s->flags &= ~SLAB_RED_ZONE;
3137         if (buf[0] == '1')
3138                 s->flags |= SLAB_RED_ZONE;
3139         calculate_sizes(s);
3140         return length;
3141 }
3142 SLAB_ATTR(red_zone);
3143
3144 static ssize_t poison_show(struct kmem_cache *s, char *buf)
3145 {
3146         return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3147 }
3148
3149 static ssize_t poison_store(struct kmem_cache *s,
3150                                 const char *buf, size_t length)
3151 {
3152         if (any_slab_objects(s))
3153                 return -EBUSY;
3154
3155         s->flags &= ~SLAB_POISON;
3156         if (buf[0] == '1')
3157                 s->flags |= SLAB_POISON;
3158         calculate_sizes(s);
3159         return length;
3160 }
3161 SLAB_ATTR(poison);
3162
3163 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3164 {
3165         return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3166 }
3167
3168 static ssize_t store_user_store(struct kmem_cache *s,
3169                                 const char *buf, size_t length)
3170 {
3171         if (any_slab_objects(s))
3172                 return -EBUSY;
3173
3174         s->flags &= ~SLAB_STORE_USER;
3175         if (buf[0] == '1')
3176                 s->flags |= SLAB_STORE_USER;
3177         calculate_sizes(s);
3178         return length;
3179 }
3180 SLAB_ATTR(store_user);
3181
3182 static ssize_t validate_show(struct kmem_cache *s, char *buf)
3183 {
3184         return 0;
3185 }
3186
3187 static ssize_t validate_store(struct kmem_cache *s,
3188                         const char *buf, size_t length)
3189 {
3190         if (buf[0] == '1')
3191                 validate_slab_cache(s);
3192         else
3193                 return -EINVAL;
3194         return length;
3195 }
3196 SLAB_ATTR(validate);
3197
3198 static ssize_t shrink_show(struct kmem_cache *s, char *buf)
3199 {
3200         return 0;
3201 }
3202
3203 static ssize_t shrink_store(struct kmem_cache *s,
3204                         const char *buf, size_t length)
3205 {
3206         if (buf[0] == '1') {
3207                 int rc = kmem_cache_shrink(s);
3208
3209                 if (rc)
3210                         return rc;
3211         } else
3212                 return -EINVAL;
3213         return length;
3214 }
3215 SLAB_ATTR(shrink);
3216
3217 static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
3218 {
3219         if (!(s->flags & SLAB_STORE_USER))
3220                 return -ENOSYS;
3221         return list_locations(s, buf, TRACK_ALLOC);
3222 }
3223 SLAB_ATTR_RO(alloc_calls);
3224
3225 static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
3226 {
3227         if (!(s->flags & SLAB_STORE_USER))
3228                 return -ENOSYS;
3229         return list_locations(s, buf, TRACK_FREE);
3230 }
3231 SLAB_ATTR_RO(free_calls);
3232
#ifdef CONFIG_NUMA
/* sysfs "defrag_ratio": prints s->defrag_ratio divided by ten. */
static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", s->defrag_ratio / 10);
}

/*
 * Parse a decimal number and, if it is below 100, store ten times its
 * value in s->defrag_ratio. Other values are silently ignored; the whole
 * input is always reported as consumed.
 *
 * The number is kept unsigned so that huge inputs cannot wrap around to
 * a negative value and slip through the range check.
 */
static ssize_t defrag_ratio_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	unsigned long ratio = simple_strtoul(buf, NULL, 10);

	if (ratio < 100)
		s->defrag_ratio = ratio * 10;
	return length;
}
SLAB_ATTR(defrag_ratio);
#endif
3250
3251 static struct attribute * slab_attrs[] = {
3252         &slab_size_attr.attr,
3253         &object_size_attr.attr,
3254         &objs_per_slab_attr.attr,
3255         &order_attr.attr,
3256         &objects_attr.attr,
3257         &slabs_attr.attr,
3258         &partial_attr.attr,
3259         &cpu_slabs_attr.attr,
3260         &ctor_attr.attr,
3261         &dtor_attr.attr,
3262         &aliases_attr.attr,
3263         &align_attr.attr,
3264         &sanity_checks_attr.attr,
3265         &trace_attr.attr,
3266         &hwcache_align_attr.attr,
3267         &reclaim_account_attr.attr,
3268         &destroy_by_rcu_attr.attr,
3269         &red_zone_attr.attr,
3270         &poison_attr.attr,
3271         &store_user_attr.attr,
3272         &validate_attr.attr,
3273         &shrink_attr.attr,
3274         &alloc_calls_attr.attr,
3275         &free_calls_attr.attr,
3276 #ifdef CONFIG_ZONE_DMA
3277         &cache_dma_attr.attr,
3278 #endif
3279 #ifdef CONFIG_NUMA
3280         &defrag_ratio_attr.attr,
3281 #endif
3282         NULL
3283 };
3284
3285 static struct attribute_group slab_attr_group = {
3286         .attrs = slab_attrs,
3287 };
3288
3289 static ssize_t slab_attr_show(struct kobject *kobj,
3290                                 struct attribute *attr,
3291                                 char *buf)
3292 {
3293         struct slab_attribute *attribute;
3294         struct kmem_cache *s;
3295         int err;
3296
3297         attribute = to_slab_attr(attr);
3298         s = to_slab(kobj);
3299
3300         if (!attribute->show)
3301                 return -EIO;
3302
3303         err = attribute->show(s, buf);
3304
3305         return err;
3306 }
3307
3308 static ssize_t slab_attr_store(struct kobject *kobj,
3309                                 struct attribute *attr,
3310                                 const char *buf, size_t len)
3311 {
3312         struct slab_attribute *attribute;
3313         struct kmem_cache *s;
3314         int err;
3315
3316         attribute = to_slab_attr(attr);
3317         s = to_slab(kobj);
3318
3319         if (!attribute->store)
3320                 return -EIO;
3321
3322         err = attribute->store(s, buf, len);
3323
3324         return err;
3325 }
3326
3327 static struct sysfs_ops slab_sysfs_ops = {
3328         .show = slab_attr_show,
3329         .store = slab_attr_store,
3330 };
3331
3332 static struct kobj_type slab_ktype = {
3333         .sysfs_ops = &slab_sysfs_ops,
3334 };
3335
3336 static int uevent_filter(struct kset *kset, struct kobject *kobj)
3337 {
3338         struct kobj_type *ktype = get_ktype(kobj);
3339
3340         if (ktype == &slab_ktype)
3341                 return 1;
3342         return 0;
3343 }
3344
3345 static struct kset_uevent_ops slab_uevent_ops = {
3346         .filter = uevent_filter,
3347 };
3348
3349 decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
3350
3351 #define ID_STR_LENGTH 64
3352
3353 /* Create a unique string id for a slab cache:
3354  * format
3355  * :[flags-]size:[memory address of kmemcache]
3356  */
3357 static char *create_unique_id(struct kmem_cache *s)
3358 {
3359         char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
3360         char *p = name;
3361
3362         BUG_ON(!name);
3363
3364         *p++ = ':';
3365         /*
3366          * First flags affecting slabcache operations. We will only
3367          * get here for aliasable slabs so we do not need to support
3368          * too many flags. The flags here must cover all flags that
3369          * are matched during merging to guarantee that the id is
3370          * unique.
3371          */
3372         if (s->flags & SLAB_CACHE_DMA)
3373                 *p++ = 'd';
3374         if (s->flags & SLAB_RECLAIM_ACCOUNT)
3375                 *p++ = 'a';
3376         if (s->flags & SLAB_DEBUG_FREE)
3377                 *p++ = 'F';
3378         if (p != name + 1)
3379                 *p++ = '-';
3380         p += sprintf(p, "%07d", s->size);
3381         BUG_ON(p > name + ID_STR_LENGTH - 1);
3382         return name;
3383 }
3384
3385 static int sysfs_slab_add(struct kmem_cache *s)
3386 {
3387         int err;
3388         const char *name;
3389         int unmergeable;
3390
3391         if (slab_state < SYSFS)
3392                 /* Defer until later */
3393                 return 0;
3394
3395         unmergeable = slab_unmergeable(s);
3396         if (unmergeable) {
3397                 /*
3398                  * Slabcache can never be merged so we can use the name proper.
3399                  * This is typically the case for debug situations. In that
3400                  * case we can catch duplicate names easily.
3401                  */
3402                 sysfs_remove_link(&slab_subsys.kobj, s->name);
3403                 name = s->name;
3404         } else {
3405                 /*
3406                  * Create a unique name for the slab as a target
3407                  * for the symlinks.
3408                  */
3409                 name = create_unique_id(s);
3410         }
3411
3412         kobj_set_kset_s(s, slab_subsys);
3413         kobject_set_name(&s->kobj, name);
3414         kobject_init(&s->kobj);
3415         err = kobject_add(&s->kobj);
3416         if (err)
3417                 return err;
3418
3419         err = sysfs_create_group(&s->kobj, &slab_attr_group);
3420         if (err)
3421                 return err;
3422         kobject_uevent(&s->kobj, KOBJ_ADD);
3423         if (!unmergeable) {
3424                 /* Setup first alias */
3425                 sysfs_slab_alias(s, s->name);
3426                 kfree(name);
3427         }
3428         return 0;
3429 }
3430
3431 static void sysfs_slab_remove(struct kmem_cache *s)
3432 {
3433         kobject_uevent(&s->kobj, KOBJ_REMOVE);
3434         kobject_del(&s->kobj);
3435 }
3436
/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 */
struct saved_alias {
	struct kmem_cache *s;		/* cache the alias refers to */
	const char *name;		/* alias name, not copied */
	struct saved_alias *next;	/* next buffered alias or NULL */
};

/* Head of the singly linked list of buffered aliases; file private. */
static struct saved_alias *alias_list;
3448
3449 static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
3450 {
3451         struct saved_alias *al;
3452
3453         if (slab_state == SYSFS) {
3454                 /*
3455                  * If we have a leftover link then remove it.
3456                  */
3457                 sysfs_remove_link(&slab_subsys.kobj, name);
3458                 return sysfs_create_link(&slab_subsys.kobj,
3459                                                 &s->kobj, name);
3460         }
3461
3462         al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
3463         if (!al)
3464                 return -ENOMEM;
3465
3466         al->s = s;
3467         al->name = name;
3468         al->next = alias_list;
3469         alias_list = al;
3470         return 0;
3471 }
3472
3473 static int __init slab_sysfs_init(void)
3474 {
3475         struct list_head *h;
3476         int err;
3477
3478         err = subsystem_register(&slab_subsys);
3479         if (err) {
3480                 printk(KERN_ERR "Cannot register slab subsystem.\n");
3481                 return -ENOSYS;
3482         }
3483
3484         slab_state = SYSFS;
3485
3486         list_for_each(h, &slab_caches) {
3487                 struct kmem_cache *s =
3488                         container_of(h, struct kmem_cache, list);
3489
3490                 err = sysfs_slab_add(s);
3491                 BUG_ON(err);
3492         }
3493
3494         while (alias_list) {
3495                 struct saved_alias *al = alias_list;
3496
3497                 alias_list = alias_list->next;
3498                 err = sysfs_slab_alias(al->s, al->name);
3499                 BUG_ON(err);
3500                 kfree(al);
3501         }
3502
3503         resiliency_test();
3504         return 0;
3505 }
3506
3507 __initcall(slab_sysfs_init);
3508 #endif