mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89
  90 #include <asm/tlbflush.h>
  91 #include <asm/uaccess.h>
  92
  93 /* Internal flags */
  94 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  95 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  96 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  97
  98 static kmem_cache_t *policy_cache;
  99 static kmem_cache_t *sn_cache;
 100
 101 #define PDprintk(fmt...)
 102
 103 /* Highest zone. An specific allocation for a zone below that is not
 104    policied. */
 105 int policy_zone = ZONE_DMA;
 106
 107 struct mempolicy default_policy = {
 108         .refcnt = ATOMIC_INIT(1), /* never free it */
 109         .policy = MPOL_DEFAULT,
 110 };
 111
 112 /* Do sanity checking on a policy */
 113 static int mpol_check_policy(int mode, nodemask_t *nodes)
 114 {
 115         int empty = nodes_empty(*nodes);
 116
 117         switch (mode) {
 118         case MPOL_DEFAULT:
 119                 if (!empty)
 120                         return -EINVAL;
 121                 break;
 122         case MPOL_BIND:
 123         case MPOL_INTERLEAVE:
 124                 /* Preferred will only use the first bit, but allow
 125                    more for now. */
 126                 if (empty)
 127                         return -EINVAL;
 128                 break;
 129         }
 130         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 131 }
 132 /* Generate a custom zonelist for the BIND policy. */
 133 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 134 {
 135         struct zonelist *zl;
 136         int num, max, nd;
 137
 138         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 139         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 140         if (!zl)
 141                 return NULL;
 142         num = 0;
 143         for_each_node_mask(nd, *nodes)
 144                 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 145         zl->zones[num] = NULL;
 146         return zl;
 147 }
 148
 149 /* Create a new policy */
 150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 151 {
 152         struct mempolicy *policy;
 153
 154         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 155         if (mode == MPOL_DEFAULT)
 156                 return NULL;
 157         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 158         if (!policy)
 159                 return ERR_PTR(-ENOMEM);
 160         atomic_set(&policy->refcnt, 1);
 161         switch (mode) {
 162         case MPOL_INTERLEAVE:
 163                 policy->v.nodes = *nodes;
 164                 if (nodes_weight(*nodes) == 0) {
 165                         kmem_cache_free(policy_cache, policy);
 166                         return ERR_PTR(-EINVAL);
 167                 }
 168                 break;
 169         case MPOL_PREFERRED:
 170                 policy->v.preferred_node = first_node(*nodes);
 171                 if (policy->v.preferred_node >= MAX_NUMNODES)
 172                         policy->v.preferred_node = -1;
 173                 break;
 174         case MPOL_BIND:
 175                 policy->v.zonelist = bind_zonelist(nodes);
 176                 if (policy->v.zonelist == NULL) {
 177                         kmem_cache_free(policy_cache, policy);
 178                         return ERR_PTR(-ENOMEM);
 179                 }
 180                 break;
 181         }
 182         policy->policy = mode;
 183         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 184         return policy;
 185 }
 186
 187 static void gather_stats(struct page *, void *);
 188 static void migrate_page_add(struct vm_area_struct *vma,
 189         struct page *page, struct list_head *pagelist, unsigned long flags);
 190
 191 /* Scan through pages checking if pages follow certain conditions. */
 192 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 193                 unsigned long addr, unsigned long end,
 194                 const nodemask_t *nodes, unsigned long flags,
 195                 void *private)
 196 {
 197         pte_t *orig_pte;
 198         pte_t *pte;
 199         spinlock_t *ptl;
 200
 201         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 202         do {
 203                 struct page *page;
 204                 unsigned int nid;
 205
 206                 if (!pte_present(*pte))
 207                         continue;
 208                 page = vm_normal_page(vma, addr, *pte);
 209                 if (!page)
 210                         continue;
 211                 nid = page_to_nid(page);
 212                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 213                         continue;
 214
 215                 if (flags & MPOL_MF_STATS)
 216                         gather_stats(page, private);
 217                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 218                         spin_unlock(ptl);
 219                         migrate_page_add(vma, page, private, flags);
 220                         spin_lock(ptl);
 221                 }
 222                 else
 223                         break;
 224         } while (pte++, addr += PAGE_SIZE, addr != end);
 225         pte_unmap_unlock(orig_pte, ptl);
 226         return addr != end;
 227 }
 228
 229 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 230                 unsigned long addr, unsigned long end,
 231                 const nodemask_t *nodes, unsigned long flags,
 232                 void *private)
 233 {
 234         pmd_t *pmd;
 235         unsigned long next;
 236
 237         pmd = pmd_offset(pud, addr);
 238         do {
 239                 next = pmd_addr_end(addr, end);
 240                 if (pmd_none_or_clear_bad(pmd))
 241                         continue;
 242                 if (check_pte_range(vma, pmd, addr, next, nodes,
 243                                     flags, private))
 244                         return -EIO;
 245         } while (pmd++, addr = next, addr != end);
 246         return 0;
 247 }
 248
 249 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 250                 unsigned long addr, unsigned long end,
 251                 const nodemask_t *nodes, unsigned long flags,
 252                 void *private)
 253 {
 254         pud_t *pud;
 255         unsigned long next;
 256
 257         pud = pud_offset(pgd, addr);
 258         do {
 259                 next = pud_addr_end(addr, end);
 260                 if (pud_none_or_clear_bad(pud))
 261                         continue;
 262                 if (check_pmd_range(vma, pud, addr, next, nodes,
 263                                     flags, private))
 264                         return -EIO;
 265         } while (pud++, addr = next, addr != end);
 266         return 0;
 267 }
 268
 269 static inline int check_pgd_range(struct vm_area_struct *vma,
 270                 unsigned long addr, unsigned long end,
 271                 const nodemask_t *nodes, unsigned long flags,
 272                 void *private)
 273 {
 274         pgd_t *pgd;
 275         unsigned long next;
 276
 277         pgd = pgd_offset(vma->vm_mm, addr);
 278         do {
 279                 next = pgd_addr_end(addr, end);
 280                 if (pgd_none_or_clear_bad(pgd))
 281                         continue;
 282                 if (check_pud_range(vma, pgd, addr, next, nodes,
 283                                     flags, private))
 284                         return -EIO;
 285         } while (pgd++, addr = next, addr != end);
 286         return 0;
 287 }
 288
 289 /* Check if a vma is migratable */
 290 static inline int vma_migratable(struct vm_area_struct *vma)
 291 {
 292         if (vma->vm_flags & (
 293                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
 294                 return 0;
 295         return 1;
 296 }
 297
 298 /*
 299  * Check if all pages in a range are on a set of nodes.
 300  * If pagelist != NULL then isolate pages from the LRU and
 301  * put them on the pagelist.
 302  */
 303 static struct vm_area_struct *
 304 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 305                 const nodemask_t *nodes, unsigned long flags, void *private)
 306 {
 307         int err;
 308         struct vm_area_struct *first, *vma, *prev;
 309
 310         first = find_vma(mm, start);
 311         if (!first)
 312                 return ERR_PTR(-EFAULT);
 313         prev = NULL;
 314         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 315                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 316                         if (!vma->vm_next && vma->vm_end < end)
 317                                 return ERR_PTR(-EFAULT);
 318                         if (prev && prev->vm_end < vma->vm_start)
 319                                 return ERR_PTR(-EFAULT);
 320                 }
 321                 if (!is_vm_hugetlb_page(vma) &&
 322                     ((flags & MPOL_MF_STRICT) ||
 323                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 324                                 vma_migratable(vma)))) {
 325                         unsigned long endvma = vma->vm_end;
 326
 327                         if (endvma > end)
 328                                 endvma = end;
 329                         if (vma->vm_start > start)
 330                                 start = vma->vm_start;
 331                         err = check_pgd_range(vma, start, endvma, nodes,
 332                                                 flags, private);
 333                         if (err) {
 334                                 first = ERR_PTR(err);
 335                                 break;
 336                         }
 337                 }
 338                 prev = vma;
 339         }
 340         return first;
 341 }
 342
 343 /* Apply policy to a single VMA */
 344 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 345 {
 346         int err = 0;
 347         struct mempolicy *old = vma->vm_policy;
 348
 349         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 350                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 351                  vma->vm_ops, vma->vm_file,
 352                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 353
 354         if (vma->vm_ops && vma->vm_ops->set_policy)
 355                 err = vma->vm_ops->set_policy(vma, new);
 356         if (!err) {
 357                 mpol_get(new);
 358                 vma->vm_policy = new;
 359                 mpol_free(old);
 360         }
 361         return err;
 362 }
 363
 364 /* Step 2: apply policy to a range and do splits. */
 365 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 366                        unsigned long end, struct mempolicy *new)
 367 {
 368         struct vm_area_struct *next;
 369         int err;
 370
 371         err = 0;
 372         for (; vma && vma->vm_start < end; vma = next) {
 373                 next = vma->vm_next;
 374                 if (vma->vm_start < start)
 375                         err = split_vma(vma->vm_mm, vma, start, 1);
 376                 if (!err && vma->vm_end > end)
 377                         err = split_vma(vma->vm_mm, vma, end, 0);
 378                 if (!err)
 379                         err = policy_vma(vma, new);
 380                 if (err)
 381                         break;
 382         }
 383         return err;
 384 }
 385
 386 static int contextualize_policy(int mode, nodemask_t *nodes)
 387 {
 388         if (!nodes)
 389                 return 0;
 390
 391         cpuset_update_task_memory_state();
 392         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 393                 return -EINVAL;
 394         return mpol_check_policy(mode, nodes);
 395 }
 396
 397 /* Set the process memory policy */
 398 long do_set_mempolicy(int mode, nodemask_t *nodes)
 399 {
 400         struct mempolicy *new;
 401
 402         if (contextualize_policy(mode, nodes))
 403                 return -EINVAL;
 404         new = mpol_new(mode, nodes);
 405         if (IS_ERR(new))
 406                 return PTR_ERR(new);
 407         mpol_free(current->mempolicy);
 408         current->mempolicy = new;
 409         if (new && new->policy == MPOL_INTERLEAVE)
 410                 current->il_next = first_node(new->v.nodes);
 411         return 0;
 412 }
 413
 414 /* Fill a zone bitmap for a policy */
 415 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 416 {
 417         int i;
 418
 419         nodes_clear(*nodes);
 420         switch (p->policy) {
 421         case MPOL_BIND:
 422                 for (i = 0; p->v.zonelist->zones[i]; i++)
 423                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 424                                 *nodes);
 425                 break;
 426         case MPOL_DEFAULT:
 427                 break;
 428         case MPOL_INTERLEAVE:
 429                 *nodes = p->v.nodes;
 430                 break;
 431         case MPOL_PREFERRED:
 432                 /* or use current node instead of online map? */
 433                 if (p->v.preferred_node < 0)
 434                         *nodes = node_online_map;
 435                 else
 436                         node_set(p->v.preferred_node, *nodes);
 437                 break;
 438         default:
 439                 BUG();
 440         }
 441 }
 442
 443 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 444 {
 445         struct page *p;
 446         int err;
 447
 448         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 449         if (err >= 0) {
 450                 err = page_to_nid(p);
 451                 put_page(p);
 452         }
 453         return err;
 454 }
 455
 456 /* Retrieve NUMA policy */
 457 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 458                         unsigned long addr, unsigned long flags)
 459 {
 460         int err;
 461         struct mm_struct *mm = current->mm;
 462         struct vm_area_struct *vma = NULL;
 463         struct mempolicy *pol = current->mempolicy;
 464
 465         cpuset_update_task_memory_state();
 466         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 467                 return -EINVAL;
 468         if (flags & MPOL_F_ADDR) {
 469                 down_read(&mm->mmap_sem);
 470                 vma = find_vma_intersection(mm, addr, addr+1);
 471                 if (!vma) {
 472                         up_read(&mm->mmap_sem);
 473                         return -EFAULT;
 474                 }
 475                 if (vma->vm_ops && vma->vm_ops->get_policy)
 476                         pol = vma->vm_ops->get_policy(vma, addr);
 477                 else
 478                         pol = vma->vm_policy;
 479         } else if (addr)
 480                 return -EINVAL;
 481
 482         if (!pol)
 483                 pol = &default_policy;
 484
 485         if (flags & MPOL_F_NODE) {
 486                 if (flags & MPOL_F_ADDR) {
 487                         err = lookup_node(mm, addr);
 488                         if (err < 0)
 489                                 goto out;
 490                         *policy = err;
 491                 } else if (pol == current->mempolicy &&
 492                                 pol->policy == MPOL_INTERLEAVE) {
 493                         *policy = current->il_next;
 494                 } else {
 495                         err = -EINVAL;
 496                         goto out;
 497                 }
 498         } else
 499                 *policy = pol->policy;
 500
 501         if (vma) {
 502                 up_read(&current->mm->mmap_sem);
 503                 vma = NULL;
 504         }
 505
 506         err = 0;
 507         if (nmask)
 508                 get_zonemask(pol, nmask);
 509
 510  out:
 511         if (vma)
 512                 up_read(&current->mm->mmap_sem);
 513         return err;
 514 }
 515
 516 /*
 517  * page migration
 518  */
 519
 520 /* Check if we are the only process mapping the page in question */
 521 static inline int single_mm_mapping(struct mm_struct *mm,
 522                         struct address_space *mapping)
 523 {
 524         struct vm_area_struct *vma;
 525         struct prio_tree_iter iter;
 526         int rc = 1;
 527
 528         spin_lock(&mapping->i_mmap_lock);
 529         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
 530                 if (mm != vma->vm_mm) {
 531                         rc = 0;
 532                         goto out;
 533                 }
 534         list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
 535                 if (mm != vma->vm_mm) {
 536                         rc = 0;
 537                         goto out;
 538                 }
 539 out:
 540         spin_unlock(&mapping->i_mmap_lock);
 541         return rc;
 542 }
 543
 544 /*
 545  * Add a page to be migrated to the pagelist
 546  */
 547 static void migrate_page_add(struct vm_area_struct *vma,
 548         struct page *page, struct list_head *pagelist, unsigned long flags)
 549 {
 550         /*
 551          * Avoid migrating a page that is shared by others and not writable.
 552          */
 553         if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
 554             mapping_writably_mapped(page->mapping) ||
 555             single_mm_mapping(vma->vm_mm, page->mapping)) {
 556                 int rc = isolate_lru_page(page);
 557
 558                 if (rc == 1)
 559                         list_add(&page->lru, pagelist);
 560                 /*
 561                  * If the isolate attempt was not successful then we just
 562                  * encountered an unswappable page. Something must be wrong.
 563                  */
 564                 WARN_ON(rc == 0);
 565         }
 566 }
 567
 568 static int swap_pages(struct list_head *pagelist)
 569 {
 570         LIST_HEAD(moved);
 571         LIST_HEAD(failed);
 572         int n;
 573
 574         n = migrate_pages(pagelist, NULL, &moved, &failed);
 575         putback_lru_pages(&failed);
 576         putback_lru_pages(&moved);
 577
 578         return n;
 579 }
 580
 581 /*
 582  * For now migrate_pages simply swaps out the pages from nodes that are in
 583  * the source set but not in the target set. In the future, we would
 584  * want a function that moves pages between the two nodesets in such
 585  * a way as to preserve the physical layout as much as possible.
 586  *
 587  * Returns the number of page that could not be moved.
 588  */
 589 int do_migrate_pages(struct mm_struct *mm,
 590         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 591 {
 592         LIST_HEAD(pagelist);
 593         int count = 0;
 594         nodemask_t nodes;
 595
 596         nodes_andnot(nodes, *from_nodes, *to_nodes);
 597
 598         down_read(&mm->mmap_sem);
 599         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
 600                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 601
 602         if (!list_empty(&pagelist)) {
 603                 count = swap_pages(&pagelist);
 604                 putback_lru_pages(&pagelist);
 605         }
 606
 607         up_read(&mm->mmap_sem);
 608         return count;
 609 }
 610
 611 long do_mbind(unsigned long start, unsigned long len,
 612                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 613 {
 614         struct vm_area_struct *vma;
 615         struct mm_struct *mm = current->mm;
 616         struct mempolicy *new;
 617         unsigned long end;
 618         int err;
 619         LIST_HEAD(pagelist);
 620
 621         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 622                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 623             || mode > MPOL_MAX)
 624                 return -EINVAL;
 625         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
 626                 return -EPERM;
 627
 628         if (start & ~PAGE_MASK)
 629                 return -EINVAL;
 630
 631         if (mode == MPOL_DEFAULT)
 632                 flags &= ~MPOL_MF_STRICT;
 633
 634         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 635         end = start + len;
 636
 637         if (end < start)
 638                 return -EINVAL;
 639         if (end == start)
 640                 return 0;
 641
 642         if (mpol_check_policy(mode, nmask))
 643                 return -EINVAL;
 644
 645         new = mpol_new(mode, nmask);
 646         if (IS_ERR(new))
 647                 return PTR_ERR(new);
 648
 649         /*
 650          * If we are using the default policy then operation
 651          * on discontinuous address spaces is okay after all
 652          */
 653         if (!new)
 654                 flags |= MPOL_MF_DISCONTIG_OK;
 655
 656         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 657                         mode,nodes_addr(nodes)[0]);
 658
 659         down_write(&mm->mmap_sem);
 660         vma = check_range(mm, start, end, nmask,
 661                           flags | MPOL_MF_INVERT, &pagelist);
 662
 663         err = PTR_ERR(vma);
 664         if (!IS_ERR(vma)) {
 665                 int nr_failed = 0;
 666
 667                 err = mbind_range(vma, start, end, new);
 668                 if (!list_empty(&pagelist))
 669                         nr_failed = swap_pages(&pagelist);
 670
 671                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 672                         err = -EIO;
 673         }
 674         if (!list_empty(&pagelist))
 675                 putback_lru_pages(&pagelist);
 676
 677         up_write(&mm->mmap_sem);
 678         mpol_free(new);
 679         return err;
 680 }
 681
 682 /*
 683  * User space interface with variable sized bitmaps for nodelists.
 684  */
 685
 686 /* Copy a node mask from user space. */
 687 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 688                      unsigned long maxnode)
 689 {
 690         unsigned long k;
 691         unsigned long nlongs;
 692         unsigned long endmask;
 693
 694         --maxnode;
 695         nodes_clear(*nodes);
 696         if (maxnode == 0 || !nmask)
 697                 return 0;
 698
 699         nlongs = BITS_TO_LONGS(maxnode);
 700         if ((maxnode % BITS_PER_LONG) == 0)
 701                 endmask = ~0UL;
 702         else
 703                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 704
 705         /* When the user specified more nodes than supported just check
 706            if the non supported part is all zero. */
 707         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 708                 if (nlongs > PAGE_SIZE/sizeof(long))
 709                         return -EINVAL;
 710                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 711                         unsigned long t;
 712                         if (get_user(t, nmask + k))
 713                                 return -EFAULT;
 714                         if (k == nlongs - 1) {
 715                                 if (t & endmask)
 716                                         return -EINVAL;
 717                         } else if (t)
 718                                 return -EINVAL;
 719                 }
 720                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 721                 endmask = ~0UL;
 722         }
 723
 724         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 725                 return -EFAULT;
 726         nodes_addr(*nodes)[nlongs-1] &= endmask;
 727         return 0;
 728 }
 729
 730 /* Copy a kernel node mask to user space */
 731 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 732                               nodemask_t *nodes)
 733 {
 734         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 735         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 736
 737         if (copy > nbytes) {
 738                 if (copy > PAGE_SIZE)
 739                         return -EINVAL;
 740                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 741                         return -EFAULT;
 742                 copy = nbytes;
 743         }
 744         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 745 }
 746
 747 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 748                         unsigned long mode,
 749                         unsigned long __user *nmask, unsigned long maxnode,
 750                         unsigned flags)
 751 {
 752         nodemask_t nodes;
 753         int err;
 754
 755         err = get_nodes(&nodes, nmask, maxnode);
 756         if (err)
 757                 return err;
 758         return do_mbind(start, len, mode, &nodes, flags);
 759 }
 760
 761 /* Set the process memory policy */
 762 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 763                 unsigned long maxnode)
 764 {
 765         int err;
 766         nodemask_t nodes;
 767
 768         if (mode < 0 || mode > MPOL_MAX)
 769                 return -EINVAL;
 770         err = get_nodes(&nodes, nmask, maxnode);
 771         if (err)
 772                 return err;
 773         return do_set_mempolicy(mode, &nodes);
 774 }
 775
 776 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 777                 const unsigned long __user *old_nodes,
 778                 const unsigned long __user *new_nodes)
 779 {
 780         struct mm_struct *mm;
 781         struct task_struct *task;
 782         nodemask_t old;
 783         nodemask_t new;
 784         nodemask_t task_nodes;
 785         int err;
 786
 787         err = get_nodes(&old, old_nodes, maxnode);
 788         if (err)
 789                 return err;
 790
 791         err = get_nodes(&new, new_nodes, maxnode);
 792         if (err)
 793                 return err;
 794
 795         /* Find the mm_struct */
 796         read_lock(&tasklist_lock);
 797         task = pid ? find_task_by_pid(pid) : current;
 798         if (!task) {
 799                 read_unlock(&tasklist_lock);
 800                 return -ESRCH;
 801         }
 802         mm = get_task_mm(task);
 803         read_unlock(&tasklist_lock);
 804
 805         if (!mm)
 806                 return -EINVAL;
 807
 808         /*
 809          * Check if this process has the right to modify the specified
 810          * process. The right exists if the process has administrative
 811          * capabilities, superuser priviledges or the same
 812          * userid as the target process.
 813          */
 814         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 815             (current->uid != task->suid) && (current->uid != task->uid) &&
 816             !capable(CAP_SYS_ADMIN)) {
 817                 err = -EPERM;
 818                 goto out;
 819         }
 820
 821         task_nodes = cpuset_mems_allowed(task);
 822         /* Is the user allowed to access the target nodes? */
 823         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
 824                 err = -EPERM;
 825                 goto out;
 826         }
 827
 828         err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
 829 out:
 830         mmput(mm);
 831         return err;
 832 }
 833
 834
 835 /* Retrieve NUMA policy */
 836 asmlinkage long sys_get_mempolicy(int __user *policy,
 837                                 unsigned long __user *nmask,
 838                                 unsigned long maxnode,
 839                                 unsigned long addr, unsigned long flags)
 840 {
 841         int err, pval;
 842         nodemask_t nodes;
 843
 844         if (nmask != NULL && maxnode < MAX_NUMNODES)
 845                 return -EINVAL;
 846
 847         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 848
 849         if (err)
 850                 return err;
 851
 852         if (policy && put_user(pval, policy))
 853                 return -EFAULT;
 854
 855         if (nmask)
 856                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 857
 858         return err;
 859 }
 860
 861 #ifdef CONFIG_COMPAT
 862
 863 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 864                                      compat_ulong_t __user *nmask,
 865                                      compat_ulong_t maxnode,
 866                                      compat_ulong_t addr, compat_ulong_t flags)
 867 {
 868         long err;
 869         unsigned long __user *nm = NULL;
 870         unsigned long nr_bits, alloc_size;
 871         DECLARE_BITMAP(bm, MAX_NUMNODES);
 872
 873         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 874         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 875
 876         if (nmask)
 877                 nm = compat_alloc_user_space(alloc_size);
 878
 879         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 880
 881         if (!err && nmask) {
 882                 err = copy_from_user(bm, nm, alloc_size);
 883                 /* ensure entire bitmap is zeroed */
 884                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 885                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 886         }
 887
 888         return err;
 889 }
 890
 891 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 892                                      compat_ulong_t maxnode)
 893 {
 894         long err = 0;
 895         unsigned long __user *nm = NULL;
 896         unsigned long nr_bits, alloc_size;
 897         DECLARE_BITMAP(bm, MAX_NUMNODES);
 898
 899         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 900         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 901
 902         if (nmask) {
 903                 err = compat_get_bitmap(bm, nmask, nr_bits);
 904                 nm = compat_alloc_user_space(alloc_size);
 905                 err |= copy_to_user(nm, bm, alloc_size);
 906         }
 907
 908         if (err)
 909                 return -EFAULT;
 910
 911         return sys_set_mempolicy(mode, nm, nr_bits+1);
 912 }
 913
 914 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 915                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 916                              compat_ulong_t maxnode, compat_ulong_t flags)
 917 {
 918         long err = 0;
 919         unsigned long __user *nm = NULL;
 920         unsigned long nr_bits, alloc_size;
 921         nodemask_t bm;
 922
 923         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 924         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 925
 926         if (nmask) {
 927                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 928                 nm = compat_alloc_user_space(alloc_size);
 929                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 930         }
 931
 932         if (err)
 933                 return -EFAULT;
 934
 935         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 936 }
 937
 938 #endif
 939
 940 /* Return effective policy for a VMA */
 941 static struct mempolicy * get_vma_policy(struct task_struct *task,
 942                 struct vm_area_struct *vma, unsigned long addr)
 943 {
 944         struct mempolicy *pol = task->mempolicy;
 945
 946         if (vma) {
 947                 if (vma->vm_ops && vma->vm_ops->get_policy)
 948                         pol = vma->vm_ops->get_policy(vma, addr);
 949                 else if (vma->vm_policy &&
 950                                 vma->vm_policy->policy != MPOL_DEFAULT)
 951                         pol = vma->vm_policy;
 952         }
 953         if (!pol)
 954                 pol = &default_policy;
 955         return pol;
 956 }
 957
 958 /* Return a zonelist representing a mempolicy */
 959 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 960 {
 961         int nd;
 962
 963         switch (policy->policy) {
 964         case MPOL_PREFERRED:
 965                 nd = policy->v.preferred_node;
 966                 if (nd < 0)
 967                         nd = numa_node_id();
 968                 break;
 969         case MPOL_BIND:
 970                 /* Lower zones don't get a policy applied */
 971                 /* Careful: current->mems_allowed might have moved */
 972                 if (gfp_zone(gfp) >= policy_zone)
 973                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 974                                 return policy->v.zonelist;
 975                 /*FALL THROUGH*/
 976         case MPOL_INTERLEAVE: /* should not happen */
 977         case MPOL_DEFAULT:
 978                 nd = numa_node_id();
 979                 break;
 980         default:
 981                 nd = 0;
 982                 BUG();
 983         }
 984         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 985 }
 986
 987 /* Do dynamic interleaving for a process */
 988 static unsigned interleave_nodes(struct mempolicy *policy)
 989 {
 990         unsigned nid, next;
 991         struct task_struct *me = current;
 992
 993         nid = me->il_next;
 994         next = next_node(nid, policy->v.nodes);
 995         if (next >= MAX_NUMNODES)
 996                 next = first_node(policy->v.nodes);
 997         me->il_next = next;
 998         return nid;
 999 }
1000
1001 /* Do static interleaving for a VMA with known offset. */
1002 static unsigned offset_il_node(struct mempolicy *pol,
1003                 struct vm_area_struct *vma, unsigned long off)
1004 {
1005         unsigned nnodes = nodes_weight(pol->v.nodes);
1006         unsigned target = (unsigned)off % nnodes;
1007         int c;
1008         int nid = -1;
1009
1010         c = 0;
1011         do {
1012                 nid = next_node(nid, pol->v.nodes);
1013                 c++;
1014         } while (c <= target);
1015         return nid;
1016 }
1017
1018 /* Determine a node number for interleave */
1019 static inline unsigned interleave_nid(struct mempolicy *pol,
1020                  struct vm_area_struct *vma, unsigned long addr, int shift)
1021 {
1022         if (vma) {
1023                 unsigned long off;
1024
1025                 off = vma->vm_pgoff;
1026                 off += (addr - vma->vm_start) >> shift;
1027                 return offset_il_node(pol, vma, off);
1028         } else
1029                 return interleave_nodes(pol);
1030 }
1031
1032 /* Return a zonelist suitable for a huge page allocation. */
1033 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1034 {
1035         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1036
1037         if (pol->policy == MPOL_INTERLEAVE) {
1038                 unsigned nid;
1039
1040                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1041                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1042         }
1043         return zonelist_policy(GFP_HIGHUSER, pol);
1044 }
1045
1046 /* Allocate a page in interleaved policy.
1047    Own path because it needs to do special accounting. */
1048 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1049                                         unsigned nid)
1050 {
1051         struct zonelist *zl;
1052         struct page *page;
1053
1054         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1055         page = __alloc_pages(gfp, order, zl);
1056         if (page && page_zone(page) == zl->zones[0]) {
1057                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1058                 put_cpu();
1059         }
1060         return page;
1061 }
1062
1063 /**
1064  *      alloc_page_vma  - Allocate a page for a VMA.
1065  *
1066  *      @gfp:
1067  *      %GFP_USER    user allocation.
1068  *      %GFP_KERNEL  kernel allocations,
1069  *      %GFP_HIGHMEM highmem/user allocations,
1070  *      %GFP_FS      allocation should not call back into a file system.
1071  *      %GFP_ATOMIC  don't sleep.
1072  *
1073  *      @vma:  Pointer to VMA or NULL if not available.
1074  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1075  *
1076  *      This function allocates a page from the kernel page pool and applies
1077  *      a NUMA policy associated with the VMA or the current process.
1078  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1079  *      mm_struct of the VMA to prevent it from going away. Should be used for
1080  *      all allocations for pages that will be mapped into
1081  *      user space. Returns NULL when no page can be allocated.
1082  *
1083  *      Should be called with the mm_sem of the vma hold.
1084  */
1085 struct page *
1086 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1087 {
1088         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1089
1090         cpuset_update_task_memory_state();
1091
1092         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1093                 unsigned nid;
1094
1095                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1096                 return alloc_page_interleave(gfp, 0, nid);
1097         }
1098         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1099 }
1100
1101 /**
1102  *      alloc_pages_current - Allocate pages.
1103  *
1104  *      @gfp:
1105  *              %GFP_USER   user allocation,
1106  *              %GFP_KERNEL kernel allocation,
1107  *              %GFP_HIGHMEM highmem allocation,
1108  *              %GFP_FS     don't call back into a file system.
1109  *              %GFP_ATOMIC don't sleep.
1110  *      @order: Power of two of allocation size in pages. 0 is a single page.
1111  *
1112  *      Allocate a page from the kernel page pool.  When not in
1113  *      interrupt context and apply the current process NUMA policy.
1114  *      Returns NULL when no page can be allocated.
1115  *
1116  *      Don't call cpuset_update_task_memory_state() unless
1117  *      1) it's ok to take cpuset_sem (can WAIT), and
1118  *      2) allocating for current task (not interrupt).
1119  */
1120 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1121 {
1122         struct mempolicy *pol = current->mempolicy;
1123
1124         if ((gfp & __GFP_WAIT) && !in_interrupt())
1125                 cpuset_update_task_memory_state();
1126         if (!pol || in_interrupt())
1127                 pol = &default_policy;
1128         if (pol->policy == MPOL_INTERLEAVE)
1129                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1130         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1131 }
1132 EXPORT_SYMBOL(alloc_pages_current);
1133
1134 /* Slow path of a mempolicy copy */
1135 struct mempolicy *__mpol_copy(struct mempolicy *old)
1136 {
1137         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1138
1139         if (!new)
1140                 return ERR_PTR(-ENOMEM);
1141         *new = *old;
1142         atomic_set(&new->refcnt, 1);
1143         if (new->policy == MPOL_BIND) {
1144                 int sz = ksize(old->v.zonelist);
1145                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1146                 if (!new->v.zonelist) {
1147                         kmem_cache_free(policy_cache, new);
1148                         return ERR_PTR(-ENOMEM);
1149                 }
1150                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1151         }
1152         return new;
1153 }
1154
1155 /* Slow path of a mempolicy comparison */
1156 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1157 {
1158         if (!a || !b)
1159                 return 0;
1160         if (a->policy != b->policy)
1161                 return 0;
1162         switch (a->policy) {
1163         case MPOL_DEFAULT:
1164                 return 1;
1165         case MPOL_INTERLEAVE:
1166                 return nodes_equal(a->v.nodes, b->v.nodes);
1167         case MPOL_PREFERRED:
1168                 return a->v.preferred_node == b->v.preferred_node;
1169         case MPOL_BIND: {
1170                 int i;
1171                 for (i = 0; a->v.zonelist->zones[i]; i++)
1172                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1173                                 return 0;
1174                 return b->v.zonelist->zones[i] == NULL;
1175         }
1176         default:
1177                 BUG();
1178                 return 0;
1179         }
1180 }
1181
1182 /* Slow path of a mpol destructor. */
1183 void __mpol_free(struct mempolicy *p)
1184 {
1185         if (!atomic_dec_and_test(&p->refcnt))
1186                 return;
1187         if (p->policy == MPOL_BIND)
1188                 kfree(p->v.zonelist);
1189         p->policy = MPOL_DEFAULT;
1190         kmem_cache_free(policy_cache, p);
1191 }
1192
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which must be held
 * for any access to the tree.
 */
1201
1202 /* lookup first element intersecting start-end */
1203 /* Caller holds sp->lock */
1204 static struct sp_node *
1205 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1206 {
1207         struct rb_node *n = sp->root.rb_node;
1208
1209         while (n) {
1210                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1211
1212                 if (start >= p->end)
1213                         n = n->rb_right;
1214                 else if (end <= p->start)
1215                         n = n->rb_left;
1216                 else
1217                         break;
1218         }
1219         if (!n)
1220                 return NULL;
1221         for (;;) {
1222                 struct sp_node *w = NULL;
1223                 struct rb_node *prev = rb_prev(n);
1224                 if (!prev)
1225                         break;
1226                 w = rb_entry(prev, struct sp_node, nd);
1227                 if (w->end <= start)
1228                         break;
1229                 n = prev;
1230         }
1231         return rb_entry(n, struct sp_node, nd);
1232 }
1233
1234 /* Insert a new shared policy into the list. */
1235 /* Caller holds sp->lock */
1236 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1237 {
1238         struct rb_node **p = &sp->root.rb_node;
1239         struct rb_node *parent = NULL;
1240         struct sp_node *nd;
1241
1242         while (*p) {
1243                 parent = *p;
1244                 nd = rb_entry(parent, struct sp_node, nd);
1245                 if (new->start < nd->start)
1246                         p = &(*p)->rb_left;
1247                 else if (new->end > nd->end)
1248                         p = &(*p)->rb_right;
1249                 else
1250                         BUG();
1251         }
1252         rb_link_node(&new->nd, parent, p);
1253         rb_insert_color(&new->nd, &sp->root);
1254         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1255                  new->policy ? new->policy->policy : 0);
1256 }
1257
1258 /* Find shared policy intersecting idx */
1259 struct mempolicy *
1260 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1261 {
1262         struct mempolicy *pol = NULL;
1263         struct sp_node *sn;
1264
1265         if (!sp->root.rb_node)
1266                 return NULL;
1267         spin_lock(&sp->lock);
1268         sn = sp_lookup(sp, idx, idx+1);
1269         if (sn) {
1270                 mpol_get(sn->policy);
1271                 pol = sn->policy;
1272         }
1273         spin_unlock(&sp->lock);
1274         return pol;
1275 }
1276
1277 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1278 {
1279         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1280         rb_erase(&n->nd, &sp->root);
1281         mpol_free(n->policy);
1282         kmem_cache_free(sn_cache, n);
1283 }
1284
1285 struct sp_node *
1286 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1287 {
1288         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1289
1290         if (!n)
1291                 return NULL;
1292         n->start = start;
1293         n->end = end;
1294         mpol_get(pol);
1295         n->policy = pol;
1296         return n;
1297 }
1298
/*
 * Replace a policy range.
 *
 * Clears [start, end) in @sp of existing nodes: nodes fully inside the
 * range are deleted, nodes overlapping one edge are trimmed, and a node
 * that spans the whole range is split in two.  Afterwards @new, if not
 * NULL, is inserted.
 *
 * Returns 0 on success, or -ENOMEM if the extra node needed for a split
 * cannot be allocated.  On failure @new has not been inserted and is
 * not freed here.
 */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* Fetch the successor first: n may be deleted below. */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* n begins inside the range: drop it or trim its head */
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        /*
                                         * A second node is needed for the
                                         * part after 'end'.  The lock is
                                         * dropped around the allocation and
                                         * the whole scan is then restarted.
                                         * NOTE(review): n->end and n->policy
                                         * are read after the unlock - confirm
                                         * n cannot be freed concurrently.
                                         */
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                /* split: n keeps the head, new2 becomes the tail */
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                /* n only overlaps the start: trim its tail */
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        /* new2 was allocated but turned out to be unneeded after restart */
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}
1346
1347 int mpol_set_shared_policy(struct shared_policy *info,
1348                         struct vm_area_struct *vma, struct mempolicy *npol)
1349 {
1350         int err;
1351         struct sp_node *new = NULL;
1352         unsigned long sz = vma_pages(vma);
1353
1354         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1355                  vma->vm_pgoff,
1356                  sz, npol? npol->policy : -1,
1357                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1358
1359         if (npol) {
1360                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1361                 if (!new)
1362                         return -ENOMEM;
1363         }
1364         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1365         if (err && new)
1366                 kmem_cache_free(sn_cache, new);
1367         return err;
1368 }
1369
1370 /* Free a backing policy store on inode delete. */
1371 void mpol_free_shared_policy(struct shared_policy *p)
1372 {
1373         struct sp_node *n;
1374         struct rb_node *next;
1375
1376         if (!p->root.rb_node)
1377                 return;
1378         spin_lock(&p->lock);
1379         next = rb_first(&p->root);
1380         while (next) {
1381                 n = rb_entry(next, struct sp_node, nd);
1382                 next = rb_next(&n->nd);
1383                 rb_erase(&n->nd, &p->root);
1384                 mpol_free(n->policy);
1385                 kmem_cache_free(sn_cache, n);
1386         }
1387         spin_unlock(&p->lock);
1388 }
1389
1390 /* assumes fs == KERNEL_DS */
1391 void __init numa_policy_init(void)
1392 {
1393         policy_cache = kmem_cache_create("numa_policy",
1394                                          sizeof(struct mempolicy),
1395                                          0, SLAB_PANIC, NULL, NULL);
1396
1397         sn_cache = kmem_cache_create("shared_policy_node",
1398                                      sizeof(struct sp_node),
1399                                      0, SLAB_PANIC, NULL, NULL);
1400
1401         /* Set interleaving policy for system init. This way not all
1402            the data structures allocated at system boot end up in node zero. */
1403
1404         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1405                 printk("numa_policy_init: interleaving failed\n");
1406 }
1407
1408 /* Reset policy of current process to default */
1409 void numa_default_policy(void)
1410 {
1411         do_set_mempolicy(MPOL_DEFAULT, NULL);
1412 }
1413
1414 /* Migrate a policy to a different set of nodes */
1415 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1416 {
1417         nodemask_t *mpolmask;
1418         nodemask_t tmp;
1419
1420         if (!pol)
1421                 return;
1422         mpolmask = &pol->cpuset_mems_allowed;
1423         if (nodes_equal(*mpolmask, *newmask))
1424                 return;
1425
1426         switch (pol->policy) {
1427         case MPOL_DEFAULT:
1428                 break;
1429         case MPOL_INTERLEAVE:
1430                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1431                 pol->v.nodes = tmp;
1432                 *mpolmask = *newmask;
1433                 current->il_next = node_remap(current->il_next,
1434                                                 *mpolmask, *newmask);
1435                 break;
1436         case MPOL_PREFERRED:
1437                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1438                                                 *mpolmask, *newmask);
1439                 *mpolmask = *newmask;
1440                 break;
1441         case MPOL_BIND: {
1442                 nodemask_t nodes;
1443                 struct zone **z;
1444                 struct zonelist *zonelist;
1445
1446                 nodes_clear(nodes);
1447                 for (z = pol->v.zonelist->zones; *z; z++)
1448                         node_set((*z)->zone_pgdat->node_id, nodes);
1449                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1450                 nodes = tmp;
1451
1452                 zonelist = bind_zonelist(&nodes);
1453
1454                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1455                  * If that old zonelist has no remaining mems_allowed nodes,
1456                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1457                  */
1458
1459                 if (zonelist) {
1460                         /* Good - got mem - substitute new zonelist */
1461                         kfree(pol->v.zonelist);
1462                         pol->v.zonelist = zonelist;
1463                 }
1464                 *mpolmask = *newmask;
1465                 break;
1466         }
1467         default:
1468                 BUG();
1469                 break;
1470         }
1471 }
1472
1473 /*
1474  * Wrapper for mpol_rebind_policy() that just requires task
1475  * pointer, and updates task mempolicy.
1476  */
1477
1478 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1479 {
1480         mpol_rebind_policy(tsk->mempolicy, new);
1481 }
1482
1483 /*
1484  * Display pages allocated per node and memory policy via /proc.
1485  */
1486
/*
 * Printable policy names.  mpol_to_str() indexes this table with the
 * policy mode, so the order presumably mirrors the numeric values of
 * the MPOL_* constants - verify against their definition before
 * reordering or extending it.
 */
static const char *policy_types[] = {
        "default",
        "prefer",
        "bind",
        "interleave"
};
1489
1490 /*
1491  * Convert a mempolicy into a string.
1492  * Returns the number of characters in buffer (if positive)
1493  * or an error (negative)
1494  */
1495 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1496 {
1497         char *p = buffer;
1498         int l;
1499         nodemask_t nodes;
1500         int mode = pol ? pol->policy : MPOL_DEFAULT;
1501
1502         switch (mode) {
1503         case MPOL_DEFAULT:
1504                 nodes_clear(nodes);
1505                 break;
1506
1507         case MPOL_PREFERRED:
1508                 nodes_clear(nodes);
1509                 node_set(pol->v.preferred_node, nodes);
1510                 break;
1511
1512         case MPOL_BIND:
1513                 get_zonemask(pol, &nodes);
1514                 break;
1515
1516         case MPOL_INTERLEAVE:
1517                 nodes = pol->v.nodes;
1518                 break;
1519
1520         default:
1521                 BUG();
1522                 return -EFAULT;
1523         }
1524
1525         l = strlen(policy_types[mode]);
1526         if (buffer + maxlen < p + l + 1)
1527                 return -ENOSPC;
1528
1529         strcpy(p, policy_types[mode]);
1530         p += l;
1531
1532         if (!nodes_empty(nodes)) {
1533                 if (buffer + maxlen < p + 2)
1534                         return -ENOSPC;
1535                 *p++ = '=';
1536                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1537         }
1538         return p - buffer;
1539 }
1540
/* Per-VMA page statistics accumulated by gather_stats(). */
struct numa_maps {
        unsigned long pages;            /* number of pages visited */
        unsigned long anon;             /* pages for which PageAnon() is true */
        unsigned long mapped;           /* pages with a non-zero mapcount */
        unsigned long mapcount_max;     /* highest mapcount seen on any page */
        unsigned long node[MAX_NUMNODES];       /* pages counted per node id */
};
1548
1549 static void gather_stats(struct page *page, void *private)
1550 {
1551         struct numa_maps *md = private;
1552         int count = page_mapcount(page);
1553
1554         if (count)
1555                 md->mapped++;
1556
1557         if (count > md->mapcount_max)
1558                 md->mapcount_max = count;
1559
1560         md->pages++;
1561
1562         if (PageAnon(page))
1563                 md->anon++;
1564
1565         md->node[page_to_nid(page)]++;
1566         cond_resched();
1567 }
1568
1569 int show_numa_map(struct seq_file *m, void *v)
1570 {
1571         struct task_struct *task = m->private;
1572         struct vm_area_struct *vma = v;
1573         struct numa_maps *md;
1574         int n;
1575         char buffer[50];
1576
1577         if (!vma->vm_mm)
1578                 return 0;
1579
1580         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1581         if (!md)
1582                 return 0;
1583
1584         check_pgd_range(vma, vma->vm_start, vma->vm_end,
1585                     &node_online_map, MPOL_MF_STATS, md);
1586
1587         if (md->pages) {
1588                 mpol_to_str(buffer, sizeof(buffer),
1589                             get_vma_policy(task, vma, vma->vm_start));
1590
1591                 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1592                            vma->vm_start, buffer, md->pages,
1593                            md->mapped, md->mapcount_max);
1594
1595                 if (md->anon)
1596                         seq_printf(m," anon=%lu",md->anon);
1597
1598                 for_each_online_node(n)
1599                         if (md->node[n])
1600                                 seq_printf(m, " N%d=%lu", n, md->node[n]);
1601
1602                 seq_putc(m, '\n');
1603         }
1604         kfree(md);
1605
1606         if (m->count < m->size)
1607                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1608         return 0;
1609 }
1610