mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87
  88 #include <asm/tlbflush.h>
  89 #include <asm/uaccess.h>
  90
/* Internal MPOL_MF_xxx flags (built on MPOL_MF_INTERNAL, never taken from user space here) */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */

/* Slab cache from which mpol_new() allocates struct mempolicy objects. */
static kmem_cache_t *policy_cache;
/* Second slab cache; its users are not in this part of the file
   (presumably shared-policy nodes -- TODO confirm). */
static kmem_cache_t *sn_cache;

/* Debug printout hook. Expands to nothing, so its arguments are never
   compiled unless this definition is changed. */
#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. bind_zonelist() uses this as the zone index on every node. */
int policy_zone = ZONE_DMA;

/* Fallback policy used by do_get_mempolicy() when neither the task nor
   the VMA carries a policy of its own. */
struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};
 107
 108 /* Do sanity checking on a policy */
 109 static int mpol_check_policy(int mode, nodemask_t *nodes)
 110 {
 111         int empty = nodes_empty(*nodes);
 112
 113         switch (mode) {
 114         case MPOL_DEFAULT:
 115                 if (!empty)
 116                         return -EINVAL;
 117                 break;
 118         case MPOL_BIND:
 119         case MPOL_INTERLEAVE:
 120                 /* Preferred will only use the first bit, but allow
 121                    more for now. */
 122                 if (empty)
 123                         return -EINVAL;
 124                 break;
 125         }
 126         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 127 }
 128 /* Generate a custom zonelist for the BIND policy. */
 129 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 130 {
 131         struct zonelist *zl;
 132         int num, max, nd;
 133
 134         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 135         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 136         if (!zl)
 137                 return NULL;
 138         num = 0;
 139         for_each_node_mask(nd, *nodes)
 140                 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 141         zl->zones[num] = NULL;
 142         return zl;
 143 }
 144
 145 /* Create a new policy */
 146 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 147 {
 148         struct mempolicy *policy;
 149
 150         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 151         if (mode == MPOL_DEFAULT)
 152                 return NULL;
 153         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 154         if (!policy)
 155                 return ERR_PTR(-ENOMEM);
 156         atomic_set(&policy->refcnt, 1);
 157         switch (mode) {
 158         case MPOL_INTERLEAVE:
 159                 policy->v.nodes = *nodes;
 160                 if (nodes_weight(*nodes) == 0) {
 161                         kmem_cache_free(policy_cache, policy);
 162                         return ERR_PTR(-EINVAL);
 163                 }
 164                 break;
 165         case MPOL_PREFERRED:
 166                 policy->v.preferred_node = first_node(*nodes);
 167                 if (policy->v.preferred_node >= MAX_NUMNODES)
 168                         policy->v.preferred_node = -1;
 169                 break;
 170         case MPOL_BIND:
 171                 policy->v.zonelist = bind_zonelist(nodes);
 172                 if (policy->v.zonelist == NULL) {
 173                         kmem_cache_free(policy_cache, policy);
 174                         return ERR_PTR(-ENOMEM);
 175                 }
 176                 break;
 177         }
 178         policy->policy = mode;
 179         return policy;
 180 }
 181
 182 /* Check if we are the only process mapping the page in question */
 183 static inline int single_mm_mapping(struct mm_struct *mm,
 184                         struct address_space *mapping)
 185 {
 186         struct vm_area_struct *vma;
 187         struct prio_tree_iter iter;
 188         int rc = 1;
 189
 190         spin_lock(&mapping->i_mmap_lock);
 191         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
 192                 if (mm != vma->vm_mm) {
 193                         rc = 0;
 194                         goto out;
 195                 }
 196         list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
 197                 if (mm != vma->vm_mm) {
 198                         rc = 0;
 199                         goto out;
 200                 }
 201 out:
 202         spin_unlock(&mapping->i_mmap_lock);
 203         return rc;
 204 }
 205
 206 /*
 207  * Add a page to be migrated to the pagelist
 208  */
 209 static void migrate_page_add(struct vm_area_struct *vma,
 210         struct page *page, struct list_head *pagelist, unsigned long flags)
 211 {
 212         /*
 213          * Avoid migrating a page that is shared by others and not writable.
 214          */
 215         if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
 216             mapping_writably_mapped(page->mapping) ||
 217             single_mm_mapping(vma->vm_mm, page->mapping)) {
 218                 int rc = isolate_lru_page(page);
 219
 220                 if (rc == 1)
 221                         list_add(&page->lru, pagelist);
 222                 /*
 223                  * If the isolate attempt was not successful then we just
 224                  * encountered an unswappable page. Something must be wrong.
 225                  */
 226                 WARN_ON(rc == 0);
 227         }
 228 }
 229
 230 /* Ensure all existing pages follow the policy. */
 231 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 232                 unsigned long addr, unsigned long end,
 233                 const nodemask_t *nodes, unsigned long flags,
 234                 struct list_head *pagelist)
 235 {
 236         pte_t *orig_pte;
 237         pte_t *pte;
 238         spinlock_t *ptl;
 239
 240         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 241         do {
 242                 struct page *page;
 243                 unsigned int nid;
 244
 245                 if (!pte_present(*pte))
 246                         continue;
 247                 page = vm_normal_page(vma, addr, *pte);
 248                 if (!page)
 249                         continue;
 250                 nid = page_to_nid(page);
 251                 if (!node_isset(nid, *nodes)) {
 252                         if (pagelist)
 253                                 migrate_page_add(vma, page, pagelist, flags);
 254                         else
 255                                 break;
 256                 }
 257         } while (pte++, addr += PAGE_SIZE, addr != end);
 258         pte_unmap_unlock(orig_pte, ptl);
 259         return addr != end;
 260 }
 261
 262 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 263                 unsigned long addr, unsigned long end,
 264                 const nodemask_t *nodes, unsigned long flags,
 265                 struct list_head *pagelist)
 266 {
 267         pmd_t *pmd;
 268         unsigned long next;
 269
 270         pmd = pmd_offset(pud, addr);
 271         do {
 272                 next = pmd_addr_end(addr, end);
 273                 if (pmd_none_or_clear_bad(pmd))
 274                         continue;
 275                 if (check_pte_range(vma, pmd, addr, next, nodes,
 276                                     flags, pagelist))
 277                         return -EIO;
 278         } while (pmd++, addr = next, addr != end);
 279         return 0;
 280 }
 281
 282 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 283                 unsigned long addr, unsigned long end,
 284                 const nodemask_t *nodes, unsigned long flags,
 285                 struct list_head *pagelist)
 286 {
 287         pud_t *pud;
 288         unsigned long next;
 289
 290         pud = pud_offset(pgd, addr);
 291         do {
 292                 next = pud_addr_end(addr, end);
 293                 if (pud_none_or_clear_bad(pud))
 294                         continue;
 295                 if (check_pmd_range(vma, pud, addr, next, nodes,
 296                                     flags, pagelist))
 297                         return -EIO;
 298         } while (pud++, addr = next, addr != end);
 299         return 0;
 300 }
 301
 302 static inline int check_pgd_range(struct vm_area_struct *vma,
 303                 unsigned long addr, unsigned long end,
 304                 const nodemask_t *nodes, unsigned long flags,
 305                 struct list_head *pagelist)
 306 {
 307         pgd_t *pgd;
 308         unsigned long next;
 309
 310         pgd = pgd_offset(vma->vm_mm, addr);
 311         do {
 312                 next = pgd_addr_end(addr, end);
 313                 if (pgd_none_or_clear_bad(pgd))
 314                         continue;
 315                 if (check_pud_range(vma, pgd, addr, next, nodes,
 316                                     flags, pagelist))
 317                         return -EIO;
 318         } while (pgd++, addr = next, addr != end);
 319         return 0;
 320 }
 321
 322 /* Check if a vma is migratable */
 323 static inline int vma_migratable(struct vm_area_struct *vma)
 324 {
 325         if (vma->vm_flags & (
 326                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
 327                 return 0;
 328         return 1;
 329 }
 330
 331 /*
 332  * Check if all pages in a range are on a set of nodes.
 333  * If pagelist != NULL then isolate pages from the LRU and
 334  * put them on the pagelist.
 335  */
 336 static struct vm_area_struct *
 337 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 338                 const nodemask_t *nodes, unsigned long flags,
 339                 struct list_head *pagelist)
 340 {
 341         int err;
 342         struct vm_area_struct *first, *vma, *prev;
 343
 344         first = find_vma(mm, start);
 345         if (!first)
 346                 return ERR_PTR(-EFAULT);
 347         prev = NULL;
 348         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 349                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 350                         if (!vma->vm_next && vma->vm_end < end)
 351                                 return ERR_PTR(-EFAULT);
 352                         if (prev && prev->vm_end < vma->vm_start)
 353                                 return ERR_PTR(-EFAULT);
 354                 }
 355                 if (!is_vm_hugetlb_page(vma) &&
 356                     ((flags & MPOL_MF_STRICT) ||
 357                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 358                                 vma_migratable(vma)))) {
 359                         unsigned long endvma = vma->vm_end;
 360
 361                         if (endvma > end)
 362                                 endvma = end;
 363                         if (vma->vm_start > start)
 364                                 start = vma->vm_start;
 365                         err = check_pgd_range(vma, start, endvma, nodes,
 366                                                 flags, pagelist);
 367                         if (err) {
 368                                 first = ERR_PTR(err);
 369                                 break;
 370                         }
 371                 }
 372                 prev = vma;
 373         }
 374         return first;
 375 }
 376
 377 /* Apply policy to a single VMA */
 378 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 379 {
 380         int err = 0;
 381         struct mempolicy *old = vma->vm_policy;
 382
 383         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 384                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 385                  vma->vm_ops, vma->vm_file,
 386                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 387
 388         if (vma->vm_ops && vma->vm_ops->set_policy)
 389                 err = vma->vm_ops->set_policy(vma, new);
 390         if (!err) {
 391                 mpol_get(new);
 392                 vma->vm_policy = new;
 393                 mpol_free(old);
 394         }
 395         return err;
 396 }
 397
 398 /* Step 2: apply policy to a range and do splits. */
 399 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 400                        unsigned long end, struct mempolicy *new)
 401 {
 402         struct vm_area_struct *next;
 403         int err;
 404
 405         err = 0;
 406         for (; vma && vma->vm_start < end; vma = next) {
 407                 next = vma->vm_next;
 408                 if (vma->vm_start < start)
 409                         err = split_vma(vma->vm_mm, vma, start, 1);
 410                 if (!err && vma->vm_end > end)
 411                         err = split_vma(vma->vm_mm, vma, end, 0);
 412                 if (!err)
 413                         err = policy_vma(vma, new);
 414                 if (err)
 415                         break;
 416         }
 417         return err;
 418 }
 419
 420 static int contextualize_policy(int mode, nodemask_t *nodes)
 421 {
 422         if (!nodes)
 423                 return 0;
 424
 425         /* Update current mems_allowed */
 426         cpuset_update_current_mems_allowed();
 427         /* Ignore nodes not set in current->mems_allowed */
 428         cpuset_restrict_to_mems_allowed(nodes->bits);
 429         return mpol_check_policy(mode, nodes);
 430 }
 431
 432 long do_mbind(unsigned long start, unsigned long len,
 433                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 434 {
 435         struct vm_area_struct *vma;
 436         struct mm_struct *mm = current->mm;
 437         struct mempolicy *new;
 438         unsigned long end;
 439         int err;
 440         LIST_HEAD(pagelist);
 441
 442         if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
 443             || mode > MPOL_MAX)
 444                 return -EINVAL;
 445         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
 446                 return -EPERM;
 447
 448         if (start & ~PAGE_MASK)
 449                 return -EINVAL;
 450
 451         if (mode == MPOL_DEFAULT)
 452                 flags &= ~MPOL_MF_STRICT;
 453
 454         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 455         end = start + len;
 456
 457         if (end < start)
 458                 return -EINVAL;
 459         if (end == start)
 460                 return 0;
 461
 462         if (mpol_check_policy(mode, nmask))
 463                 return -EINVAL;
 464
 465         new = mpol_new(mode, nmask);
 466         if (IS_ERR(new))
 467                 return PTR_ERR(new);
 468
 469         /*
 470          * If we are using the default policy then operation
 471          * on discontinuous address spaces is okay after all
 472          */
 473         if (!new)
 474                 flags |= MPOL_MF_DISCONTIG_OK;
 475
 476         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 477                         mode,nodes_addr(nodes)[0]);
 478
 479         down_write(&mm->mmap_sem);
 480         vma = check_range(mm, start, end, nmask, flags,
 481               (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
 482         err = PTR_ERR(vma);
 483         if (!IS_ERR(vma)) {
 484                 err = mbind_range(vma, start, end, new);
 485                 if (!list_empty(&pagelist))
 486                         migrate_pages(&pagelist, NULL);
 487                 if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
 488                         err = -EIO;
 489         }
 490         if (!list_empty(&pagelist))
 491                 putback_lru_pages(&pagelist);
 492
 493         up_write(&mm->mmap_sem);
 494         mpol_free(new);
 495         return err;
 496 }
 497
 498 /* Set the process memory policy */
 499 long do_set_mempolicy(int mode, nodemask_t *nodes)
 500 {
 501         struct mempolicy *new;
 502
 503         if (contextualize_policy(mode, nodes))
 504                 return -EINVAL;
 505         new = mpol_new(mode, nodes);
 506         if (IS_ERR(new))
 507                 return PTR_ERR(new);
 508         mpol_free(current->mempolicy);
 509         current->mempolicy = new;
 510         if (new && new->policy == MPOL_INTERLEAVE)
 511                 current->il_next = first_node(new->v.nodes);
 512         return 0;
 513 }
 514
 515 /* Fill a zone bitmap for a policy */
 516 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 517 {
 518         int i;
 519
 520         nodes_clear(*nodes);
 521         switch (p->policy) {
 522         case MPOL_BIND:
 523                 for (i = 0; p->v.zonelist->zones[i]; i++)
 524                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 525                                 *nodes);
 526                 break;
 527         case MPOL_DEFAULT:
 528                 break;
 529         case MPOL_INTERLEAVE:
 530                 *nodes = p->v.nodes;
 531                 break;
 532         case MPOL_PREFERRED:
 533                 /* or use current node instead of online map? */
 534                 if (p->v.preferred_node < 0)
 535                         *nodes = node_online_map;
 536                 else
 537                         node_set(p->v.preferred_node, *nodes);
 538                 break;
 539         default:
 540                 BUG();
 541         }
 542 }
 543
 544 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 545 {
 546         struct page *p;
 547         int err;
 548
 549         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 550         if (err >= 0) {
 551                 err = page_to_nid(p);
 552                 put_page(p);
 553         }
 554         return err;
 555 }
 556
 557 /* Retrieve NUMA policy */
 558 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 559                         unsigned long addr, unsigned long flags)
 560 {
 561         int err;
 562         struct mm_struct *mm = current->mm;
 563         struct vm_area_struct *vma = NULL;
 564         struct mempolicy *pol = current->mempolicy;
 565
 566         cpuset_update_current_mems_allowed();
 567         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 568                 return -EINVAL;
 569         if (flags & MPOL_F_ADDR) {
 570                 down_read(&mm->mmap_sem);
 571                 vma = find_vma_intersection(mm, addr, addr+1);
 572                 if (!vma) {
 573                         up_read(&mm->mmap_sem);
 574                         return -EFAULT;
 575                 }
 576                 if (vma->vm_ops && vma->vm_ops->get_policy)
 577                         pol = vma->vm_ops->get_policy(vma, addr);
 578                 else
 579                         pol = vma->vm_policy;
 580         } else if (addr)
 581                 return -EINVAL;
 582
 583         if (!pol)
 584                 pol = &default_policy;
 585
 586         if (flags & MPOL_F_NODE) {
 587                 if (flags & MPOL_F_ADDR) {
 588                         err = lookup_node(mm, addr);
 589                         if (err < 0)
 590                                 goto out;
 591                         *policy = err;
 592                 } else if (pol == current->mempolicy &&
 593                                 pol->policy == MPOL_INTERLEAVE) {
 594                         *policy = current->il_next;
 595                 } else {
 596                         err = -EINVAL;
 597                         goto out;
 598                 }
 599         } else
 600                 *policy = pol->policy;
 601
 602         if (vma) {
 603                 up_read(&current->mm->mmap_sem);
 604                 vma = NULL;
 605         }
 606
 607         err = 0;
 608         if (nmask)
 609                 get_zonemask(pol, nmask);
 610
 611  out:
 612         if (vma)
 613                 up_read(&current->mm->mmap_sem);
 614         return err;
 615 }
 616
 617 /*
 618  * For now migrate_pages simply swaps out the pages from nodes that are in
 619  * the source set but not in the target set. In the future, we would
 620  * want a function that moves pages between the two nodesets in such
 621  * a way as to preserve the physical layout as much as possible.
 622  *
 623  * Returns the number of page that could not be moved.
 624  */
 625 int do_migrate_pages(struct mm_struct *mm,
 626         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 627 {
 628         LIST_HEAD(pagelist);
 629         int count = 0;
 630         nodemask_t nodes;
 631
 632         nodes_andnot(nodes, *from_nodes, *to_nodes);
 633         nodes_complement(nodes, nodes);
 634
 635         down_read(&mm->mmap_sem);
 636         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
 637                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 638         if (!list_empty(&pagelist)) {
 639                 migrate_pages(&pagelist, NULL);
 640                 if (!list_empty(&pagelist))
 641                         count = putback_lru_pages(&pagelist);
 642         }
 643         up_read(&mm->mmap_sem);
 644         return count;
 645 }
 646
 647 /*
 648  * User space interface with variable sized bitmaps for nodelists.
 649  */
 650
 651 /* Copy a node mask from user space. */
 652 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 653                      unsigned long maxnode)
 654 {
 655         unsigned long k;
 656         unsigned long nlongs;
 657         unsigned long endmask;
 658
 659         --maxnode;
 660         nodes_clear(*nodes);
 661         if (maxnode == 0 || !nmask)
 662                 return 0;
 663
 664         nlongs = BITS_TO_LONGS(maxnode);
 665         if ((maxnode % BITS_PER_LONG) == 0)
 666                 endmask = ~0UL;
 667         else
 668                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 669
 670         /* When the user specified more nodes than supported just check
 671            if the non supported part is all zero. */
 672         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 673                 if (nlongs > PAGE_SIZE/sizeof(long))
 674                         return -EINVAL;
 675                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 676                         unsigned long t;
 677                         if (get_user(t, nmask + k))
 678                                 return -EFAULT;
 679                         if (k == nlongs - 1) {
 680                                 if (t & endmask)
 681                                         return -EINVAL;
 682                         } else if (t)
 683                                 return -EINVAL;
 684                 }
 685                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 686                 endmask = ~0UL;
 687         }
 688
 689         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 690                 return -EFAULT;
 691         nodes_addr(*nodes)[nlongs-1] &= endmask;
 692         return 0;
 693 }
 694
 695 /* Copy a kernel node mask to user space */
 696 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 697                               nodemask_t *nodes)
 698 {
 699         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 700         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 701
 702         if (copy > nbytes) {
 703                 if (copy > PAGE_SIZE)
 704                         return -EINVAL;
 705                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 706                         return -EFAULT;
 707                 copy = nbytes;
 708         }
 709         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 710 }
 711
 712 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 713                         unsigned long mode,
 714                         unsigned long __user *nmask, unsigned long maxnode,
 715                         unsigned flags)
 716 {
 717         nodemask_t nodes;
 718         int err;
 719
 720         err = get_nodes(&nodes, nmask, maxnode);
 721         if (err)
 722                 return err;
 723         return do_mbind(start, len, mode, &nodes, flags);
 724 }
 725
 726 /* Set the process memory policy */
 727 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 728                 unsigned long maxnode)
 729 {
 730         int err;
 731         nodemask_t nodes;
 732
 733         if (mode < 0 || mode > MPOL_MAX)
 734                 return -EINVAL;
 735         err = get_nodes(&nodes, nmask, maxnode);
 736         if (err)
 737                 return err;
 738         return do_set_mempolicy(mode, &nodes);
 739 }
 740
 741 /* Macro needed until Paul implements this function in kernel/cpusets.c */
 742 #define cpuset_mems_allowed(task) node_online_map
 743
 744 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 745                 const unsigned long __user *old_nodes,
 746                 const unsigned long __user *new_nodes)
 747 {
 748         struct mm_struct *mm;
 749         struct task_struct *task;
 750         nodemask_t old;
 751         nodemask_t new;
 752         nodemask_t task_nodes;
 753         int err;
 754
 755         err = get_nodes(&old, old_nodes, maxnode);
 756         if (err)
 757                 return err;
 758
 759         err = get_nodes(&new, new_nodes, maxnode);
 760         if (err)
 761                 return err;
 762
 763         /* Find the mm_struct */
 764         read_lock(&tasklist_lock);
 765         task = pid ? find_task_by_pid(pid) : current;
 766         if (!task) {
 767                 read_unlock(&tasklist_lock);
 768                 return -ESRCH;
 769         }
 770         mm = get_task_mm(task);
 771         read_unlock(&tasklist_lock);
 772
 773         if (!mm)
 774                 return -EINVAL;
 775
 776         /*
 777          * Check if this process has the right to modify the specified
 778          * process. The right exists if the process has administrative
 779          * capabilities, superuser priviledges or the same
 780          * userid as the target process.
 781          */
 782         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 783             (current->uid != task->suid) && (current->uid != task->uid) &&
 784             !capable(CAP_SYS_ADMIN)) {
 785                 err = -EPERM;
 786                 goto out;
 787         }
 788
 789         task_nodes = cpuset_mems_allowed(task);
 790         /* Is the user allowed to access the target nodes? */
 791         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
 792                 err = -EPERM;
 793                 goto out;
 794         }
 795
 796         err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
 797 out:
 798         mmput(mm);
 799         return err;
 800 }
 801
 802
 803 /* Retrieve NUMA policy */
 804 asmlinkage long sys_get_mempolicy(int __user *policy,
 805                                 unsigned long __user *nmask,
 806                                 unsigned long maxnode,
 807                                 unsigned long addr, unsigned long flags)
 808 {
 809         int err, pval;
 810         nodemask_t nodes;
 811
 812         if (nmask != NULL && maxnode < MAX_NUMNODES)
 813                 return -EINVAL;
 814
 815         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 816
 817         if (err)
 818                 return err;
 819
 820         if (policy && put_user(pval, policy))
 821                 return -EFAULT;
 822
 823         if (nmask)
 824                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 825
 826         return err;
 827 }
 828
 829 #ifdef CONFIG_COMPAT
 830
 831 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 832                                      compat_ulong_t __user *nmask,
 833                                      compat_ulong_t maxnode,
 834                                      compat_ulong_t addr, compat_ulong_t flags)
 835 {
 836         long err;
 837         unsigned long __user *nm = NULL;
 838         unsigned long nr_bits, alloc_size;
 839         DECLARE_BITMAP(bm, MAX_NUMNODES);
 840
 841         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 842         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 843
 844         if (nmask)
 845                 nm = compat_alloc_user_space(alloc_size);
 846
 847         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 848
 849         if (!err && nmask) {
 850                 err = copy_from_user(bm, nm, alloc_size);
 851                 /* ensure entire bitmap is zeroed */
 852                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 853                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 854         }
 855
 856         return err;
 857 }
 858
 859 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 860                                      compat_ulong_t maxnode)
 861 {
 862         long err = 0;
 863         unsigned long __user *nm = NULL;
 864         unsigned long nr_bits, alloc_size;
 865         DECLARE_BITMAP(bm, MAX_NUMNODES);
 866
 867         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 868         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 869
 870         if (nmask) {
 871                 err = compat_get_bitmap(bm, nmask, nr_bits);
 872                 nm = compat_alloc_user_space(alloc_size);
 873                 err |= copy_to_user(nm, bm, alloc_size);
 874         }
 875
 876         if (err)
 877                 return -EFAULT;
 878
 879         return sys_set_mempolicy(mode, nm, nr_bits+1);
 880 }
 881
 882 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 883                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 884                              compat_ulong_t maxnode, compat_ulong_t flags)
 885 {
 886         long err = 0;
 887         unsigned long __user *nm = NULL;
 888         unsigned long nr_bits, alloc_size;
 889         nodemask_t bm;
 890
 891         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 892         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 893
 894         if (nmask) {
 895                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 896                 nm = compat_alloc_user_space(alloc_size);
 897                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 898         }
 899
 900         if (err)
 901                 return -EFAULT;
 902
 903         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 904 }
 905
 906 #endif
 907
 908 /* Return effective policy for a VMA */
 909 struct mempolicy *
 910 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
 911 {
 912         struct mempolicy *pol = task->mempolicy;
 913
 914         if (vma) {
 915                 if (vma->vm_ops && vma->vm_ops->get_policy)
 916                         pol = vma->vm_ops->get_policy(vma, addr);
 917                 else if (vma->vm_policy &&
 918                                 vma->vm_policy->policy != MPOL_DEFAULT)
 919                         pol = vma->vm_policy;
 920         }
 921         if (!pol)
 922                 pol = &default_policy;
 923         return pol;
 924 }
 925
 926 /* Return a zonelist representing a mempolicy */
 927 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 928 {
 929         int nd;
 930
 931         switch (policy->policy) {
 932         case MPOL_PREFERRED:
 933                 nd = policy->v.preferred_node;
 934                 if (nd < 0)
 935                         nd = numa_node_id();
 936                 break;
 937         case MPOL_BIND:
 938                 /* Lower zones don't get a policy applied */
 939                 /* Careful: current->mems_allowed might have moved */
 940                 if (gfp_zone(gfp) >= policy_zone)
 941                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 942                                 return policy->v.zonelist;
 943                 /*FALL THROUGH*/
 944         case MPOL_INTERLEAVE: /* should not happen */
 945         case MPOL_DEFAULT:
 946                 nd = numa_node_id();
 947                 break;
 948         default:
 949                 nd = 0;
 950                 BUG();
 951         }
 952         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 953 }
 954
 955 /* Do dynamic interleaving for a process */
 956 static unsigned interleave_nodes(struct mempolicy *policy)
 957 {
 958         unsigned nid, next;
 959         struct task_struct *me = current;
 960
 961         nid = me->il_next;
 962         next = next_node(nid, policy->v.nodes);
 963         if (next >= MAX_NUMNODES)
 964                 next = first_node(policy->v.nodes);
 965         me->il_next = next;
 966         return nid;
 967 }
 968
 969 /* Do static interleaving for a VMA with known offset. */
 970 static unsigned offset_il_node(struct mempolicy *pol,
 971                 struct vm_area_struct *vma, unsigned long off)
 972 {
 973         unsigned nnodes = nodes_weight(pol->v.nodes);
 974         unsigned target = (unsigned)off % nnodes;
 975         int c;
 976         int nid = -1;
 977
 978         c = 0;
 979         do {
 980                 nid = next_node(nid, pol->v.nodes);
 981                 c++;
 982         } while (c <= target);
 983         return nid;
 984 }
 985
 986 /* Determine a node number for interleave */
 987 static inline unsigned interleave_nid(struct mempolicy *pol,
 988                  struct vm_area_struct *vma, unsigned long addr, int shift)
 989 {
 990         if (vma) {
 991                 unsigned long off;
 992
 993                 off = vma->vm_pgoff;
 994                 off += (addr - vma->vm_start) >> shift;
 995                 return offset_il_node(pol, vma, off);
 996         } else
 997                 return interleave_nodes(pol);
 998 }
 999
1000 /* Return a zonelist suitable for a huge page allocation. */
1001 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1002 {
1003         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1004
1005         if (pol->policy == MPOL_INTERLEAVE) {
1006                 unsigned nid;
1007
1008                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1009                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1010         }
1011         return zonelist_policy(GFP_HIGHUSER, pol);
1012 }
1013
1014 /* Allocate a page in interleaved policy.
1015    Own path because it needs to do special accounting. */
1016 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1017                                         unsigned nid)
1018 {
1019         struct zonelist *zl;
1020         struct page *page;
1021
1022         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1023         page = __alloc_pages(gfp, order, zl);
1024         if (page && page_zone(page) == zl->zones[0]) {
1025                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1026                 put_cpu();
1027         }
1028         return page;
1029 }
1030
1031 /**
1032  *      alloc_page_vma  - Allocate a page for a VMA.
1033  *
1034  *      @gfp:
1035  *      %GFP_USER    user allocation.
1036  *      %GFP_KERNEL  kernel allocations,
1037  *      %GFP_HIGHMEM highmem/user allocations,
1038  *      %GFP_FS      allocation should not call back into a file system.
1039  *      %GFP_ATOMIC  don't sleep.
1040  *
1041  *      @vma:  Pointer to VMA or NULL if not available.
1042  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1043  *
1044  *      This function allocates a page from the kernel page pool and applies
1045  *      a NUMA policy associated with the VMA or the current process.
1046  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1047  *      mm_struct of the VMA to prevent it from going away. Should be used for
1048  *      all allocations for pages that will be mapped into
1049  *      user space. Returns NULL when no page can be allocated.
1050  *
1051  *      Should be called with the mm_sem of the vma hold.
1052  */
1053 struct page *
1054 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1055 {
1056         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1057
1058         cpuset_update_current_mems_allowed();
1059
1060         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1061                 unsigned nid;
1062
1063                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1064                 return alloc_page_interleave(gfp, 0, nid);
1065         }
1066         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1067 }
1068
1069 /**
1070  *      alloc_pages_current - Allocate pages.
1071  *
1072  *      @gfp:
1073  *              %GFP_USER   user allocation,
1074  *              %GFP_KERNEL kernel allocation,
1075  *              %GFP_HIGHMEM highmem allocation,
1076  *              %GFP_FS     don't call back into a file system.
1077  *              %GFP_ATOMIC don't sleep.
1078  *      @order: Power of two of allocation size in pages. 0 is a single page.
1079  *
1080  *      Allocate a page from the kernel page pool.  When not in
1081  *      interrupt context and apply the current process NUMA policy.
1082  *      Returns NULL when no page can be allocated.
1083  *
1084  *      Don't call cpuset_update_current_mems_allowed() unless
1085  *      1) it's ok to take cpuset_sem (can WAIT), and
1086  *      2) allocating for current task (not interrupt).
1087  */
1088 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1089 {
1090         struct mempolicy *pol = current->mempolicy;
1091
1092         if ((gfp & __GFP_WAIT) && !in_interrupt())
1093                 cpuset_update_current_mems_allowed();
1094         if (!pol || in_interrupt())
1095                 pol = &default_policy;
1096         if (pol->policy == MPOL_INTERLEAVE)
1097                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1098         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1099 }
1100 EXPORT_SYMBOL(alloc_pages_current);
1101
1102 /* Slow path of a mempolicy copy */
1103 struct mempolicy *__mpol_copy(struct mempolicy *old)
1104 {
1105         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1106
1107         if (!new)
1108                 return ERR_PTR(-ENOMEM);
1109         *new = *old;
1110         atomic_set(&new->refcnt, 1);
1111         if (new->policy == MPOL_BIND) {
1112                 int sz = ksize(old->v.zonelist);
1113                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1114                 if (!new->v.zonelist) {
1115                         kmem_cache_free(policy_cache, new);
1116                         return ERR_PTR(-ENOMEM);
1117                 }
1118                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1119         }
1120         return new;
1121 }
1122
1123 /* Slow path of a mempolicy comparison */
1124 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1125 {
1126         if (!a || !b)
1127                 return 0;
1128         if (a->policy != b->policy)
1129                 return 0;
1130         switch (a->policy) {
1131         case MPOL_DEFAULT:
1132                 return 1;
1133         case MPOL_INTERLEAVE:
1134                 return nodes_equal(a->v.nodes, b->v.nodes);
1135         case MPOL_PREFERRED:
1136                 return a->v.preferred_node == b->v.preferred_node;
1137         case MPOL_BIND: {
1138                 int i;
1139                 for (i = 0; a->v.zonelist->zones[i]; i++)
1140                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1141                                 return 0;
1142                 return b->v.zonelist->zones[i] == NULL;
1143         }
1144         default:
1145                 BUG();
1146                 return 0;
1147         }
1148 }
1149
1150 /* Slow path of a mpol destructor. */
1151 void __mpol_free(struct mempolicy *p)
1152 {
1153         if (!atomic_dec_and_test(&p->refcnt))
1154                 return;
1155         if (p->policy == MPOL_BIND)
1156                 kfree(p->v.zonelist);
1157         p->policy = MPOL_DEFAULT;
1158         kmem_cache_free(policy_cache, p);
1159 }
1160
1161 /*
1162  * Shared memory backing store policy support.
1163  *
1164  * Remember policies even when nobody has shared memory mapped.
1165  * The policies are kept in Red-Black tree linked from the inode.
1166  * They are protected by the sp->lock spinlock, which should be held
1167  * for any accesses to the tree.
1168  */
1169
1170 /* lookup first element intersecting start-end */
1171 /* Caller holds sp->lock */
1172 static struct sp_node *
1173 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1174 {
1175         struct rb_node *n = sp->root.rb_node;
1176
1177         while (n) {
1178                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1179
1180                 if (start >= p->end)
1181                         n = n->rb_right;
1182                 else if (end <= p->start)
1183                         n = n->rb_left;
1184                 else
1185                         break;
1186         }
1187         if (!n)
1188                 return NULL;
1189         for (;;) {
1190                 struct sp_node *w = NULL;
1191                 struct rb_node *prev = rb_prev(n);
1192                 if (!prev)
1193                         break;
1194                 w = rb_entry(prev, struct sp_node, nd);
1195                 if (w->end <= start)
1196                         break;
1197                 n = prev;
1198         }
1199         return rb_entry(n, struct sp_node, nd);
1200 }
1201
1202 /* Insert a new shared policy into the list. */
1203 /* Caller holds sp->lock */
1204 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1205 {
1206         struct rb_node **p = &sp->root.rb_node;
1207         struct rb_node *parent = NULL;
1208         struct sp_node *nd;
1209
1210         while (*p) {
1211                 parent = *p;
1212                 nd = rb_entry(parent, struct sp_node, nd);
1213                 if (new->start < nd->start)
1214                         p = &(*p)->rb_left;
1215                 else if (new->end > nd->end)
1216                         p = &(*p)->rb_right;
1217                 else
1218                         BUG();
1219         }
1220         rb_link_node(&new->nd, parent, p);
1221         rb_insert_color(&new->nd, &sp->root);
1222         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1223                  new->policy ? new->policy->policy : 0);
1224 }
1225
1226 /* Find shared policy intersecting idx */
1227 struct mempolicy *
1228 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1229 {
1230         struct mempolicy *pol = NULL;
1231         struct sp_node *sn;
1232
1233         if (!sp->root.rb_node)
1234                 return NULL;
1235         spin_lock(&sp->lock);
1236         sn = sp_lookup(sp, idx, idx+1);
1237         if (sn) {
1238                 mpol_get(sn->policy);
1239                 pol = sn->policy;
1240         }
1241         spin_unlock(&sp->lock);
1242         return pol;
1243 }
1244
1245 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1246 {
1247         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1248         rb_erase(&n->nd, &sp->root);
1249         mpol_free(n->policy);
1250         kmem_cache_free(sn_cache, n);
1251 }
1252
1253 struct sp_node *
1254 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1255 {
1256         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1257
1258         if (!n)
1259                 return NULL;
1260         n->start = start;
1261         n->end = end;
1262         mpol_get(pol);
1263         n->policy = pol;
1264         return n;
1265 }
1266
/*
 * Replace a policy range.
 *
 * Clears [start, end) in the shared policy tree of @sp: nodes lying
 * entirely inside the range are deleted, nodes that overlap one edge are
 * trimmed, and a node that spans the whole range is split in two.  If
 * @new is non-NULL it is then inserted.
 *
 * Returns 0 on success, or -ENOMEM if the extra node needed for a split
 * cannot be allocated (in which case @new has not been inserted).
 */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		/* Fetch the successor first: n may be deleted below. */
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			/* n starts inside the range: drop it or trim its head. */
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					/*
					 * Need a second node for the tail piece.
					 * The lock is dropped around the
					 * allocation, presumably because
					 * sp_alloc() uses GFP_KERNEL, and the
					 * lookup is redone from scratch.
					 *
					 * NOTE(review): n->end and n->policy
					 * are read after sp->lock has been
					 * released - confirm nothing can
					 * modify or free n in that window.
					 */
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				/* Keep the head in n, put the tail in new2. */
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				/* n only overlaps the start: trim its tail. */
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	/* new2 was allocated but turned out not to be needed after restart. */
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}
1314
1315 int mpol_set_shared_policy(struct shared_policy *info,
1316                         struct vm_area_struct *vma, struct mempolicy *npol)
1317 {
1318         int err;
1319         struct sp_node *new = NULL;
1320         unsigned long sz = vma_pages(vma);
1321
1322         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1323                  vma->vm_pgoff,
1324                  sz, npol? npol->policy : -1,
1325                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1326
1327         if (npol) {
1328                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1329                 if (!new)
1330                         return -ENOMEM;
1331         }
1332         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1333         if (err && new)
1334                 kmem_cache_free(sn_cache, new);
1335         return err;
1336 }
1337
1338 /* Free a backing policy store on inode delete. */
1339 void mpol_free_shared_policy(struct shared_policy *p)
1340 {
1341         struct sp_node *n;
1342         struct rb_node *next;
1343
1344         if (!p->root.rb_node)
1345                 return;
1346         spin_lock(&p->lock);
1347         next = rb_first(&p->root);
1348         while (next) {
1349                 n = rb_entry(next, struct sp_node, nd);
1350                 next = rb_next(&n->nd);
1351                 rb_erase(&n->nd, &p->root);
1352                 mpol_free(n->policy);
1353                 kmem_cache_free(sn_cache, n);
1354         }
1355         spin_unlock(&p->lock);
1356 }
1357
1358 /* assumes fs == KERNEL_DS */
1359 void __init numa_policy_init(void)
1360 {
1361         policy_cache = kmem_cache_create("numa_policy",
1362                                          sizeof(struct mempolicy),
1363                                          0, SLAB_PANIC, NULL, NULL);
1364
1365         sn_cache = kmem_cache_create("shared_policy_node",
1366                                      sizeof(struct sp_node),
1367                                      0, SLAB_PANIC, NULL, NULL);
1368
1369         /* Set interleaving policy for system init. This way not all
1370            the data structures allocated at system boot end up in node zero. */
1371
1372         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1373                 printk("numa_policy_init: interleaving failed\n");
1374 }
1375
1376 /* Reset policy of current process to default */
1377 void numa_default_policy(void)
1378 {
1379         do_set_mempolicy(MPOL_DEFAULT, NULL);
1380 }
1381
1382 /* Migrate a policy to a different set of nodes */
1383 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1384                                                         const nodemask_t *new)
1385 {
1386         nodemask_t tmp;
1387
1388         if (!pol)
1389                 return;
1390
1391         switch (pol->policy) {
1392         case MPOL_DEFAULT:
1393                 break;
1394         case MPOL_INTERLEAVE:
1395                 nodes_remap(tmp, pol->v.nodes, *old, *new);
1396                 pol->v.nodes = tmp;
1397                 current->il_next = node_remap(current->il_next, *old, *new);
1398                 break;
1399         case MPOL_PREFERRED:
1400                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1401                                                                 *old, *new);
1402                 break;
1403         case MPOL_BIND: {
1404                 nodemask_t nodes;
1405                 struct zone **z;
1406                 struct zonelist *zonelist;
1407
1408                 nodes_clear(nodes);
1409                 for (z = pol->v.zonelist->zones; *z; z++)
1410                         node_set((*z)->zone_pgdat->node_id, nodes);
1411                 nodes_remap(tmp, nodes, *old, *new);
1412                 nodes = tmp;
1413
1414                 zonelist = bind_zonelist(&nodes);
1415
1416                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1417                  * If that old zonelist has no remaining mems_allowed nodes,
1418                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1419                  */
1420
1421                 if (zonelist) {
1422                         /* Good - got mem - substitute new zonelist */
1423                         kfree(pol->v.zonelist);
1424                         pol->v.zonelist = zonelist;
1425                 }
1426                 break;
1427         }
1428         default:
1429                 BUG();
1430                 break;
1431         }
1432 }
1433
1434 /*
1435  * Someone moved this task to different nodes.  Fixup mempolicies.
1436  *
1437  * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1438  * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1439  */
1440 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1441 {
1442         rebind_policy(current->mempolicy, old, new);
1443 }