X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=mm%2Fmempolicy.c;h=5abc57c2b8bdd3804708267832fd20d29a65be71;hb=ba5c4f1bae89eba7b03e58a5448e8b28a006d4df;hp=8bc0be1c9efdaee5259986c616052353462286cf;hpb=dfcd3c0dc426bb75770c34b40e14f2da8845ea62;p=powerpc.git diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8bc0be1c9e..5abc57c2b8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2,6 +2,7 @@ * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. + * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * Subject to the GNU Public License, version 2. * * NUMA policy allows the user to give hints in which node(s) memory should @@ -17,13 +18,19 @@ * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. + * * bind Only allocate memory on a specific set of nodes, * no fallback. + * FIXME: memory is allocated starting with the first node + * to the last. It would be better if bind would truly restrict + * the allocation to memory nodes instead + * * preferred Try a specific node first before normal fallback. * As a special case node -1 here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. + * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. @@ -113,56 +120,6 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) } return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; } - -/* Copy a node mask from user space. */ -static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, - unsigned long maxnode, int mode) -{ - unsigned long k; - unsigned long nlongs; - unsigned long endmask; - - --maxnode; - nodes_clear(*nodes); - if (maxnode == 0 || !nmask) - return 0; - - nlongs = BITS_TO_LONGS(maxnode); - if ((maxnode % BITS_PER_LONG) == 0) - endmask = ~0UL; - else - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - - /* When the user specified more nodes than supported just check - if the non supported part is all zero. */ - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - if (nlongs > PAGE_SIZE/sizeof(long)) - return -EINVAL; - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - unsigned long t; - if (get_user(t, nmask + k)) - return -EFAULT; - if (k == nlongs - 1) { - if (t & endmask) - return -EINVAL; - } else if (t) - return -EINVAL; - } - nlongs = BITS_TO_LONGS(MAX_NUMNODES); - endmask = ~0UL; - } - - if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) - return -EFAULT; - nodes_addr(*nodes)[nlongs-1] &= endmask; - /* Update current mems_allowed */ - cpuset_update_current_mems_allowed(); - /* Ignore nodes not set in current->mems_allowed */ - /* AK: shouldn't this error out instead? */ - cpuset_restrict_to_mems_allowed(nodes_addr(*nodes)); - return mpol_check_policy(mode, nodes); -} - /* Generate a custom zonelist for the BIND policy. */ static struct zonelist *bind_zonelist(nodemask_t *nodes) { @@ -185,7 +142,6 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) policy_zone = k; } } - BUG_ON(num >= max); zl->zones[num] = NULL; return zl; } @@ -224,14 +180,14 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) } /* Ensure all existing pages follow the policy. */ -static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, +static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, nodemask_t *nodes) { pte_t *orig_pte; pte_t *pte; + spinlock_t *ptl; - spin_lock(&mm->page_table_lock); - orig_pte = pte = pte_offset_map(pmd, addr); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { unsigned long pfn; unsigned int nid; @@ -239,18 +195,19 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, if (!pte_present(*pte)) continue; pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) + if (!pfn_valid(pfn)) { + print_bad_pte(vma, *pte, addr); continue; + } nid = pfn_to_nid(pfn); if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(orig_pte); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(orig_pte, ptl); return addr != end; } -static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, +static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, nodemask_t *nodes) { pmd_t *pmd; @@ -261,13 +218,13 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - if (check_pte_range(mm, pmd, addr, next, nodes)) + if (check_pte_range(vma, pmd, addr, next, nodes)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } -static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, +static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, nodemask_t *nodes) { pud_t *pud; @@ -278,24 +235,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(mm, pud, addr, next, nodes)) + if (check_pmd_range(vma, pud, addr, next, nodes)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } -static inline int check_pgd_range(struct mm_struct *mm, +static inline int check_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, nodemask_t *nodes) { pgd_t *pgd; unsigned long next; - pgd = pgd_offset(mm, addr); + pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(mm, pgd, addr, next, nodes)) + if (check_pud_range(vma, pgd, addr, next, nodes)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; @@ -312,6 +269,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); + if (first->vm_flags & VM_RESERVED) + return ERR_PTR(-EACCES); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { if (!vma->vm_next && vma->vm_end < end) @@ -324,8 +283,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, endvma = end; if (vma->vm_start > start) start = vma->vm_start; - err = check_pgd_range(vma->vm_mm, - start, endvma, nodes); + err = check_pgd_range(vma, start, endvma, nodes); if (err) { first = ERR_PTR(err); break; @@ -379,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start, return err; } -/* Change policy for a memory range */ -asmlinkage long sys_mbind(unsigned long start, unsigned long len, - unsigned long mode, - unsigned long __user *nmask, unsigned long maxnode, - unsigned flags) +static int contextualize_policy(int mode, nodemask_t *nodes) +{ + if (!nodes) + return 0; + + /* Update current mems_allowed */ + cpuset_update_current_mems_allowed(); + /* Ignore nodes not set in current->mems_allowed */ + cpuset_restrict_to_mems_allowed(nodes->bits); + return mpol_check_policy(mode, nodes); +} + +long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; - nodemask_t nodes; int err; if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) @@ -404,12 +370,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, return -EINVAL; if (end == start) return 0; - - err = get_nodes(&nodes, nmask, maxnode, mode); - if (err) - return err; - - new = mpol_new(mode, &nodes); + if (mpol_check_policy(mode, nmask)) + return -EINVAL; + new = mpol_new(mode, nmask); if (IS_ERR(new)) return PTR_ERR(new); @@ -417,7 +380,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, mode,nodes_addr(nodes)[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, &nodes, flags); + vma = check_range(mm, start, end, nmask, flags); err = PTR_ERR(vma); if (!IS_ERR(vma)) err = mbind_range(vma, start, end, new); @@ -427,19 +390,13 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, } /* Set the process memory policy */ -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, - unsigned long maxnode) +long do_set_mempolicy(int mode, nodemask_t *nodes) { - int err; struct mempolicy *new; - nodemask_t nodes; - if (mode < 0 || mode > MPOL_MAX) + if (contextualize_policy(mode, nodes)) return -EINVAL; - err = get_nodes(&nodes, nmask, maxnode, mode); - if (err) - return err; - new = mpol_new(mode, &nodes); + new = mpol_new(mode, nodes); if (IS_ERR(new)) return PTR_ERR(new); mpol_free(current->mempolicy); @@ -458,7 +415,8 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) switch (p->policy) { case MPOL_BIND: for (i = 0; p->v.zonelist->zones[i]; i++) - node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes); + node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, + *nodes); break; case MPOL_DEFAULT: break; @@ -490,38 +448,18 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) return err; } -/* Copy a kernel node mask to user space */ -static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, - nodemask_t *nodes) -{ - unsigned long copy = ALIGN(maxnode-1, 64) / 8; - const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); - - if (copy > nbytes) { - if (copy > PAGE_SIZE) - return -EINVAL; - if (clear_user((char __user *)mask + nbytes, copy - nbytes)) - return -EFAULT; - copy = nbytes; - } - return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; -} - /* Retrieve NUMA policy */ -asmlinkage long sys_get_mempolicy(int __user *policy, - unsigned long __user *nmask, - unsigned long maxnode, - unsigned long addr, unsigned long flags) +long do_get_mempolicy(int *policy, nodemask_t *nmask, + unsigned long addr, unsigned long flags) { - int err, pval; + int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; + cpuset_update_current_mems_allowed(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; - if (nmask != NULL && maxnode < MAX_NUMNODES) - return -EINVAL; if (flags & MPOL_F_ADDR) { down_read(&mm->mmap_sem); vma = find_vma_intersection(mm, addr, addr+1); @@ -544,31 +482,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy, err = lookup_node(mm, addr); if (err < 0) goto out; - pval = err; + *policy = err; } else if (pol == current->mempolicy && pol->policy == MPOL_INTERLEAVE) { - pval = current->il_next; + *policy = current->il_next; } else { err = -EINVAL; goto out; } } else - pval = pol->policy; + *policy = pol->policy; if (vma) { up_read(¤t->mm->mmap_sem); vma = NULL; } - if (policy && put_user(pval, policy)) - return -EFAULT; - err = 0; - if (nmask) { - nodemask_t nodes; - get_zonemask(pol, &nodes); - err = copy_nodes_to_user(nmask, maxnode, &nodes); - } + if (nmask) + get_zonemask(pol, nmask); out: if (vma) @@ -576,6 +508,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy, return err; } +/* + * User space interface with variable sized bitmaps for nodelists. + */ + +/* Copy a node mask from user space. */ +static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long k; + unsigned long nlongs; + unsigned long endmask; + + --maxnode; + nodes_clear(*nodes); + if (maxnode == 0 || !nmask) + return 0; + + nlongs = BITS_TO_LONGS(maxnode); + if ((maxnode % BITS_PER_LONG) == 0) + endmask = ~0UL; + else + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + + /* When the user specified more nodes than supported just check + if the non supported part is all zero. */ + if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { + if (nlongs > PAGE_SIZE/sizeof(long)) + return -EINVAL; + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { + unsigned long t; + if (get_user(t, nmask + k)) + return -EFAULT; + if (k == nlongs - 1) { + if (t & endmask) + return -EINVAL; + } else if (t) + return -EINVAL; + } + nlongs = BITS_TO_LONGS(MAX_NUMNODES); + endmask = ~0UL; + } + + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) + return -EFAULT; + nodes_addr(*nodes)[nlongs-1] &= endmask; + return 0; +} + +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, + nodemask_t *nodes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + + if (copy > nbytes) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char __user *)mask + nbytes, copy - nbytes)) + return -EFAULT; + copy = nbytes; + } + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; +} + +asmlinkage long sys_mbind(unsigned long start, unsigned long len, + unsigned long mode, + unsigned long __user *nmask, unsigned long maxnode, + unsigned flags) +{ + nodemask_t nodes; + int err; + + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_mbind(start, len, mode, &nodes, flags); +} + +/* Set the process memory policy */ +asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, + unsigned long maxnode) +{ + int err; + nodemask_t nodes; + + if (mode < 0 || mode > MPOL_MAX) + return -EINVAL; + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_set_mempolicy(mode, &nodes); +} + +/* Retrieve NUMA policy */ +asmlinkage long sys_get_mempolicy(int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, unsigned long flags) +{ + int err, pval; + nodemask_t nodes; + + if (nmask != NULL && maxnode < MAX_NUMNODES) + return -EINVAL; + + err = do_get_mempolicy(&pval, &nodes, addr, flags); + + if (err) + return err; + + if (policy && put_user(pval, policy)) + return -EFAULT; + + if (nmask) + err = copy_nodes_to_user(nmask, maxnode, &nodes); + + return err; +} + #ifdef CONFIG_COMPAT asmlinkage long compat_sys_get_mempolicy(int __user *policy, @@ -663,7 +715,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr); + pol = vma->vm_ops->get_policy(vma, addr); else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) pol = vma->vm_policy; @@ -709,7 +761,6 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - BUG_ON(nid >= MAX_NUMNODES); next = next_node(nid, policy->v.nodes); if (next >= MAX_NUMNODES) next = first_node(policy->v.nodes); @@ -731,18 +782,17 @@ static unsigned offset_il_node(struct mempolicy *pol, nid = next_node(nid, pol->v.nodes); c++; } while (c <= target); - BUG_ON(nid >= MAX_NUMNODES); return nid; } /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ -static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) +static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, + unsigned nid) { struct zonelist *zl; struct page *page; - BUG_ON(!node_online(nid)); zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); if (page && page_zone(page) == zl->zones[0]) { @@ -785,8 +835,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) unsigned nid; if (vma) { unsigned long off; - BUG_ON(addr >= vma->vm_end); - BUG_ON(addr < vma->vm_start); off = vma->vm_pgoff; off += (addr - vma->vm_start) >> PAGE_SHIFT; nid = offset_il_node(pol, vma, off); @@ -1150,14 +1198,75 @@ void __init numa_policy_init(void) /* Set interleaving policy for system init. This way not all the data structures allocated at system boot end up in node zero. */ - if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), - MAX_NUMNODES) < 0) + if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) printk("numa_policy_init: interleaving failed\n"); } -/* Reset policy of current process to default. - * Assumes fs == KERNEL_DS */ +/* Reset policy of current process to default */ void numa_default_policy(void) { - sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); + do_set_mempolicy(MPOL_DEFAULT, NULL); +} + +/* Migrate a policy to a different set of nodes */ +static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, + const nodemask_t *new) +{ + nodemask_t tmp; + + if (!pol) + return; + + switch (pol->policy) { + case MPOL_DEFAULT: + break; + case MPOL_INTERLEAVE: + nodes_remap(tmp, pol->v.nodes, *old, *new); + pol->v.nodes = tmp; + current->il_next = node_remap(current->il_next, *old, *new); + break; + case MPOL_PREFERRED: + pol->v.preferred_node = node_remap(pol->v.preferred_node, + *old, *new); + break; + case MPOL_BIND: { + nodemask_t nodes; + struct zone **z; + struct zonelist *zonelist; + + nodes_clear(nodes); + for (z = pol->v.zonelist->zones; *z; z++) + node_set((*z)->zone_pgdat->node_id, nodes); + nodes_remap(tmp, nodes, *old, *new); + nodes = tmp; + + zonelist = bind_zonelist(&nodes); + + /* If no mem, then zonelist is NULL and we keep old zonelist. + * If that old zonelist has no remaining mems_allowed nodes, + * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. + */ + + if (zonelist) { + /* Good - got mem - substitute new zonelist */ + kfree(pol->v.zonelist); + pol->v.zonelist = zonelist; + } + break; + } + default: + BUG(); + break; + } +} + +/* + * Someone moved this task to different nodes. Fixup mempolicies. + * + * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, + * once we have a cpuset mechanism to mark which cpuset subtree is migrating. + */ +void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) +{ + rebind_policy(current->mempolicy, old, new); }