X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=kernel%2Fcpuset.c;h=2a75e44e1a41355bc55cc4819f167e4bc63f0cab;hb=a4fc7ab1d065a9dd89ed0e74439ef87d4a16e980;hp=5a06fef669f80706cf3ad389379ccae3b905af49;hpb=3e0d98b9f1eb757fc98efc84e74e54a08308aa73;p=powerpc.git diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5a06fef669..2a75e44e1a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -54,7 +55,14 @@ #include #include -#define CPUSET_SUPER_MAGIC 0x27e0eb +#define CPUSET_SUPER_MAGIC 0x27e0eb + +/* + * Tracks how many cpusets are currently defined in system. + * When there is only one cpuset (the root cpuset) we can + * short circuit some hooks. + */ +int number_of_cpusets __read_mostly; /* See "Frequency meter" comments, below. */ @@ -154,9 +162,6 @@ static struct cpuset top_cpuset = { .count = ATOMIC_INIT(0), .sibling = LIST_HEAD_INIT(top_cpuset.sibling), .children = LIST_HEAD_INIT(top_cpuset.children), - .parent = NULL, - .dentry = NULL, - .mems_generation = 0, }; static struct vfsmount *cpuset_mount; @@ -244,6 +249,11 @@ static struct super_block *cpuset_sb; * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task->alloc_lock) already in the task_struct routinely used for * such matters. + * + * P.S. One more locking exception. RCU is used to guard the + * update of a tasks cpuset pointer by attach_task() and the + * access of task->cpuset->mems_generation via that pointer in + * the routine cpuset_update_task_memory_state(). */ static DECLARE_MUTEX(manage_sem); @@ -321,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) spin_lock(&dcache_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_child); + struct dentry *d = list_entry(node, struct dentry, d_u.d_child); list_del_init(node); if (d->d_inode) { d = dget_locked(d); @@ -333,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) } node = dentry->d_subdirs.next; } - list_del_init(&dentry->d_child); + list_del_init(&dentry->d_u.d_child); spin_unlock(&dcache_lock); remove_dir(dentry); } @@ -587,20 +597,43 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) BUG_ON(!nodes_intersects(*pmask, node_online_map)); } -/* - * Refresh current tasks mems_allowed and mems_generation from current - * tasks cpuset. +/** + * cpuset_update_task_memory_state - update task memory placement + * + * If the current tasks cpusets mems_allowed changed behind our + * backs, update current->mems_allowed, mems_generation and task NUMA + * mempolicy to the new value. * - * Call without callback_sem or task_lock() held. May be called with - * or without manage_sem held. Will acquire task_lock() and might - * acquire callback_sem during call. + * Task mempolicy is updated by rebinding it relative to the + * current->cpuset if a task has its memory placement changed. + * Do not call this routine if in_interrupt(). * - * The task_lock() is required to dereference current->cpuset safely. - * Without it, we could pick up the pointer value of current->cpuset - * in one instruction, and then attach_task could give us a different - * cpuset, and then the cpuset we had could be removed and freed, - * and then on our next instruction, we could dereference a no longer - * valid cpuset pointer to get its mems_generation field. + * Call without callback_sem or task_lock() held. May be called + * with or without manage_sem held. Doesn't need task_lock to guard + * against another task changing a non-NULL cpuset pointer to NULL, + * as that is only done by a task on itself, and if the current task + * is here, it is not simultaneously in the exit code NULL'ing its + * cpuset pointer. This routine also might acquire callback_sem and + * current->mm->mmap_sem during call. + * + * Reading current->cpuset->mems_generation doesn't need task_lock + * to guard the current->cpuset derefence, because it is guarded + * from concurrent freeing of current->cpuset by attach_task(), + * using RCU. + * + * The rcu_dereference() is technically probably not needed, + * as I don't actually mind if I see a new cpuset pointer but + * an old value of mems_generation. However this really only + * matters on alpha systems using cpusets heavily. If I dropped + * that rcu_dereference(), it would save them a memory barrier. + * For all other arch's, rcu_dereference is a no-op anyway, and for + * alpha systems not using cpusets, another planned optimization, + * avoiding the rcu critical section for tasks in the root cpuset + * which is statically allocated, so can't vanish, will make this + * irrelevant. Better to use RCU as intended, than to engage in + * some cute trick to save a memory barrier that is impossible to + * test, for alpha systems using cpusets heavily, which might not + * even exist. * * This routine is needed to update the per-task mems_allowed data, * within the tasks context, when it is trying to allocate memory @@ -608,35 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * task has been modifying its cpuset. */ -static void refresh_mems(void) +void cpuset_update_task_memory_state() { int my_cpusets_mem_gen; + struct task_struct *tsk = current; + struct cpuset *cs; - task_lock(current); - my_cpusets_mem_gen = current->cpuset->mems_generation; - task_unlock(current); - - if (current->cpuset_mems_generation != my_cpusets_mem_gen) { - struct cpuset *cs; - nodemask_t oldmem = current->mems_allowed; - int migrate; + if (tsk->cpuset == &top_cpuset) { + /* Don't need rcu for top_cpuset. It's never freed. */ + my_cpusets_mem_gen = top_cpuset.mems_generation; + } else { + rcu_read_lock(); + cs = rcu_dereference(tsk->cpuset); + my_cpusets_mem_gen = cs->mems_generation; + rcu_read_unlock(); + } + if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { down(&callback_sem); - task_lock(current); - cs = current->cpuset; - migrate = is_memory_migrate(cs); - guarantee_online_mems(cs, ¤t->mems_allowed); - current->cpuset_mems_generation = cs->mems_generation; - task_unlock(current); + task_lock(tsk); + cs = tsk->cpuset; /* Maybe changed when task not locked */ + guarantee_online_mems(cs, &tsk->mems_allowed); + tsk->cpuset_mems_generation = cs->mems_generation; + task_unlock(tsk); up(&callback_sem); - if (!nodes_equal(oldmem, current->mems_allowed)) { - numa_policy_rebind(&oldmem, ¤t->mems_allowed); - if (migrate) { - do_migrate_pages(current->mm, &oldmem, - ¤t->mems_allowed, - MPOL_MF_MOVE_ALL); - } - } + mpol_rebind_task(tsk, &tsk->mems_allowed); } } @@ -791,29 +820,130 @@ static int update_cpumask(struct cpuset *cs, char *buf) } /* + * Handle user request to change the 'mems' memory placement + * of a cpuset. Needs to validate the request, update the + * cpusets mems_allowed and mems_generation, and for each + * task in the cpuset, rebind any vma mempolicies and if + * the cpuset is marked 'memory_migrate', migrate the tasks + * pages to the new memory. + * * Call with manage_sem held. May take callback_sem during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; + nodemask_t oldmem; + struct task_struct *g, *p; + struct mm_struct **mmarray; + int i, n, ntasks; + int migrate; + int fudge; int retval; trialcs = *cs; retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) - return retval; + goto done; nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); - if (nodes_empty(trialcs.mems_allowed)) - return -ENOSPC; + oldmem = cs->mems_allowed; + if (nodes_equal(oldmem, trialcs.mems_allowed)) { + retval = 0; /* Too easy - nothing to do */ + goto done; + } + if (nodes_empty(trialcs.mems_allowed)) { + retval = -ENOSPC; + goto done; + } retval = validate_change(cs, &trialcs); - if (retval == 0) { - down(&callback_sem); - cs->mems_allowed = trialcs.mems_allowed; - atomic_inc(&cpuset_mems_generation); - cs->mems_generation = atomic_read(&cpuset_mems_generation); - up(&callback_sem); + if (retval < 0) + goto done; + + down(&callback_sem); + cs->mems_allowed = trialcs.mems_allowed; + atomic_inc(&cpuset_mems_generation); + cs->mems_generation = atomic_read(&cpuset_mems_generation); + up(&callback_sem); + + set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ + + fudge = 10; /* spare mmarray[] slots */ + fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + retval = -ENOMEM; + + /* + * Allocate mmarray[] to hold mm reference for each task + * in cpuset cs. Can't kmalloc GFP_KERNEL while holding + * tasklist_lock. We could use GFP_ATOMIC, but with a + * few more lines of code, we can retry until we get a big + * enough mmarray[] w/o using GFP_ATOMIC. + */ + while (1) { + ntasks = atomic_read(&cs->count); /* guess */ + ntasks += fudge; + mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); + if (!mmarray) + goto done; + write_lock_irq(&tasklist_lock); /* block fork */ + if (atomic_read(&cs->count) <= ntasks) + break; /* got enough */ + write_unlock_irq(&tasklist_lock); /* try again */ + kfree(mmarray); } + + n = 0; + + /* Load up mmarray[] with mm reference for each task in cpuset. */ + do_each_thread(g, p) { + struct mm_struct *mm; + + if (n >= ntasks) { + printk(KERN_WARNING + "Cpuset mempolicy rebind incomplete.\n"); + continue; + } + if (p->cpuset != cs) + continue; + mm = get_task_mm(p); + if (!mm) + continue; + mmarray[n++] = mm; + } while_each_thread(g, p); + write_unlock_irq(&tasklist_lock); + + /* + * Now that we've dropped the tasklist spinlock, we can + * rebind the vma mempolicies of each mm in mmarray[] to their + * new cpuset, and release that mm. The mpol_rebind_mm() + * call takes mmap_sem, which we couldn't take while holding + * tasklist_lock. Forks can happen again now - the mpol_copy() + * cpuset_being_rebound check will catch such forks, and rebind + * their vma mempolicies too. Because we still hold the global + * cpuset manage_sem, we know that no other rebind effort will + * be contending for the global variable cpuset_being_rebound. + * It's ok if we rebind the same mm twice; mpol_rebind_mm() + * is idempotent. Also migrate pages in each mm to new nodes. + */ + migrate = is_memory_migrate(cs); + for (i = 0; i < n; i++) { + struct mm_struct *mm = mmarray[i]; + + mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) { + do_migrate_pages(mm, &oldmem, &cs->mems_allowed, + MPOL_MF_MOVE_ALL); + } + mmput(mm); + } + + /* We're done rebinding vma's to this cpusets new mems_allowed. */ + kfree(mmarray); + set_cpuset_being_rebound(NULL); + retval = 0; +done: return retval; } @@ -985,6 +1115,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) struct cpuset *oldcs; cpumask_t cpus; nodemask_t from, to; + struct mm_struct *mm; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1024,7 +1155,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) return -ESRCH; } atomic_inc(&cs->count); - tsk->cpuset = cs; + rcu_assign_pointer(tsk->cpuset, cs); task_unlock(tsk); guarantee_online_cpus(cs, &cpus); @@ -1034,9 +1165,17 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) to = cs->mems_allowed; up(&callback_sem); + + mm = get_task_mm(tsk); + if (mm) { + mpol_rebind_mm(mm, &to); + mmput(mm); + } + if (is_memory_migrate(cs)) do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); + synchronize_rcu(); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); return 0; @@ -1341,7 +1480,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode) /* * cpuset_create_dir - create a directory for an object. - * cs: the cpuset we create the directory for. + * cs: the cpuset we create the directory for. * It must have a valid ->parent field * And we are going to fill its ->dentry field. * name: The name to give to the cpuset directory. Will be copied. @@ -1374,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) struct dentry *dentry; int error; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); dentry = cpuset_get_dentry(dir, cft->name); if (!IS_ERR(dentry)) { error = cpuset_create_file(dentry, 0644 | S_IFREG); @@ -1383,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) dput(dentry); } else error = PTR_ERR(dentry); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return error; } @@ -1628,7 +1767,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) return -ENOMEM; down(&manage_sem); - refresh_mems(); + cpuset_update_task_memory_state(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1645,6 +1784,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) down(&callback_sem); list_add(&cs->sibling, &cs->parent->children); + number_of_cpusets++; up(&callback_sem); err = cpuset_create_dir(cs, name, mode); @@ -1653,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) /* * Release manage_sem before cpuset_populate_dir() because it - * will down() this new directory's i_sem and if we race with + * will down() this new directory's i_mutex and if we race with * another mkdir, we might deadlock. */ up(&manage_sem); @@ -1672,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) { struct cpuset *c_parent = dentry->d_parent->d_fsdata; - /* the vfs holds inode->i_sem already */ + /* the vfs holds inode->i_mutex already */ return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); } @@ -1683,10 +1823,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cpuset *parent; char *pathbuf = NULL; - /* the vfs holds both inode->i_sem already */ + /* the vfs holds both inode->i_mutex already */ down(&manage_sem); - refresh_mems(); + cpuset_update_task_memory_state(); if (atomic_read(&cs->count) > 0) { up(&manage_sem); return -EBUSY; @@ -1707,6 +1847,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); + number_of_cpusets--; up(&callback_sem); if (list_empty(&parent->children)) check_for_release(parent, &pathbuf); @@ -1715,6 +1856,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) return 0; } +/* + * cpuset_init_early - just enough so that the calls to + * cpuset_update_task_memory_state() in early init code + * are harmless. + */ + +int __init cpuset_init_early(void) +{ + struct task_struct *tsk = current; + + tsk->cpuset = &top_cpuset; + tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); + return 0; +} + /** * cpuset_init - initialize cpusets at system boot * @@ -1750,6 +1906,7 @@ int __init cpuset_init(void) root->d_inode->i_nlink++; top_cpuset.dentry = root; root->d_inode->i_op = &cpuset_dir_inode_operations; + number_of_cpusets = 1; err = cpuset_populate_dir(root); /* memory_pressure_enabled is in root cpuset only */ if (err == 0) @@ -1819,15 +1976,13 @@ void cpuset_fork(struct task_struct *child) * * We don't need to task_lock() this reference to tsk->cpuset, * because tsk is already marked PF_EXITING, so attach_task() won't - * mess with it. + * mess with it, or task is a failed fork, never visible to attach_task. **/ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; - BUG_ON(!(tsk->flags & PF_EXITING)); - cs = tsk->cpuset; tsk->cpuset = NULL; @@ -1854,14 +2009,14 @@ void cpuset_exit(struct task_struct *tsk) * tasks cpuset. **/ -cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) +cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) { cpumask_t mask; down(&callback_sem); - task_lock((struct task_struct *)tsk); + task_lock(tsk); guarantee_online_cpus(tsk->cpuset, &mask); - task_unlock((struct task_struct *)tsk); + task_unlock(tsk); up(&callback_sem); return mask; @@ -1873,33 +2028,26 @@ void cpuset_init_current_mems_allowed(void) } /** - * cpuset_update_current_mems_allowed - update mems parameters to new values - * - * If the current tasks cpusets mems_allowed changed behind our backs, - * update current->mems_allowed and mems_generation to the new value. - * Do not call this routine if in_interrupt(). + * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. * - * Call without callback_sem or task_lock() held. May be called - * with or without manage_sem held. Unless exiting, it will acquire - * task_lock(). Also might acquire callback_sem during call to - * refresh_mems(). - */ + * Description: Returns the nodemask_t mems_allowed of the cpuset + * attached to the specified @tsk. Guaranteed to return some non-empty + * subset of node_online_map, even if this means going outside the + * tasks cpuset. + **/ -void cpuset_update_current_mems_allowed(void) +nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { - struct cpuset *cs; - int need_to_refresh = 0; + nodemask_t mask; - task_lock(current); - cs = current->cpuset; - if (!cs) - goto done; - if (current->cpuset_mems_generation != cs->mems_generation) - need_to_refresh = 1; -done: - task_unlock(current); - if (need_to_refresh) - refresh_mems(); + down(&callback_sem); + task_lock(tsk); + guarantee_online_mems(tsk->cpuset, &mask); + task_unlock(tsk); + up(&callback_sem); + + return mask; } /** @@ -1972,7 +2120,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_USER - only nodes in current tasks mems allowed ok. **/ -int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ @@ -2049,7 +2197,7 @@ done: * cpuset file 'memory_pressure_enabled' in the root cpuset. */ -int cpuset_memory_pressure_enabled; +int cpuset_memory_pressure_enabled __read_mostly; /** * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.