[linux-2.4.git] / kernel / fork.c
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/namespace.h>
#include <linux/personality.h>
#include <linux/compiler.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/processor.h>

/* The idle threads do not count.. */
int nr_threads;
int nr_running;

int max_threads;
unsigned long total_forks;      /* Handle normal Linux uptimes. */
int last_pid;

struct task_struct *pidhash[PIDHASH_SZ];

void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
        unsigned long flags;

        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
        wq_write_lock_irqsave(&q->lock, flags);
        __add_wait_queue(q, wait);
        wq_write_unlock_irqrestore(&q->lock, flags);
}

void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
{
        unsigned long flags;

        wait->flags |= WQ_FLAG_EXCLUSIVE;
        wq_write_lock_irqsave(&q->lock, flags);
        __add_wait_queue_tail(q, wait);
        wq_write_unlock_irqrestore(&q->lock, flags);
}

void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
        unsigned long flags;

        wq_write_lock_irqsave(&q->lock, flags);
        __remove_wait_queue(q, wait);
        wq_write_unlock_irqrestore(&q->lock, flags);
}

void __init fork_init(unsigned long mempages)
{
        /*
         * The default maximum number of threads is set to a safe
         * value: the thread structures can take up at most half
         * of memory.
         */
        max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8;

        init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
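
/*
 * Rough arithmetic for the formula above (illustrative numbers, not from
 * any particular config): with 4 KB pages and an 8 KB THREAD_SIZE, a
 * 128 MB machine has mempages = 32768, so
 *
 *      max_threads = 32768 / (8192/4096) / 8 = 2048
 *
 * and init's RLIMIT_NPROC defaults to 1024.  Note that 2048 threads at
 * 8 KB each come to about 1/8 of memory, so the "/ 8" is rather more
 * conservative than the "half of memory" wording in the comment.
 */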

/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;

static int get_pid(unsigned long flags)
{
        static int next_safe = PID_MAX;
        struct task_struct *p;
        int pid, beginpid;

        if (flags & CLONE_PID)
                return current->pid;

        spin_lock(&lastpid_lock);
        beginpid = last_pid;
        if((++last_pid) & 0xffff8000) {
                last_pid = 300;         /* Skip daemons etc. */
                goto inside;
        }
        if(last_pid >= next_safe) {
inside:
                next_safe = PID_MAX;
                read_lock(&tasklist_lock);
        repeat:
                for_each_task(p) {
                        if(p->pid == last_pid   ||
                           p->pgrp == last_pid  ||
                           p->tgid == last_pid  ||
                           p->session == last_pid) {
                                if(++last_pid >= next_safe) {
                                        if(last_pid & 0xffff8000)
                                                last_pid = 300;
                                        next_safe = PID_MAX;
                                }
                                if(unlikely(last_pid == beginpid)) {
                                        next_safe = 0;
                                        goto nomorepids;
                                }
                                goto repeat;
                        }
                        if(p->pid > last_pid && next_safe > p->pid)
                                next_safe = p->pid;
                        if(p->pgrp > last_pid && next_safe > p->pgrp)
                                next_safe = p->pgrp;
                        if(p->tgid > last_pid && next_safe > p->tgid)
                                next_safe = p->tgid;
                        if(p->session > last_pid && next_safe > p->session)
                                next_safe = p->session;
                }
                read_unlock(&tasklist_lock);
        }
        pid = last_pid;
        spin_unlock(&lastpid_lock);

        return pid;

nomorepids:
        read_unlock(&tasklist_lock);
        spin_unlock(&lastpid_lock);
        return 0;
}
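
/*
 * A note on the allocator above (behaviour derived from the code, not
 * from any external spec): ++last_pid trips the 0xffff8000 test once it
 * reaches 32768 and restarts at 300, skipping the low ids traditionally
 * used by boot-time daemons.  next_safe caches the smallest id above
 * last_pid that is in use as a pid, pgrp, tgid or session, so the common
 * case is a plain increment under lastpid_lock with no tasklist scan;
 * the O(tasks) rescan only runs when last_pid catches up with next_safe
 * or wraps.  If the scan walks all the way back around to beginpid,
 * every id is taken and 0 is returned, which do_fork() treats as a
 * failure unless the caller itself is pid 0.
 */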

static inline int dup_mmap(struct mm_struct * mm)
{
        struct vm_area_struct * mpnt, *tmp, **pprev;
        int retval;

        flush_cache_mm(current->mm);
        mm->locked_vm = 0;
        mm->mmap = NULL;
        mm->mmap_cache = NULL;
        mm->map_count = 0;
        mm->rss = 0;
        mm->cpu_vm_mask = 0;
        mm->swap_address = 0;
        pprev = &mm->mmap;

        /*
         * Add it to the mmlist after the parent.
         * Doing it this way means that we can order the list,
         * and fork() won't mess up the ordering significantly.
         * Add it first so that swapoff can see any swap entries.
         */
        spin_lock(&mmlist_lock);
        list_add(&mm->mmlist, &current->mm->mmlist);
        mmlist_nr++;
        spin_unlock(&mmlist_lock);

        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                struct file *file;

                retval = -ENOMEM;
                if(mpnt->vm_flags & VM_DONTCOPY)
                        continue;
                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                tmp->vm_next = NULL;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file->f_dentry->d_inode;
                        get_file(file);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                atomic_dec(&inode->i_writecount);

                        /* insert tmp into the share list, just after mpnt */
                        spin_lock(&inode->i_mapping->i_shared_lock);
                        if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                                mpnt->vm_next_share->vm_pprev_share =
                                        &tmp->vm_next_share;
                        mpnt->vm_next_share = tmp;
                        tmp->vm_pprev_share = &mpnt->vm_next_share;
                        spin_unlock(&inode->i_mapping->i_shared_lock);
                }

                /*
                 * Link in the new vma and copy the page table entries:
                 * link in first so that swapoff can see swap entries.
                 */
                spin_lock(&mm->page_table_lock);
                *pprev = tmp;
                pprev = &tmp->vm_next;
                mm->map_count++;
                retval = copy_page_range(mm, current->mm, tmp);
                spin_unlock(&mm->page_table_lock);

                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                if (retval)
                        goto fail_nomem;
        }
        retval = 0;
        build_mmap_rb(mm);

fail_nomem:
        flush_tlb_mm(current->mm);
        return retval;
}
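
/*
 * Control-flow note for dup_mmap() above: the success path intentionally
 * falls through the fail_nomem label, so flush_tlb_mm(current->mm) runs
 * on both success and failure, and retval (0, or -ENOMEM from a failed
 * vma allocation or copy_page_range()) tells the caller which it was.
 * Each new vma is linked into mm under page_table_lock before its page
 * table entries are copied, and mm joins mmlist before any of this, so
 * swapoff can always see the swap entries being duplicated.
 */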

spinlock_t mmlist_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;
int mmlist_nr;

#define allocate_mm()   (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
#define free_mm(mm)     (kmem_cache_free(mm_cachep, (mm)))

static struct mm_struct * mm_init(struct mm_struct * mm)
{
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        init_rwsem(&mm->mmap_sem);
        mm->page_table_lock = SPIN_LOCK_UNLOCKED;
        mm->pgd = pgd_alloc(mm);
        mm->def_flags = 0;
        if (mm->pgd)
                return mm;
        free_mm(mm);
        return NULL;
}


/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct * mm_alloc(void)
{
        struct mm_struct * mm;

        mm = allocate_mm();
        if (mm) {
                memset(mm, 0, sizeof(*mm));
                return mm_init(mm);
        }
        return NULL;
}

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
void fastcall __mmdrop(struct mm_struct *mm)
{
        BUG_ON(mm == &init_mm);
        pgd_free(mm->pgd);
        check_pgt_cache();
        destroy_context(mm);
        free_mm(mm);
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
        if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
                extern struct mm_struct *swap_mm;
                if (swap_mm == mm)
                        swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
                list_del(&mm->mmlist);
                mmlist_nr--;
                spin_unlock(&mmlist_lock);
                exit_mmap(mm);
                mmdrop(mm);
        }
}

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(void)
{
        struct task_struct *tsk = current;
        struct completion *vfork_done = tsk->vfork_done;

        /* notify parent sleeping on vfork() */
        if (vfork_done) {
                tsk->vfork_done = NULL;
                complete(vfork_done);
        }
}
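
/*
 * The completion above pairs with the CLONE_VFORK handling in do_fork()
 * further down: the parent puts a completion on its own stack, points
 * child->vfork_done at it and blocks in wait_for_completion() until the
 * child gives up its mm (on exec or exit) and mm_release() signals it.
 * Parent-side sketch, as done at the end of do_fork():
 *
 *      struct completion vfork;
 *
 *      p->vfork_done = &vfork;
 *      init_completion(&vfork);
 *      ...
 *      wake_up_process(p);
 *      if (clone_flags & CLONE_VFORK)
 *              wait_for_completion(&vfork);
 */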

static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
        struct mm_struct * mm, *oldmm;
        int retval;

        tsk->min_flt = tsk->maj_flt = 0;
        tsk->cmin_flt = tsk->cmaj_flt = 0;
        tsk->nswap = tsk->cnswap = 0;

        tsk->mm = NULL;
        tsk->active_mm = NULL;

        /*
         * Are we cloning a kernel thread?
         *
         * We need to steal an active VM for that..
         */
        oldmm = current->mm;
        if (!oldmm)
                return 0;

        if (clone_flags & CLONE_VM) {
                atomic_inc(&oldmm->mm_users);
                mm = oldmm;
                goto good_mm;
        }

        retval = -ENOMEM;
        mm = allocate_mm();
        if (!mm)
                goto fail_nomem;

        /* Copy the current MM stuff.. */
        memcpy(mm, oldmm, sizeof(*mm));
        if (!mm_init(mm))
                goto fail_nomem;

        if (init_new_context(tsk,mm))
                goto free_pt;

        down_write(&oldmm->mmap_sem);
        retval = dup_mmap(mm);
        up_write(&oldmm->mmap_sem);

        if (retval)
                goto free_pt;

        /*
         * child gets a private LDT (if there was an LDT in the parent)
         */
        copy_segments(tsk, mm);

good_mm:
        tsk->mm = mm;
        tsk->active_mm = mm;
        return 0;

free_pt:
        mmput(mm);
fail_nomem:
        return retval;
}

static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
        struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
        /* We don't need to lock fs - think why ;-) */
        if (fs) {
                atomic_set(&fs->count, 1);
                fs->lock = RW_LOCK_UNLOCKED;
                fs->umask = old->umask;
                read_lock(&old->lock);
                fs->rootmnt = mntget(old->rootmnt);
                fs->root = dget(old->root);
                fs->pwdmnt = mntget(old->pwdmnt);
                fs->pwd = dget(old->pwd);
                if (old->altroot) {
                        fs->altrootmnt = mntget(old->altrootmnt);
                        fs->altroot = dget(old->altroot);
                } else {
                        fs->altrootmnt = NULL;
                        fs->altroot = NULL;
                }
                read_unlock(&old->lock);
        }
        return fs;
}

struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
        return __copy_fs_struct(old);
}

static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_FS) {
                atomic_inc(&current->fs->count);
                return 0;
        }
        tsk->fs = __copy_fs_struct(current->fs);
        if (!tsk->fs)
                return -1;
        return 0;
}

static int count_open_files(struct files_struct *files, int size)
{
        int i;

        /* Find the last open fd */
        for (i = size/(8*sizeof(long)); i > 0; ) {
                if (files->open_fds->fds_bits[--i])
                        break;
        }
        i = (i+1) * 8 * sizeof(long);
        return i;
}
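
/*
 * count_open_files() rounds up to a whole word of the open-fd bitmap.
 * Worked example, assuming 32-bit longs: if the highest open descriptor
 * is fd 70, the backwards scan stops at fds_bits[2] and the function
 * returns (2 + 1) * 8 * sizeof(long) = 96, so copy_files() below walks
 * 96 fd slots and copies 96/8 = 12 bytes of each bitmap rather than
 * exactly 71.
 */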

static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
        struct files_struct *oldf, *newf;
        struct file **old_fds, **new_fds;
        int open_files, nfds, size, i, error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }

        /*
         * Note: we may be using current for both targets (See exec.c)
         * This works because we cache current->files (old) as oldf. Don't
         * break this.
         */
        tsk->files = NULL;
        error = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
        if (!newf)
                goto out;

        atomic_set(&newf->count, 1);

        newf->file_lock     = RW_LOCK_UNLOCKED;
        newf->next_fd       = 0;
        newf->max_fds       = NR_OPEN_DEFAULT;
        newf->max_fdset     = __FD_SETSIZE;
        newf->close_on_exec = &newf->close_on_exec_init;
        newf->open_fds      = &newf->open_fds_init;
        newf->fd            = &newf->fd_array[0];

        /* We don't yet have the oldf readlock, but even if the old
           fdset gets grown now, we'll only copy up to "size" fds */
        size = oldf->max_fdset;
        if (size > __FD_SETSIZE) {
                newf->max_fdset = 0;
                write_lock(&newf->file_lock);
                error = expand_fdset(newf, size-1);
                write_unlock(&newf->file_lock);
                if (error)
                        goto out_release;
        }
        read_lock(&oldf->file_lock);

        open_files = count_open_files(oldf, size);

        /*
         * Check whether we need to allocate a larger fd array.
         * Note: we're not a clone task, so the open count won't
         * change.
         */
        nfds = NR_OPEN_DEFAULT;
        if (open_files > nfds) {
                read_unlock(&oldf->file_lock);
                newf->max_fds = 0;
                write_lock(&newf->file_lock);
                error = expand_fd_array(newf, open_files-1);
                write_unlock(&newf->file_lock);
                if (error)
                        goto out_release;
                nfds = newf->max_fds;
                read_lock(&oldf->file_lock);
        }

        old_fds = oldf->fd;
        new_fds = newf->fd;

        memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
        memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

        for (i = open_files; i != 0; i--) {
                struct file *f = *old_fds++;
                if (f) {
                        get_file(f);
                } else {
                        /*
                         * The fd may be claimed in the fd bitmap but not yet
                         * instantiated in the files array if a sibling thread
                         * is partway through open().  So make sure that this
                         * fd is available to the new process.
                         */
                        FD_CLR(open_files - i, newf->open_fds);
                }
                *new_fds++ = f;
        }
        read_unlock(&oldf->file_lock);

        /* compute the remainder to be cleared */
        size = (newf->max_fds - open_files) * sizeof(struct file *);

        /* This is long word aligned thus could use an optimized version */
        memset(new_fds, 0, size);

        if (newf->max_fdset > open_files) {
                int left = (newf->max_fdset-open_files)/8;
                int start = open_files / (8 * sizeof(unsigned long));

                memset(&newf->open_fds->fds_bits[start], 0, left);
                memset(&newf->close_on_exec->fds_bits[start], 0, left);
        }

        tsk->files = newf;
        error = 0;
out:
        return error;

out_release:
        free_fdset (newf->close_on_exec, newf->max_fdset);
        free_fdset (newf->open_fds, newf->max_fdset);
        kmem_cache_free(files_cachep, newf);
        goto out;
}
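
/*
 * Sizing note for copy_files() above (typical values; the actual numbers
 * are per-arch constants): the fd array and fdset embedded in
 * files_struct cover NR_OPEN_DEFAULT descriptors (BITS_PER_LONG, i.e. 32
 * on a 32-bit box) and __FD_SETSIZE bitmap slots (usually 1024).  So
 * expand_fd_array() is only needed when count_open_files() reports more
 * than NR_OPEN_DEFAULT slots in use, and expand_fdset() only when the
 * parent's fdset has already been grown beyond __FD_SETSIZE.
 */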

/*
 *      Helper to unshare the files of the current task.
 *      We don't want to expose copy_files internals to
 *      the exec layer of the kernel.
 */

int unshare_files(void)
{
        struct files_struct *files  = current->files;
        int rc;

        if(!files)
                BUG();

        /* This can race but the race causes us to copy when we don't
           need to and drop the copy */
        if(atomic_read(&files->count) == 1)
        {
                atomic_inc(&files->count);
                return 0;
        }
        rc = copy_files(0, current);
        if(rc)
                current->files = files;
        return rc;
}

static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
        struct signal_struct *sig;

        if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sig->count);
                return 0;
        }
        sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
        tsk->sig = sig;
        if (!sig)
                return -1;
        spin_lock_init(&sig->siglock);
        atomic_set(&sig->count, 1);
        memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
        return 0;
}

static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
        unsigned long new_flags = p->flags;

        new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU);
        new_flags |= PF_FORKNOEXEC;
        if (!(clone_flags & CLONE_PTRACE))
                p->ptrace = 0;
        p->flags = new_flags;
}

long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
        struct task_struct *task = current;
        unsigned old_task_dumpable;
        long ret;

        /* lock out any potential ptracer */
        task_lock(task);
        if (task->ptrace) {
                task_unlock(task);
                return -EPERM;
        }

        old_task_dumpable = task->task_dumpable;
        task->task_dumpable = 0;
        task_unlock(task);

        ret = arch_kernel_thread(fn, arg, flags);

        /* never reached in child process, only in parent */
        current->task_dumpable = old_task_dumpable;

        return ret;
}
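
/*
 * Caller's view of kernel_thread() (illustrative only; my_worker is a
 * made-up name and the flag choice varies by caller): a subsystem
 * typically spawns its kernel daemon with something like
 *
 *      static int my_worker(void *arg);
 *      ...
 *      kernel_thread(my_worker, NULL, CLONE_FS | CLONE_FILES);
 *
 * arch_kernel_thread() builds a register frame and ends up in do_fork()
 * below; the child never returns through this function, which is why
 * only the parent needs its task_dumpable restored.
 */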

/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It also
 * copies the data segment in its entirety.  The "stack_start" and
 * "stack_top" arguments are simply passed along to the platform
 * specific copy_thread() routine.  Most platforms ignore stack_top.
 * For an example that's using stack_top, see
 * arch/ia64/kernel/process.c.
 */
int do_fork(unsigned long clone_flags, unsigned long stack_start,
            struct pt_regs *regs, unsigned long stack_size)
{
        int retval;
        struct task_struct *p;
        struct completion vfork;

        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return -EINVAL;

        retval = -EPERM;

        /*
         * CLONE_PID is only allowed for the initial SMP swapper
         * calls
         */
        if (clone_flags & CLONE_PID) {
                if (current->pid)
                        goto fork_out;
        }

        retval = -ENOMEM;
        p = alloc_task_struct();
        if (!p)
                goto fork_out;

        *p = *current;

        retval = -EAGAIN;
        /*
         * Check if we are over our maximum process limit, but be sure to
         * exclude root. This is needed to make it possible for login and
         * friends to set the per-user process limit to something lower
         * than the number of processes root is running. -- Rik
         */
        if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur
                      && p->user != &root_user
                      && !capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE))
                goto bad_fork_free;

        atomic_inc(&p->user->__count);
        atomic_inc(&p->user->processes);

        /*
         * Counter increases are protected by
         * the kernel lock so nr_threads can't
         * increase under us (but it may decrease).
         */
        if (nr_threads >= max_threads)
                goto bad_fork_cleanup_count;

        get_exec_domain(p->exec_domain);

        if (p->binfmt && p->binfmt->module)
                __MOD_INC_USE_COUNT(p->binfmt->module);

        p->did_exec = 0;
        p->swappable = 0;
        p->state = TASK_UNINTERRUPTIBLE;

        copy_flags(clone_flags, p);
        p->pid = get_pid(clone_flags);
        if (p->pid == 0 && current->pid != 0)
                goto bad_fork_cleanup;

        p->run_list.next = NULL;
        p->run_list.prev = NULL;

        p->p_cptr = NULL;
        init_waitqueue_head(&p->wait_chldexit);
        p->vfork_done = NULL;
        if (clone_flags & CLONE_VFORK) {
                p->vfork_done = &vfork;
                init_completion(&vfork);
        }
        spin_lock_init(&p->alloc_lock);

        p->sigpending = 0;
        init_sigpending(&p->pending);

        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
        init_timer(&p->real_timer);
        p->real_timer.data = (unsigned long) p;

        p->leader = 0;          /* session leadership doesn't inherit */
        p->tty_old_pgrp = 0;
        p->times.tms_utime = p->times.tms_stime = 0;
        p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef CONFIG_SMP
        {
                int i;
                p->cpus_runnable = ~0UL;
                p->processor = current->processor;
                /* ?? should we just memset this ?? */
                for(i = 0; i < smp_num_cpus; i++)
                        p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
                spin_lock_init(&p->sigmask_lock);
        }
#endif
        p->lock_depth = -1;             /* -1 = no lock */
        p->start_time = jiffies;

        INIT_LIST_HEAD(&p->local_pages);

        retval = -ENOMEM;
        /* copy all the process information */
        if (copy_files(clone_flags, p))
                goto bad_fork_cleanup;
        if (copy_fs(clone_flags, p))
                goto bad_fork_cleanup_files;
        if (copy_sighand(clone_flags, p))
                goto bad_fork_cleanup_fs;
        if (copy_mm(clone_flags, p))
                goto bad_fork_cleanup_sighand;
        retval = copy_namespace(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_mm;
        retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
        if (retval)
                goto bad_fork_cleanup_namespace;
        p->semundo = NULL;

        /* Our parent execution domain becomes the current domain.
           These must match for thread signalling to apply. */

        p->parent_exec_id = p->self_exec_id;

        /* ok, now we should be set up.. */
        p->swappable = 1;
        p->exit_signal = clone_flags & CSIGNAL;
        p->pdeath_signal = 0;

        /*
         * "share" dynamic priority between parent and child, thus the
         * total amount of dynamic priorities in the system doesn't change,
         * giving more scheduling fairness. This is only important in the
         * first timeslice; in the long run the scheduling behaviour is
         * unchanged.
         */
        p->counter = (current->counter + 1) >> 1;
        current->counter >>= 1;
        if (!current->counter)
                current->need_resched = 1;

        /*
         * Ok, add it to the run-queues and make it
         * visible to the rest of the system.
         *
         * Let it rip!
         */
        retval = p->pid;
        p->tgid = retval;
        INIT_LIST_HEAD(&p->thread_group);

        /* Need tasklist lock for parent etc handling! */
        write_lock_irq(&tasklist_lock);

        /* CLONE_PARENT re-uses the old parent */
        p->p_opptr = current->p_opptr;
        p->p_pptr = current->p_pptr;
        if (!(clone_flags & CLONE_PARENT)) {
                p->p_opptr = current;
                if (!(p->ptrace & PT_PTRACED))
                        p->p_pptr = current;
        }

        if (clone_flags & CLONE_THREAD) {
                p->tgid = current->tgid;
                list_add(&p->thread_group, &current->thread_group);
        }

        SET_LINKS(p);
        hash_pid(p);
        nr_threads++;
        write_unlock_irq(&tasklist_lock);

        if (p->ptrace & PT_PTRACED)
                send_sig(SIGSTOP, p, 1);

        wake_up_process(p);             /* do this last */
        ++total_forks;
        if (clone_flags & CLONE_VFORK)
                wait_for_completion(&vfork);

fork_out:
        return retval;

bad_fork_cleanup_namespace:
        exit_namespace(p);
bad_fork_cleanup_mm:
        exit_mm(p);
        if (p->active_mm)
                mmdrop(p->active_mm);
bad_fork_cleanup_sighand:
        exit_sighand(p);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup:
        put_exec_domain(p->exec_domain);
        if (p->binfmt && p->binfmt->module)
                __MOD_DEC_USE_COUNT(p->binfmt->module);
bad_fork_cleanup_count:
        atomic_dec(&p->user->processes);
        free_uid(p->user);
bad_fork_free:
        free_task_struct(p);
        goto fork_out;
}
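
/*
 * For orientation, the architecture system-call stubs are thin wrappers
 * around do_fork(); an i386-flavoured sketch (the real code lives in
 * arch/i386/kernel/process.c and may differ in detail):
 *
 *      asmlinkage int sys_fork(struct pt_regs regs)
 *      {
 *              return do_fork(SIGCHLD, regs.esp, &regs, 0);
 *      }
 *
 *      asmlinkage int sys_vfork(struct pt_regs regs)
 *      {
 *              return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD,
 *                             regs.esp, &regs, 0);
 *      }
 *
 * sys_clone() passes the user-supplied flags and, when non-zero, a new
 * stack pointer in place of regs.esp; the low CSIGNAL bits of the flags
 * become the child's exit_signal, as set above.
 */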

/* SLAB cache for signal_struct structures (tsk->sig) */
kmem_cache_t *sigact_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
kmem_cache_t *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
kmem_cache_t *fs_cachep;

/* SLAB cache for vm_area_struct structures */
kmem_cache_t *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
kmem_cache_t *mm_cachep;

void __init proc_caches_init(void)
{
        sigact_cachep = kmem_cache_create("signal_act",
                        sizeof(struct signal_struct), 0,
                        SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!sigact_cachep)
                panic("Cannot create signal action SLAB cache");

        files_cachep = kmem_cache_create("files_cache",
                         sizeof(struct files_struct), 0,
                         SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!files_cachep)
                panic("Cannot create files SLAB cache");

        fs_cachep = kmem_cache_create("fs_cache",
                         sizeof(struct fs_struct), 0,
                         SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!fs_cachep)
                panic("Cannot create fs_struct SLAB cache");

        vm_area_cachep = kmem_cache_create("vm_area_struct",
                        sizeof(struct vm_area_struct), 0,
                        SLAB_HWCACHE_ALIGN, NULL, NULL);
        if(!vm_area_cachep)
                panic("vma_init: Cannot alloc vm_area_struct SLAB cache");

        mm_cachep = kmem_cache_create("mm_struct",
                        sizeof(struct mm_struct), 0,
                        SLAB_HWCACHE_ALIGN, NULL, NULL);
        if(!mm_cachep)
                panic("vma_init: Cannot alloc mm_struct SLAB cache");
}