mm/page_alloc.c

   1 /*
   2  *  linux/mm/page_alloc.c
   3  *
   4  *  Manages the free list, the system allocates free pages here.
   5  *  Note that kmalloc() lives in slab.c
   6  *
   7  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   8  *  Swap reorganised 29.12.95, Stephen Tweedie
   9  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  10  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  11  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  12  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
  13  */
  14
  15 #include <linux/config.h>
  16 #include <linux/mm.h>
  17 #include <linux/swap.h>
  18 #include <linux/swapctl.h>
  19 #include <linux/interrupt.h>
  20 #include <linux/pagemap.h>
  21 #include <linux/bootmem.h>
  22 #include <linux/slab.h>
  23 #include <linux/module.h>
  24
  25 int nr_swap_pages;
  26 int nr_active_pages;
  27 int nr_inactive_pages;
  28 LIST_HEAD(inactive_list);
  29 LIST_HEAD(active_list);
  30 pg_data_t *pgdat_list;
  31
  32 /*
  33  *
  34  * The zone_table array is used to look up the address of the
  35  * struct zone corresponding to a given zone number (ZONE_DMA,
  36  * ZONE_NORMAL, or ZONE_HIGHMEM).
  37  */
  38 zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
  39 EXPORT_SYMBOL(zone_table);
  40
  41 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
  42 static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
  43 static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
  44 static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
  45
  46 /*
  47  * Temporary debugging check.
  48  */
  49 #define BAD_RANGE(zone, page)                                           \
  50 (                                                                       \
  51         (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \
  52         || (((page) - mem_map) < (zone)->zone_start_mapnr)              \
  53         || ((zone) != page_zone(page))                                  \
  54 )
  55
  56 /*
  57  * Freeing function for a buddy system allocator.
  58  * Contrary to prior comments, this is *NOT* hairy, and there
  59  * is no reason for anyone not to understand it.
  60  *
  61  * The concept of a buddy system is to maintain direct-mapped tables
  62  * (containing bit values) for memory blocks of various "orders".
  63  * The bottom level table contains the map for the smallest allocatable
  64  * units of memory (here, pages), and each level above it describes
  65  * pairs of units from the levels below, hence, "buddies".
  66  * At a high level, all that happens here is marking the table entry
  67  * at the bottom level available, and propagating the changes upward
  68  * as necessary, plus some accounting needed to play nicely with other
  69  * parts of the VM system.
  70  * At each level, we keep one bit for each pair of blocks, which
  71  * is set to 1 iff only one of the pair is allocated.  So when we
  72  * are allocating or freeing one, we can derive the state of the
  73  * other.  That is, if we allocate a small block, and both were
  74  * free, the remainder of the region must be split into blocks.
  75  * If a block is freed, and its buddy is also free, then this
  76  * triggers coalescing into a block of larger size.
  77  *
  78  * -- wli
  79  */
  80
  81 static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
  82 static void __free_pages_ok (struct page *page, unsigned int order)
  83 {
  84         unsigned long index, page_idx, mask, flags;
  85         free_area_t *area;
  86         struct page *base;
  87         zone_t *zone;
  88
  89         /*
  90          * Yes, think what happens when other parts of the kernel take
  91          * a reference to a page in order to pin it for io. -ben
  92          */
  93         if (PageLRU(page)) {
  94                 if (unlikely(in_interrupt()))
  95                         BUG();
  96                 lru_cache_del(page);
  97         }
  98
  99         if (page->buffers)
 100                 BUG();
 101         if (page->mapping)
 102                 BUG();
 103         if (!VALID_PAGE(page))
 104                 BUG();
 105         if (PageLocked(page))
 106                 BUG();
 107         if (PageActive(page))
 108                 BUG();
 109         page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
 110
 111         if (current->flags & PF_FREE_PAGES)
 112                 goto local_freelist;
 113  back_local_freelist:
 114
 115         zone = page_zone(page);
 116
 117         mask = (~0UL) << order;
 118         base = zone->zone_mem_map;
 119         page_idx = page - base;
 120         if (page_idx & ~mask)
 121                 BUG();
 122         index = page_idx >> (1 + order);
 123
 124         area = zone->free_area + order;
 125
 126         spin_lock_irqsave(&zone->lock, flags);
 127
 128         zone->free_pages -= mask;
 129
 130         while (mask + (1 << (MAX_ORDER-1))) {
 131                 struct page *buddy1, *buddy2;
 132
 133                 if (area >= zone->free_area + MAX_ORDER)
 134                         BUG();
 135                 if (!__test_and_change_bit(index, area->map))
 136                         /*
 137                          * the buddy page is still allocated.
 138                          */
 139                         break;
 140                 /*
 141                  * Move the buddy up one level.
 142                  * This code is taking advantage of the identity:
 143                  *      -mask = 1+~mask
 144                  */
 145                 buddy1 = base + (page_idx ^ -mask);
 146                 buddy2 = base + page_idx;
 147                 if (BAD_RANGE(zone,buddy1))
 148                         BUG();
 149                 if (BAD_RANGE(zone,buddy2))
 150                         BUG();
 151
 152                 list_del(&buddy1->list);
 153                 mask <<= 1;
 154                 area++;
 155                 index >>= 1;
 156                 page_idx &= mask;
 157         }
 158         list_add(&(base + page_idx)->list, &area->free_list);
 159
 160         spin_unlock_irqrestore(&zone->lock, flags);
 161         return;
 162
 163  local_freelist:
 164         if (current->nr_local_pages)
 165                 goto back_local_freelist;
 166         if (in_interrupt())
 167                 goto back_local_freelist;
 168
 169         list_add(&page->list, &current->local_pages);
 170         page->index = order;
 171         current->nr_local_pages++;
 172 }
 173
 174 #define MARK_USED(index, order, area) \
 175         __change_bit((index) >> (1+(order)), (area)->map)
 176
 177 static inline struct page * expand (zone_t *zone, struct page *page,
 178          unsigned long index, int low, int high, free_area_t * area)
 179 {
 180         unsigned long size = 1 << high;
 181
 182         while (high > low) {
 183                 if (BAD_RANGE(zone,page))
 184                         BUG();
 185                 area--;
 186                 high--;
 187                 size >>= 1;
 188                 list_add(&(page)->list, &(area)->free_list);
 189                 MARK_USED(index, high, area);
 190                 index += size;
 191                 page += size;
 192         }
 193         if (BAD_RANGE(zone,page))
 194                 BUG();
 195         return page;
 196 }
 197
 198 static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
 199 static struct page * rmqueue(zone_t *zone, unsigned int order)
 200 {
 201         free_area_t * area = zone->free_area + order;
 202         unsigned int curr_order = order;
 203         struct list_head *head, *curr;
 204         unsigned long flags;
 205         struct page *page;
 206
 207         spin_lock_irqsave(&zone->lock, flags);
 208         do {
 209                 head = &area->free_list;
 210                 curr = head->next;
 211
 212                 if (curr != head) {
 213                         unsigned int index;
 214
 215                         page = list_entry(curr, struct page, list);
 216                         if (BAD_RANGE(zone,page))
 217                                 BUG();
 218                         list_del(curr);
 219                         index = page - zone->zone_mem_map;
 220                         if (curr_order != MAX_ORDER-1)
 221                                 MARK_USED(index, curr_order, area);
 222                         zone->free_pages -= 1UL << order;
 223
 224                         page = expand(zone, page, index, order, curr_order, area);
 225                         spin_unlock_irqrestore(&zone->lock, flags);
 226
 227                         set_page_count(page, 1);
 228                         if (BAD_RANGE(zone,page))
 229                                 BUG();
 230                         if (PageLRU(page))
 231                                 BUG();
 232                         if (PageActive(page))
 233                                 BUG();
 234                         return page;
 235                 }
 236                 curr_order++;
 237                 area++;
 238         } while (curr_order < MAX_ORDER);
 239         spin_unlock_irqrestore(&zone->lock, flags);
 240
 241         return NULL;
 242 }
 243
 244 #ifndef CONFIG_DISCONTIGMEM
 245 struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
 246 {
 247         return __alloc_pages(gfp_mask, order,
 248                 contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
 249 }
 250 #endif
 251
 252 static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
 253 static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
 254 {
 255         struct page * page = NULL;
 256         int __freed = 0;
 257
 258         if (!(gfp_mask & __GFP_WAIT))
 259                 goto out;
 260         if (in_interrupt())
 261                 BUG();
 262
 263         current->allocation_order = order;
 264         current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
 265
 266         __freed = try_to_free_pages_zone(classzone, gfp_mask);
 267
 268         current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
 269
 270         if (current->nr_local_pages) {
 271                 struct list_head * entry, * local_pages;
 272                 struct page * tmp;
 273                 int nr_pages;
 274
 275                 local_pages = &current->local_pages;
 276
 277                 if (likely(__freed)) {
 278                         /* pick from the last inserted so we're lifo */
 279                         entry = local_pages->next;
 280                         do {
 281                                 tmp = list_entry(entry, struct page, list);
 282                                 if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
 283                                         list_del(entry);
 284                                         current->nr_local_pages--;
 285                                         set_page_count(tmp, 1);
 286                                         page = tmp;
 287
 288                                         if (page->buffers)
 289                                                 BUG();
 290                                         if (page->mapping)
 291                                                 BUG();
 292                                         if (!VALID_PAGE(page))
 293                                                 BUG();
 294                                         if (PageLocked(page))
 295                                                 BUG();
 296                                         if (PageLRU(page))
 297                                                 BUG();
 298                                         if (PageActive(page))
 299                                                 BUG();
 300                                         if (PageDirty(page))
 301                                                 BUG();
 302
 303                                         break;
 304                                 }
 305                         } while ((entry = entry->next) != local_pages);
 306                 }
 307
 308                 nr_pages = current->nr_local_pages;
 309                 /* free in reverse order so that the global order will be lifo */
 310                 while ((entry = local_pages->prev) != local_pages) {
 311                         list_del(entry);
 312                         tmp = list_entry(entry, struct page, list);
 313                         __free_pages_ok(tmp, tmp->index);
 314                         if (!nr_pages--)
 315                                 BUG();
 316                 }
 317                 current->nr_local_pages = 0;
 318         }
 319  out:
 320         *freed = __freed;
 321         return page;
 322 }
 323
 324 /*
 325  * This is the 'heart' of the zoned buddy allocator:
 326  */
 327 struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
 328 {
 329         unsigned long min;
 330         zone_t **zone, * classzone;
 331         struct page * page;
 332         int freed;
 333
 334         zone = zonelist->zones;
 335         classzone = *zone;
 336         if (classzone == NULL)
 337                 return NULL;
 338         min = 1UL << order;
 339         for (;;) {
 340                 zone_t *z = *(zone++);
 341                 if (!z)
 342                         break;
 343
 344                 min += z->pages_low;
 345                 if (z->free_pages > min) {
 346                         page = rmqueue(z, order);
 347                         if (page)
 348                                 return page;
 349                 }
 350         }
 351
 352         classzone->need_balance = 1;
 353         mb();
 354         if (waitqueue_active(&kswapd_wait))
 355                 wake_up_interruptible(&kswapd_wait);
 356
 357         zone = zonelist->zones;
 358         min = 1UL << order;
 359         for (;;) {
 360                 unsigned long local_min;
 361                 zone_t *z = *(zone++);
 362                 if (!z)
 363                         break;
 364
 365                 local_min = z->pages_min;
 366                 if (!(gfp_mask & __GFP_WAIT))
 367                         local_min >>= 2;
 368                 min += local_min;
 369                 if (z->free_pages > min) {
 370                         page = rmqueue(z, order);
 371                         if (page)
 372                                 return page;
 373                 }
 374         }
 375
 376         /* here we're in the low on memory slow path */
 377
 378 rebalance:
 379         if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
 380                 zone = zonelist->zones;
 381                 for (;;) {
 382                         zone_t *z = *(zone++);
 383                         if (!z)
 384                                 break;
 385
 386                         page = rmqueue(z, order);
 387                         if (page)
 388                                 return page;
 389                 }
 390                 return NULL;
 391         }
 392
 393         /* Atomic allocations - we can't balance anything */
 394         if (!(gfp_mask & __GFP_WAIT))
 395                 return NULL;
 396
 397         page = balance_classzone(classzone, gfp_mask, order, &freed);
 398         if (page)
 399                 return page;
 400
 401         zone = zonelist->zones;
 402         min = 1UL << order;
 403         for (;;) {
 404                 zone_t *z = *(zone++);
 405                 if (!z)
 406                         break;
 407
 408                 min += z->pages_min;
 409                 if (z->free_pages > min) {
 410                         page = rmqueue(z, order);
 411                         if (page)
 412                                 return page;
 413                 }
 414         }
 415
 416         /* Don't let big-order allocations loop */
 417         if (order > 3)
 418                 return NULL;
 419
 420         /* Yield for kswapd, and try again */
 421         yield();
 422         goto rebalance;
 423 }
 424
 425 /*
 426  * Common helper functions.
 427  */
 428 unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
 429 {
 430         struct page * page;
 431
 432         page = alloc_pages(gfp_mask, order);
 433         if (!page)
 434                 return 0;
 435         return (unsigned long) page_address(page);
 436 }
 437
 438 unsigned long get_zeroed_page(unsigned int gfp_mask)
 439 {
 440         struct page * page;
 441
 442         page = alloc_pages(gfp_mask, 0);
 443         if (page) {
 444                 void *address = page_address(page);
 445                 clear_page(address);
 446                 return (unsigned long) address;
 447         }
 448         return 0;
 449 }
 450
 451 void __free_pages(struct page *page, unsigned int order)
 452 {
 453         if (!PageReserved(page) && put_page_testzero(page))
 454                 __free_pages_ok(page, order);
 455 }
 456
 457 void free_pages(unsigned long addr, unsigned int order)
 458 {
 459         if (addr != 0)
 460                 __free_pages(virt_to_page(addr), order);
 461 }
 462
 463 /*
 464  * Total amount of free (allocatable) RAM:
 465  */
 466 unsigned int nr_free_pages (void)
 467 {
 468         unsigned int sum = 0;
 469         zone_t *zone;
 470
 471         for_each_zone(zone)
 472                 sum += zone->free_pages;
 473
 474         return sum;
 475 }
 476
 477 /*
 478  * Amount of free RAM allocatable as buffer memory:
 479  */
 480 unsigned int nr_free_buffer_pages (void)
 481 {
 482         pg_data_t *pgdat;
 483         unsigned int sum = 0;
 484
 485         for_each_pgdat(pgdat) {
 486                 zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
 487                 zone_t **zonep = zonelist->zones;
 488                 zone_t *zone;
 489
 490                 for (zone = *zonep++; zone; zone = *zonep++) {
 491                         unsigned long size = zone->size;
 492                         unsigned long high = zone->pages_high;
 493                         if (size > high)
 494                                 sum += size - high;
 495                 }
 496         }
 497
 498         return sum;
 499 }
 500
 501 #if CONFIG_HIGHMEM
 502 unsigned int nr_free_highpages (void)
 503 {
 504         pg_data_t *pgdat;
 505         unsigned int pages = 0;
 506
 507         for_each_pgdat(pgdat)
 508                 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
 509
 510         return pages;
 511 }
 512 #endif
 513
 514 #define K(x) ((x) << (PAGE_SHIFT-10))
 515
 516 /*
 517  * Show free area list (used inside shift_scroll-lock stuff)
 518  * We also calculate the percentage fragmentation. We do this by counting the
 519  * memory on each free list with the exception of the first item on the list.
 520  */
 521 void show_free_areas_core(pg_data_t *pgdat)
 522 {
 523         unsigned int order;
 524         unsigned type;
 525         pg_data_t *tmpdat = pgdat;
 526
 527         printk("Free pages:      %6dkB (%6dkB HighMem)\n",
 528                 K(nr_free_pages()),
 529                 K(nr_free_highpages()));
 530
 531         while (tmpdat) {
 532                 zone_t *zone;
 533                 for (zone = tmpdat->node_zones;
 534                                 zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
 535                         printk("Zone:%s freepages:%6lukB min:%6lukB low:%6lukB "
 536                                        "high:%6lukB\n",
 537                                         zone->name,
 538                                         K(zone->free_pages),
 539                                         K(zone->pages_min),
 540                                         K(zone->pages_low),
 541                                         K(zone->pages_high));
 542
 543                 tmpdat = tmpdat->node_next;
 544         }
 545
 546         printk("( Active: %d, inactive: %d, free: %d )\n",
 547                nr_active_pages,
 548                nr_inactive_pages,
 549                nr_free_pages());
 550
 551         for (type = 0; type < MAX_NR_ZONES; type++) {
 552                 struct list_head *head, *curr;
 553                 zone_t *zone = pgdat->node_zones + type;
 554                 unsigned long nr, total, flags;
 555
 556                 total = 0;
 557                 if (zone->size) {
 558                         spin_lock_irqsave(&zone->lock, flags);
 559                         for (order = 0; order < MAX_ORDER; order++) {
 560                                 head = &(zone->free_area + order)->free_list;
 561                                 curr = head;
 562                                 nr = 0;
 563                                 for (;;) {
 564                                         if ((curr = curr->next) == head)
 565                                                 break;
 566                                         nr++;
 567                                 }
 568                                 total += nr * (1 << order);
 569                                 printk("%lu*%lukB ", nr, K(1UL) << order);
 570                         }
 571                         spin_unlock_irqrestore(&zone->lock, flags);
 572                 }
 573                 printk("= %lukB)\n", K(total));
 574         }
 575
 576 #ifdef SWAP_CACHE_INFO
 577         show_swap_cache_info();
 578 #endif
 579 }
 580
 581 void show_free_areas(void)
 582 {
 583         show_free_areas_core(pgdat_list);
 584 }
 585
 586 /*
 587  * Builds allocation fallback zone lists.
 588  */
 589 static inline void build_zonelists(pg_data_t *pgdat)
 590 {
 591         int i, j, k;
 592
 593         for (i = 0; i <= GFP_ZONEMASK; i++) {
 594                 zonelist_t *zonelist;
 595                 zone_t *zone;
 596
 597                 zonelist = pgdat->node_zonelists + i;
 598                 memset(zonelist, 0, sizeof(*zonelist));
 599
 600                 j = 0;
 601                 k = ZONE_NORMAL;
 602                 if (i & __GFP_HIGHMEM)
 603                         k = ZONE_HIGHMEM;
 604                 if (i & __GFP_DMA)
 605                         k = ZONE_DMA;
 606
 607                 switch (k) {
 608                         default:
 609                                 BUG();
 610                         /*
 611                          * fallthrough:
 612                          */
 613                         case ZONE_HIGHMEM:
 614                                 zone = pgdat->node_zones + ZONE_HIGHMEM;
 615                                 if (zone->size) {
 616 #ifndef CONFIG_HIGHMEM
 617                                         BUG();
 618 #endif
 619                                         zonelist->zones[j++] = zone;
 620                                 }
 621                         case ZONE_NORMAL:
 622                                 zone = pgdat->node_zones + ZONE_NORMAL;
 623                                 if (zone->size)
 624                                         zonelist->zones[j++] = zone;
 625                         case ZONE_DMA:
 626                                 zone = pgdat->node_zones + ZONE_DMA;
 627                                 if (zone->size)
 628                                         zonelist->zones[j++] = zone;
 629                 }
 630                 zonelist->zones[j++] = NULL;
 631         }
 632 }
 633
 634 /*
 635  * Helper functions to size the waitqueue hash table.
 636  * Essentially these want to choose hash table sizes sufficiently
 637  * large so that collisions trying to wait on pages are rare.
 638  * But in fact, the number of active page waitqueues on typical
 639  * systems is ridiculously low, less than 200. So this is even
 640  * conservative, even though it seems large.
 641  *
 642  * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 643  * waitqueues, i.e. the size of the waitq table given the number of pages.
 644  */
 645 #define PAGES_PER_WAITQUEUE     256
 646
 647 static inline unsigned long wait_table_size(unsigned long pages)
 648 {
 649         unsigned long size = 1;
 650
 651         pages /= PAGES_PER_WAITQUEUE;
 652
 653         while (size < pages)
 654                 size <<= 1;
 655
 656         /*
 657          * Once we have dozens or even hundreds of threads sleeping
 658          * on IO we've got bigger problems than wait queue collision.
 659          * Limit the size of the wait table to a reasonable size.
 660          */
 661         size = min(size, 4096UL);
 662
 663         return size;
 664 }
 665
 666 /*
 667  * This is an integer logarithm so that shifts can be used later
 668  * to extract the more random high bits from the multiplicative
 669  * hash function before the remainder is taken.
 670  */
 671 static inline unsigned long wait_table_bits(unsigned long size)
 672 {
 673         return ffz(~size);
 674 }
 675
 676 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 677
 678 /*
 679  * Set up the zone data structures:
 680  *   - mark all pages reserved
 681  *   - mark all memory queues empty
 682  *   - clear the memory bitmaps
 683  */
 684 void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
 685         unsigned long *zones_size, unsigned long zone_start_paddr,
 686         unsigned long *zholes_size, struct page *lmem_map)
 687 {
 688         unsigned long i, j;
 689         unsigned long map_size;
 690         unsigned long totalpages, offset, realtotalpages;
 691         const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
 692
 693         if (zone_start_paddr & ~PAGE_MASK)
 694                 BUG();
 695
 696         totalpages = 0;
 697         for (i = 0; i < MAX_NR_ZONES; i++) {
 698                 unsigned long size = zones_size[i];
 699                 totalpages += size;
 700         }
 701         realtotalpages = totalpages;
 702         if (zholes_size)
 703                 for (i = 0; i < MAX_NR_ZONES; i++)
 704                         realtotalpages -= zholes_size[i];
 705
 706         printk("On node %d totalpages: %lu\n", nid, realtotalpages);
 707
 708         /*
 709          * Some architectures (with lots of mem and discontinous memory
 710          * maps) have to search for a good mem_map area:
 711          * For discontigmem, the conceptual mem map array starts from
 712          * PAGE_OFFSET, we need to align the actual array onto a mem map
 713          * boundary, so that MAP_NR works.
 714          */
 715         map_size = (totalpages + 1)*sizeof(struct page);
 716         if (lmem_map == (struct page *)0) {
 717                 lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
 718                 lmem_map = (struct page *)(PAGE_OFFSET +
 719                         MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
 720         }
 721         *gmap = pgdat->node_mem_map = lmem_map;
 722         pgdat->node_size = totalpages;
 723         pgdat->node_start_paddr = zone_start_paddr;
 724         pgdat->node_start_mapnr = (lmem_map - mem_map);
 725         pgdat->nr_zones = 0;
 726
 727         offset = lmem_map - mem_map;
 728         for (j = 0; j < MAX_NR_ZONES; j++) {
 729                 zone_t *zone = pgdat->node_zones + j;
 730                 unsigned long mask;
 731                 unsigned long size, realsize;
 732
 733                 zone_table[nid * MAX_NR_ZONES + j] = zone;
 734                 realsize = size = zones_size[j];
 735                 if (zholes_size)
 736                         realsize -= zholes_size[j];
 737
 738                 printk("zone(%lu): %lu pages.\n", j, size);
 739                 zone->size = size;
 740                 zone->name = zone_names[j];
 741                 zone->lock = SPIN_LOCK_UNLOCKED;
 742                 zone->zone_pgdat = pgdat;
 743                 zone->free_pages = 0;
 744                 zone->need_balance = 0;
 745                 if (!size)
 746                         continue;
 747
 748                 /*
 749                  * The per-page waitqueue mechanism uses hashed waitqueues
 750                  * per zone.
 751                  */
 752                 zone->wait_table_size = wait_table_size(size);
 753                 zone->wait_table_shift =
 754                         BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
 755                 zone->wait_table = (wait_queue_head_t *)
 756                         alloc_bootmem_node(pgdat, zone->wait_table_size
 757                                                 * sizeof(wait_queue_head_t));
 758
 759                 for(i = 0; i < zone->wait_table_size; ++i)
 760                         init_waitqueue_head(zone->wait_table + i);
 761
 762                 pgdat->nr_zones = j+1;
 763
 764                 mask = (realsize / zone_balance_ratio[j]);
 765                 if (mask < zone_balance_min[j])
 766                         mask = zone_balance_min[j];
 767                 else if (mask > zone_balance_max[j])
 768                         mask = zone_balance_max[j];
 769                 zone->pages_min = mask;
 770                 zone->pages_low = mask*2;
 771                 zone->pages_high = mask*3;
 772
 773                 zone->zone_mem_map = mem_map + offset;
 774                 zone->zone_start_mapnr = offset;
 775                 zone->zone_start_paddr = zone_start_paddr;
 776
 777                 if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
 778                         printk("BUG: wrong zone alignment, it will crash\n");
 779
 780                 /*
 781                  * Initially all pages are reserved - free ones are freed
 782                  * up by free_all_bootmem() once the early boot process is
 783                  * done. Non-atomic initialization, single-pass.
 784                  */
 785                 for (i = 0; i < size; i++) {
 786                         struct page *page = mem_map + offset + i;
 787                         set_page_zone(page, nid * MAX_NR_ZONES + j);
 788                         set_page_count(page, 0);
 789                         SetPageReserved(page);
 790                         INIT_LIST_HEAD(&page->list);
 791                         if (j != ZONE_HIGHMEM)
 792                                 set_page_address(page, __va(zone_start_paddr));
 793                         zone_start_paddr += PAGE_SIZE;
 794                 }
 795
 796                 offset += size;
 797                 for (i = 0; ; i++) {
 798                         unsigned long bitmap_size;
 799
 800                         INIT_LIST_HEAD(&zone->free_area[i].free_list);
 801                         if (i == MAX_ORDER-1) {
 802                                 zone->free_area[i].map = NULL;
 803                                 break;
 804                         }
 805
 806                         /*
 807                          * Page buddy system uses "index >> (i+1)",
 808                          * where "index" is at most "size-1".
 809                          *
 810                          * The extra "+3" is to round down to byte
 811                          * size (8 bits per byte assumption). Thus
 812                          * we get "(size-1) >> (i+4)" as the last byte
 813                          * we can access.
 814                          *
 815                          * The "+1" is because we want to round the
 816                          * byte allocation up rather than down. So
 817                          * we should have had a "+7" before we shifted
 818                          * down by three. Also, we have to add one as
 819                          * we actually _use_ the last bit (it's [0,n]
 820                          * inclusive, not [0,n[).
 821                          *
 822                          * So we actually had +7+1 before we shift
 823                          * down by 3. But (n+8) >> 3 == (n >> 3) + 1
 824                          * (modulo overflows, which we do not have).
 825                          *
 826                          * Finally, we LONG_ALIGN because all bitmap
 827                          * operations are on longs.
 828                          */
 829                         bitmap_size = (size-1) >> (i+4);
 830                         bitmap_size = LONG_ALIGN(bitmap_size+1);
 831                         zone->free_area[i].map =
 832                           (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
 833                 }
 834         }
 835         build_zonelists(pgdat);
 836 }
 837
 838 void __init free_area_init(unsigned long *zones_size)
 839 {
 840         free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
 841 }
 842
 843 static int __init setup_mem_frac(char *str)
 844 {
 845         int j = 0;
 846
 847         while (get_option(&str, &zone_balance_ratio[j++]) == 2);
 848         printk("setup_mem_frac: ");
 849         for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
 850         printk("\n");
 851         return 1;
 852 }
 853
 854 __setup("memfrac=", setup_mem_frac);