Merge branch 'master'
[powerpc.git] / arch/powerpc/mm/hugetlbpage.c
index 0ea0994..54131b8 100644
@@ -47,10 +47,25 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
                pu = pud_offset(pg, addr);
                if (!pud_none(*pu)) {
                        pm = pmd_offset(pu, addr);
+#ifdef CONFIG_PPC_64K_PAGES
+                       /* Currently, we use the normal PTE offset within full
+                        * size PTE pages, thus our huge PTEs are scattered in
+                        * the PTE page and we do waste some. We may change
+                        * that in the future, but the current mechanism keeps
+                        * things much simpler
+                        */
+                       if (!pmd_none(*pm)) {
+                               /* Note: pte_offset_* are all equivalent on
+                                * ppc64 as we don't have HIGHMEM
+                                */
+                               pt = pte_offset_kernel(pm, addr);
+                               return pt;
+                       }
+#else /* CONFIG_PPC_64K_PAGES */
+                       /* On 4k pages, we put huge PTEs in the PMD page */
                        pt = (pte_t *)pm;
-                       BUG_ON(!pmd_none(*pm)
-                              && !(pte_present(*pt) && pte_huge(*pt)));
                        return pt;
+#endif /* CONFIG_PPC_64K_PAGES */
                }
        }
 
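
The #ifdef above splits huge-PTE placement by base page size: with 4k pages the
huge PTE is the PMD entry itself, while with 64k pages the walk descends into a
normal full-size PTE page and reuses the usual slot. A toy userspace model of
the two lookups (shift and index widths are illustrative, not the real ppc64
geometry):

  #include <stdio.h>

  #define PMD_SHIFT_4K   24   /* pretend one PMD entry spans a 16MB huge page */
  #define PMD_SHIFT_64K  28
  #define PAGE_SHIFT_64K 16

  int main(void)
  {
          unsigned long addr = 0x123456789aUL;

          /* 4k base pages: the huge PTE *is* the PMD entry
           * (pt = (pte_t *)pm above), so the walk stops one level early. */
          printf("4k  config: pmd index %lu (walk stops here)\n",
                 (addr >> PMD_SHIFT_4K) & 0x7f);

          /* 64k base pages: take the normal PTE slot (pte_offset_kernel
           * above); neighbouring slots go unused, which is the waste the
           * comment mentions. */
          printf("64k config: pmd index %lu, pte index %lu\n",
                 (addr >> PMD_SHIFT_64K) & 0x7f,
                 (addr >> PAGE_SHIFT_64K) & 0xfff);
          return 0;
  }
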
@@ -74,9 +89,16 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
        if (pu) {
                pm = pmd_alloc(mm, pu, addr);
                if (pm) {
+#ifdef CONFIG_PPC_64K_PAGES
+                       /* See comment in huge_pte_offset. Note that if we ever
+                        * want to put the page size in the PMD, we would have
+                        * to open code our own pte_alloc* function in order
+                        * to populate and set the size atomically
+                        */
+                       pt = pte_alloc_map(mm, pm, addr);
+#else /* CONFIG_PPC_64K_PAGES */
                        pt = (pte_t *)pm;
-                       BUG_ON(!pmd_none(*pm)
-                              && !(pte_present(*pt) && pte_huge(*pt)));
+#endif /* CONFIG_PPC_64K_PAGES */
                        return pt;
                }
        }
@@ -84,35 +106,29 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
        return NULL;
 }
 
-#define HUGEPTE_BATCH_SIZE     (HPAGE_SIZE / PMD_SIZE)
-
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pte)
 {
-       int i;
-
        if (pte_present(*ptep)) {
-               pte_clear(mm, addr, ptep);
+               /* We open-code pte_clear because we need to pass the right
+                * argument to hpte_update (huge / !huge)
+                */
+               unsigned long old = pte_update(ptep, ~0UL);
+               if (old & _PAGE_HASHPTE)
+                       hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
                flush_tlb_pending();
        }
-
-       for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
-               *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-               ptep++;
-       }
+       *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 }
 
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
 {
        unsigned long old = pte_update(ptep, ~0UL);
-       int i;
 
        if (old & _PAGE_HASHPTE)
-               hpte_update(mm, addr, old, 0);
-
-       for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
-               ptep[i] = __pte(0);
+               hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
+       *ptep = __pte(0);
 
        return __pte(old);
 }
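
With HUGEPTE_BATCH_SIZE gone, a huge page is represented by exactly one PTE, so
both helpers collapse to a single atomic update plus a conditional hash
invalidate. A minimal sketch of the get-and-clear pattern, with a GCC builtin
standing in for pte_update and a printf for hpte_update (bit value assumed):

  #include <inttypes.h>
  #include <stdio.h>

  #define _PAGE_HASHPTE 0x400ULL   /* illustrative bit position */

  /* Model of huge_ptep_get_and_clear: atomically swap in the empty PTE,
   * then invalidate the hash entry only if the old PTE had one. */
  static uint64_t get_and_clear(uint64_t *ptep)
  {
          uint64_t old = __atomic_exchange_n(ptep, 0, __ATOMIC_SEQ_CST);

          if (old & _PAGE_HASHPTE)
                  printf("hpte_update(mm, addr & HPAGE_MASK, ...)\n");
          return old;
  }

  int main(void)
  {
          uint64_t pte = 0x1234000ULL | _PAGE_HASHPTE;
          uint64_t old = get_and_clear(&pte);

          printf("old %#" PRIx64 ", pte now %#" PRIx64 "\n", old, pte);
          return 0;
  }
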
@@ -132,43 +148,63 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
        return 0;
 }
 
+struct slb_flush_info {
+       struct mm_struct *mm;
+       u16 newareas;
+};
+
 static void flush_low_segments(void *parm)
 {
-       u16 areas = (unsigned long) parm;
+       struct slb_flush_info *fi = parm;
        unsigned long i;
 
-       asm volatile("isync" : : : "memory");
+       BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);
+
+       if (current->active_mm != fi->mm)
+               return;
 
-       BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+       /* Only need to do anything if this CPU is working in the same
+        * mm as the one which has changed */
 
+       /* update the paca copy of the context struct */
+       get_paca()->context = current->active_mm->context;
+
+       asm volatile("isync" : : : "memory");
        for (i = 0; i < NUM_LOW_AREAS; i++) {
-               if (! (areas & (1U << i)))
+               if (! (fi->newareas & (1U << i)))
                        continue;
                asm volatile("slbie %0"
                             : : "r" ((i << SID_SHIFT) | SLBIE_C));
        }
-
        asm volatile("isync" : : : "memory");
 }
 
 static void flush_high_segments(void *parm)
 {
-       u16 areas = (unsigned long) parm;
+       struct slb_flush_info *fi = parm;
        unsigned long i, j;
 
-       asm volatile("isync" : : : "memory");
 
-       BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+       BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);
+
+       if (current->active_mm != fi->mm)
+               return;
+
+       /* Only need to do anything if this CPU is working in the same
+        * mm as the one which has changed */
 
+       /* update the paca copy of the context struct */
+       get_paca()->context = current->active_mm->context;
+
+       asm volatile("isync" : : : "memory");
        for (i = 0; i < NUM_HIGH_AREAS; i++) {
-               if (! (areas & (1U << i)))
+               if (! (fi->newareas & (1U << i)))
                        continue;
                for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
                        asm volatile("slbie %0"
                                     :: "r" (((i << HTLB_AREA_SHIFT)
-                                            + (j << SID_SHIFT)) | SLBIE_C));
+                                             + (j << SID_SHIFT)) | SLBIE_C));
        }
-
        asm volatile("isync" : : : "memory");
 }
 
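
Two things change in these flush handlers. The area bitmap used to be smuggled
to the IPI handler by casting a u16 through the void * argument; now a pointer
to an on-stack slb_flush_info travels instead, which also lets the handler skip
CPUs that are not running the affected mm. A userspace sketch of the callback
pattern (a plain loop stands in for on_each_cpu, an int for struct mm_struct *):

  #include <stdint.h>
  #include <stdio.h>

  struct slb_flush_info {         /* mirrors the struct the patch adds */
          int mm_id;              /* stand-in for struct mm_struct * */
          uint16_t newareas;
  };

  static int active_mm_of[4] = { 1, 2, 1, 3 };   /* toy per-CPU active mm */

  static void flush_segments(int cpu, void *parm)
  {
          struct slb_flush_info *fi = parm;

          /* Only CPUs currently running the changed mm need to flush. */
          if (active_mm_of[cpu] != fi->mm_id)
                  return;
          printf("cpu%d: slbie areas %#x\n", cpu, fi->newareas);
  }

  int main(void)
  {
          struct slb_flush_info fi = { .mm_id = 1, .newareas = 0x0005 };
          int cpu;

          for (cpu = 0; cpu < 4; cpu++)   /* stands in for on_each_cpu() */
                  flush_segments(cpu, &fi);
          return 0;
  }

Passing a pointer to an on-stack struct is safe here only because
on_each_cpu(..., 0, 1) waits for every handler to finish before the caller
returns.
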
@@ -196,6 +232,12 @@ static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
 
        BUG_ON(area >= NUM_HIGH_AREAS);
 
+       /* Hack, so that each address is controlled by exactly one
+        * of the high or low area bitmaps, the first high area starts
+        * at 4GB, not 0 */
+       if (start == 0)
+               start = 0x100000000UL;
+
        /* Check no VMAs are in the region */
        vma = find_vma(mm, start);
        if (vma && (vma->vm_start < end))
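
To see why the bump is needed, plug in the constants of the era (assumed here:
SID_SHIFT 28, so sixteen 256MB low segments cover [0, 4GB); HTLB_AREA_SHIFT 40,
so each high area spans 1TB). High area 0 nominally covers [0, 1TB), which
would also claim every low segment; bumping start to 4GB leaves each address
owned by exactly one bitmap:

  #include <stdio.h>

  #define HTLB_AREA_SHIFT 40   /* 1TB high areas (assumed) */

  int main(void)
  {
          unsigned long area;

          for (area = 0; area < 2; area++) {
                  unsigned long start = area << HTLB_AREA_SHIFT;
                  unsigned long end = (area + 1) << HTLB_AREA_SHIFT;

                  /* the patch's hack: [0, 4GB) belongs to the low bitmap */
                  if (start == 0)
                          start = 0x100000000UL;

                  printf("high area %lu: VMA check covers [%#lx, %#lx)\n",
                         area, start, end);
          }
          return 0;
  }
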
@@ -207,6 +249,7 @@ static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
 static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
        unsigned long i;
+       struct slb_flush_info fi;
 
        BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
        BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
@@ -222,19 +265,20 @@ static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 
        mm->context.low_htlb_areas |= newareas;
 
-       /* update the paca copy of the context struct */
-       get_paca()->context = mm->context;
-
        /* the context change must make it to memory before the flush,
         * so that further SLB misses do the right thing. */
        mb();
-       on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+       fi.mm = mm;
+       fi.newareas = newareas;
+       on_each_cpu(flush_low_segments, &fi, 0, 1);
 
        return 0;
 }
 
 static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
+       struct slb_flush_info fi;
        unsigned long i;
 
        BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
@@ -258,22 +302,25 @@ static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
        /* the context change must make it to memory before the flush,
         * so that further SLB misses do the right thing. */
        mb();
-       on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
+
+       fi.mm = mm;
+       fi.newareas = newareas;
+       on_each_cpu(flush_high_segments, &fi, 0, 1);
 
        return 0;
 }
 
 int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
-       int err;
+       int err = 0;
 
        if ( (addr+len) < addr )
                return -EINVAL;
 
-       if ((addr + len) < 0x100000000UL)
+       if (addr < 0x100000000UL)
                err = open_low_hpage_areas(current->mm,
                                          LOW_ESID_MASK(addr, len));
-       else
+       if ((addr + len) > 0x100000000UL)
                err = open_high_hpage_areas(current->mm,
                                            HTLB_AREA_MASK(addr, len));
        if (err) {
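
The old if/else opened only one kind of area per call, but nothing stops a
request from straddling the 4GB line; the rewrite makes the two tests
independent so both sides get opened. A quick check with a hypothetical
straddling range:

  #include <stdio.h>

  #define BOUNDARY 0x100000000UL   /* 4GB: low/high area split */

  int main(void)
  {
          unsigned long addr = 0xf0000000UL;   /* starts below 4GB ... */
          unsigned long len  = 0x20000000UL;   /* ... and ends above it */

          /* old logic: exactly one branch ran */
          if (addr + len < BOUNDARY)
                  printf("old: open low areas only\n");
          else
                  printf("old: open high areas only (low part missed)\n");

          /* new logic: independent tests, both sides open */
          if (addr < BOUNDARY)
                  printf("new: open low areas\n");
          if (addr + len > BOUNDARY)
                  printf("new: open high areas\n");
          return 0;
  }
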
@@ -563,6 +610,8 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        int lastshift;
        u16 areamask, curareas;
 
+       if (HPAGE_SHIFT == 0)
+               return -EINVAL;
        if (len & ~HPAGE_MASK)
                return -EINVAL;
 
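
The new guard handles kernels where hugepage support was never set up at boot,
leaving HPAGE_SHIFT at 0 now that the huge page size is a runtime value; the
existing check then rejects lengths that are not a whole number of huge pages.
The mask arithmetic, worked through for 16MB huge pages (size assumed):

  #include <stdio.h>

  int main(void)
  {
          unsigned long hpage_shift = 24;   /* 16MB huge pages (assumed) */
          unsigned long hpage_mask = ~((1UL << hpage_shift) - 1);
          unsigned long lens[] = { 0x2000000UL /* 32MB */,
                                   0x2100000UL /* 33MB */ };
          int i;

          for (i = 0; i < 2; i++)
                  printf("len %#lx -> %s\n", lens[i],
                         (lens[i] & ~hpage_mask) ? "-EINVAL" : "ok");
          return 0;
  }
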
@@ -615,23 +664,47 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        return -ENOMEM;
 }
 
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
+                                                 pte_t pte, int trap)
+{
+       struct page *page;
+       int i;
+
+       if (!pfn_valid(pte_pfn(pte)))
+               return rflags;
+
+       page = pte_page(pte);
+
+       /* page is dirty */
+       if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+               if (trap == 0x400) {
+                       for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
+                               __flush_dcache_icache(page_address(page+i));
+                       set_bit(PG_arch_1, &page->flags);
+               } else {
+                       rflags |= HPTE_R_N;
+               }
+       }
+       return rflags;
+}
+
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
-                  unsigned long ea, unsigned long vsid, int local)
+                  unsigned long ea, unsigned long vsid, int local,
+                  unsigned long trap)
 {
        pte_t *ptep;
-       unsigned long va, vpn;
-       pte_t old_pte, new_pte;
-       unsigned long rflags, prpn;
+       unsigned long old_pte, new_pte;
+       unsigned long va, rflags, pa;
        long slot;
        int err = 1;
 
-       spin_lock(&mm->page_table_lock);
-
        ptep = huge_pte_offset(mm, ea);
 
        /* Search the Linux page table for a match with va */
        va = (vsid << 28) | (ea & 0x0fffffff);
-       vpn = va >> HPAGE_SHIFT;
 
        /*
         * If no pte found or not present, send the problem up to
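
hash_huge_page_do_lazy_icache above defers icache maintenance until a page is
actually executed from: PG_arch_1 doubles as an "icache is clean" bit, an
instruction fault (trap 0x400) flushes the huge page in PAGE_SIZE chunks and
sets the bit, and any other fault just maps the page no-execute so a later
execute attempt faults back in. A userspace model of the policy (bit values
illustrative):

  #include <stdio.h>

  #define PG_ARCH_1 0x1u   /* "icache clean" flag */
  #define HPTE_R_N  0x4u   /* no-execute (illustrative value) */

  struct page { unsigned int flags; };

  static unsigned int lazy_icache(unsigned int rflags, struct page *page,
                                  int trap, int subpages)
  {
          int i;

          if (page->flags & PG_ARCH_1)     /* icache already clean */
                  return rflags;

          if (trap == 0x400) {             /* instruction fault: flush now */
                  for (i = 0; i < subpages; i++)
                          ;                /* __flush_dcache_icache() here */
                  page->flags |= PG_ARCH_1;
          } else {
                  rflags |= HPTE_R_N;      /* otherwise map no-exec for now */
          }
          return rflags;
  }

  int main(void)
  {
          struct page pg = { 0 };          /* 4096 = 16MB / 4k subpages */

          printf("data fault: rflags %#x\n", lazy_icache(0, &pg, 0x300, 4096));
          printf("inst fault: rflags %#x\n", lazy_icache(0, &pg, 0x400, 4096));
          printf("inst again: rflags %#x\n", lazy_icache(0, &pg, 0x400, 4096));
          return 0;
  }
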
@@ -640,8 +713,6 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
        if (unlikely(!ptep || pte_none(*ptep)))
                goto out;
 
-/*     BUG_ON(pte_bad(*ptep)); */
-
        /* 
         * Check the user's access rights to the page.  If access should be
         * prevented then send the problem up to do_page_fault.
@@ -661,58 +732,70 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
         */
 
 
-       old_pte = *ptep;
-       new_pte = old_pte;
-
-       rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
+       do {
+               old_pte = pte_val(*ptep);
+               if (old_pte & _PAGE_BUSY)
+                       goto out;
+               new_pte = old_pte | _PAGE_BUSY |
+                       _PAGE_ACCESSED | _PAGE_HASHPTE;
+       } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
+                                        old_pte, new_pte));
+
+       rflags = 0x2 | (!(new_pte & _PAGE_RW));
        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-       rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
+       rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+       if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+               /* No CPU that supports hugepages lacks no-execute, so we
+                * don't need to worry about that case */
+               rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
+                                                      trap);
 
        /* Check if pte already has an hpte (case 2) */
-       if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
+       if (unlikely(old_pte & _PAGE_HASHPTE)) {
                /* There MIGHT be an HPTE for this pte */
                unsigned long hash, slot;
 
-               hash = hpt_hash(vpn, 1);
-               if (pte_val(old_pte) & _PAGE_SECONDARY)
+               hash = hpt_hash(va, HPAGE_SHIFT);
+               if (old_pte & _PAGE_F_SECOND)
                        hash = ~hash;
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
+               slot += (old_pte & _PAGE_F_GIX) >> 12;
 
-               if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
-                       pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
+               if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
+                                        local) == -1)
+                       old_pte &= ~_PAGE_HPTEFLAGS;
        }
 
-       if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
-               unsigned long hash = hpt_hash(vpn, 1);
+       if (likely(!(old_pte & _PAGE_HASHPTE))) {
+               unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
                unsigned long hpte_group;
 
-               prpn = pte_pfn(old_pte);
+               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 
 repeat:
                hpte_group = ((hash & htab_hash_mask) *
                              HPTES_PER_GROUP) & ~0x7UL;
 
-               /* Update the linux pte with the HPTE slot */
-               pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
-               pte_val(new_pte) |= _PAGE_HASHPTE;
+               /* clear HPTE slot information in new PTE */
+               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 
                /* Add in WIMG bits */
                /* XXX We should store these in the pte */
+               /* --BenH: I think they are ... */
                rflags |= _PAGE_COHERENT;
 
-               slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-                                         HPTE_V_LARGE, rflags);
+               /* Insert into the hash table, primary slot */
+               slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
+                                         mmu_huge_psize);
 
                /* Primary is full, try the secondary */
                if (unlikely(slot == -1)) {
-                       pte_val(new_pte) |= _PAGE_SECONDARY;
+                       new_pte |= _PAGE_F_SECOND;
                        hpte_group = ((~hash & htab_hash_mask) *
                                      HPTES_PER_GROUP) & ~0x7UL; 
-                       slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-                                                 HPTE_V_LARGE |
+                       slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
                                                  HPTE_V_SECONDARY,
-                                                 rflags);
+                                                 mmu_huge_psize);
                        if (slot == -1) {
                                if (mftb() & 0x1)
                                        hpte_group = ((hash & htab_hash_mask) *
@@ -726,20 +809,16 @@ repeat:
                if (unlikely(slot == -2))
                        panic("hash_huge_page: pte_insert failed\n");
 
-               pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
-
-               /* 
-                * No need to use ldarx/stdcx here because all who
-                * might be updating the pte will hold the
-                * page_table_lock
-                */
-               *ptep = new_pte;
+               new_pte |= (slot << 12) & _PAGE_F_GIX;
        }
 
+       /*
+        * No need to use ldarx/stdcx here
+        */
+       *ptep = __pte(new_pte & ~_PAGE_BUSY);
+
        err = 0;
 
  out:
-       spin_unlock(&mm->page_table_lock);
-
        return err;
 }
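
Last, the slot bookkeeping that lets a later fault find the HPTE again:
_PAGE_F_SECOND records that the secondary bucket (~hash) was used, and
_PAGE_F_GIX holds the 3-bit index within the 8-entry HPTE group. The
(slot << 12) packing and the >> 12 extraction earlier in the patch imply the
field sits at bits 12-14; a small check of that arithmetic (bit positions as
implied, not taken from the headers):

  #include <stdio.h>

  #define _PAGE_F_GIX    0x7000UL   /* bits 12-14, per the >> 12 above */
  #define _PAGE_F_SECOND 0x8000UL   /* illustrative position */

  int main(void)
  {
          unsigned long pte = 0, slot = 5;   /* entry 5 of the HPTE group */

          pte |= (slot << 12) & _PAGE_F_GIX; /* remember the slot in the PTE */
          printf("stored gix = %lu\n", (pte & _PAGE_F_GIX) >> 12);

          pte |= _PAGE_F_SECOND;             /* secondary bucket was used */
          printf("lookup recomputes %s\n",
                 (pte & _PAGE_F_SECOND) ? "~hash (secondary)" : "hash (primary)");
          return 0;
  }
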