Merge branch 'master'
[powerpc.git] / arch/powerpc/mm/hugetlbpage.c
index 0ea0994..54131b8 100644
@@ -47,10 +47,25 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
                pu = pud_offset(pg, addr);
                if (!pud_none(*pu)) {
                        pm = pmd_offset(pu, addr);
+#ifdef CONFIG_PPC_64K_PAGES
+                       /* Currently, we use the normal PTE offset within full
+                        * size PTE pages, thus our huge PTEs are scattered in
+                        * the PTE page and we do waste some. We may change
+                        * that in the future, but the current mechanism keeps
+                        * things much simpler
+                        */
+                       if (!pmd_none(*pm)) {
+                               /* Note: pte_offset_* are all equivalent on
+                                * ppc64 as we don't have HIGHMEM
+                                */
+                               pt = pte_offset_kernel(pm, addr);
+                               return pt;
+                       }
+#else /* CONFIG_PPC_64K_PAGES */
+                       /* On 4k pages, we put huge PTEs in the PMD page */
                        pt = (pte_t *)pm;
-                       BUG_ON(!pmd_none(*pm)
-                              && !(pte_present(*pt) && pte_huge(*pt)));
                        return pt;
+#endif /* CONFIG_PPC_64K_PAGES */
                }
        }
 
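
The #ifdef above splits huge-PTE placement by base page size: with 4k pages the
huge PTE is the PMD entry itself, while with 64k pages the walk descends into a
normal full-size PTE page and reuses the usual slot. A toy userspace model of
the two lookups (shift and index widths are illustrative, not the real ppc64
geometry):

  #include <stdio.h>

  #define PMD_SHIFT_4K   24   /* pretend one PMD entry spans a 16MB huge page */
  #define PMD_SHIFT_64K  28
  #define PAGE_SHIFT_64K 16

  int main(void)
  {
          unsigned long addr = 0x123456789aUL;

          /* 4k base pages: the huge PTE *is* the PMD entry
           * (pt = (pte_t *)pm above), so the walk stops one level early. */
          printf("4k  config: pmd index %lu (walk stops here)\n",
                 (addr >> PMD_SHIFT_4K) & 0x7f);

          /* 64k base pages: take the normal PTE slot (pte_offset_kernel
           * above); neighbouring slots go unused, which is the waste the
           * comment mentions. */
          printf("64k config: pmd index %lu, pte index %lu\n",
                 (addr >> PMD_SHIFT_64K) & 0x7f,
                 (addr >> PAGE_SHIFT_64K) & 0xfff);
          return 0;
  }
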
@@ -74,9 +89,16 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
        if (pu) {
                pm = pmd_alloc(mm, pu, addr);
                if (pm) {
+#ifdef CONFIG_PPC_64K_PAGES
+                       /* See comment in huge_pte_offset. Note that if we ever
+                        * want to put the page size in the PMD, we would have
+                        * to open code our own pte_alloc* function in order
+                        * to populate and set the size atomically
+                        */
+                       pt = pte_alloc_map(mm, pm, addr);
+#else /* CONFIG_PPC_64K_PAGES */
                        pt = (pte_t *)pm;
-                       BUG_ON(!pmd_none(*pm)
-                              && !(pte_present(*pt) && pte_huge(*pt)));
+#endif /* CONFIG_PPC_64K_PAGES */
                        return pt;
                }
        }
@@ -84,35 +106,29 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
        return NULL;
 }
 
-#define HUGEPTE_BATCH_SIZE     (HPAGE_SIZE / PMD_SIZE)
-
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pte)
 {
-       int i;
-
        if (pte_present(*ptep)) {
-               pte_clear(mm, addr, ptep);
+               /* We open-code pte_clear because we need to pass the right
+                * argument to hpte_update (huge / !huge)
+                */
+               unsigned long old = pte_update(ptep, ~0UL);
+               if (old & _PAGE_HASHPTE)
+                       hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
                flush_tlb_pending();
        }
-
-       for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
-               *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-               ptep++;
-       }
+       *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 }
 
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
 {
        unsigned long old = pte_update(ptep, ~0UL);
-       int i;
 
        if (old & _PAGE_HASHPTE)
-               hpte_update(mm, addr, old, 0);
-
-       for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
-               ptep[i] = __pte(0);
+               hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
+       *ptep = __pte(0);
 
        return __pte(old);
 }
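
With HUGEPTE_BATCH_SIZE gone, a huge page is represented by exactly one PTE, so
both helpers collapse to a single atomic update plus a conditional hash
invalidate. A minimal sketch of the get-and-clear pattern, with a GCC builtin
standing in for pte_update and a printf for hpte_update (bit value assumed):

  #include <inttypes.h>
  #include <stdio.h>

  #define _PAGE_HASHPTE 0x400ULL   /* illustrative bit position */

  /* Model of huge_ptep_get_and_clear: atomically swap in the empty PTE,
   * then invalidate the hash entry only if the old PTE had one. */
  static uint64_t get_and_clear(uint64_t *ptep)
  {
          uint64_t old = __atomic_exchange_n(ptep, 0, __ATOMIC_SEQ_CST);

          if (old & _PAGE_HASHPTE)
                  printf("hpte_update(mm, addr & HPAGE_MASK, ...)\n");
          return old;
  }

  int main(void)
  {
          uint64_t pte = 0x1234000ULL | _PAGE_HASHPTE;
          uint64_t old = get_and_clear(&pte);

          printf("old %#" PRIx64 ", pte now %#" PRIx64 "\n", old, pte);
          return 0;
  }
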
@@ -132,43 +148,63 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
        return 0;
 }
 
+struct slb_flush_info {
+       struct mm_struct *mm;
+       u16 newareas;
+};
+
 static void flush_low_segments(void *parm)
 {
-       u16 areas = (unsigned long) parm;
+       struct slb_flush_info *fi = parm;
        unsigned long i;
 
-       asm volatile("isync" : : : "memory");
+       BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);
+
+       if (current->active_mm != fi->mm)
+               return;
 
-       BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+       /* Only need to do anything if this CPU is working in the same
+        * mm as the one which has changed */
 
+       /* update the paca copy of the context struct */
+       get_paca()->context = current->active_mm->context;
+
+       asm volatile("isync" : : : "memory");
        for (i = 0; i < NUM_LOW_AREAS; i++) {
-               if (! (areas & (1U << i)))
+               if (! (fi->newareas & (1U << i)))
                        continue;
                asm volatile("slbie %0"
                             : : "r" ((i << SID_SHIFT) | SLBIE_C));
        }
-
        asm volatile("isync" : : : "memory");
 }
 
 static void flush_high_segments(void *parm)
 {
-       u16 areas = (unsigned long) parm;
+       struct slb_flush_info *fi = parm;
        unsigned long i, j;
 
-       asm volatile("isync" : : : "memory");
 
-       BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+       BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);
+
+       if (current->active_mm != fi->mm)
+               return;
+
+       /* Only need to do anything if this CPU is working in the same
+        * mm as the one which has changed */
 
+       /* update the paca copy of the context struct */
+       get_paca()->context = current->active_mm->context;
+
+       asm volatile("isync" : : : "memory");
        for (i = 0; i < NUM_HIGH_AREAS; i++) {
-               if (! (areas & (1U << i)))
+               if (! (fi->newareas & (1U << i)))
                        continue;
                for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
                        asm volatile("slbie %0"
                                     :: "r" (((i << HTLB_AREA_SHIFT)
-                                            + (j << SID_SHIFT)) | SLBIE_C));
+                                             + (j << SID_SHIFT)) | SLBIE_C));
        }
-
        asm volatile("isync" : : : "memory");
 }
 
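
Two things change in these flush handlers. The area bitmap used to be smuggled
to the IPI handler by casting a u16 through the void * argument; now a pointer
to an on-stack slb_flush_info travels instead, which also lets the handler skip
CPUs that are not running the affected mm. A userspace sketch of the callback
pattern (a plain loop stands in for on_each_cpu, an int for struct mm_struct *):

  #include <stdint.h>
  #include <stdio.h>

  struct slb_flush_info {         /* mirrors the struct the patch adds */
          int mm_id;              /* stand-in for struct mm_struct * */
          uint16_t newareas;
  };

  static int active_mm_of[4] = { 1, 2, 1, 3 };   /* toy per-CPU active mm */

  static void flush_segments(int cpu, void *parm)
  {
          struct slb_flush_info *fi = parm;

          /* Only CPUs currently running the changed mm need to flush. */
          if (active_mm_of[cpu] != fi->mm_id)
                  return;
          printf("cpu%d: slbie areas %#x\n", cpu, fi->newareas);
  }

  int main(void)
  {
          struct slb_flush_info fi = { .mm_id = 1, .newareas = 0x0005 };
          int cpu;

          for (cpu = 0; cpu < 4; cpu++)   /* stands in for on_each_cpu() */
                  flush_segments(cpu, &fi);
          return 0;
  }

Passing a pointer to an on-stack struct is safe here only because
on_each_cpu(..., 0, 1) waits for every handler to finish before the caller
returns.
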
@@ -196,6 +232,12 @@ static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
 
        BUG_ON(area >= NUM_HIGH_AREAS);
 
+       /* Hack, so that each address is controlled by exactly one
+        * of the high or low area bitmaps, the first high area starts
+        * at 4GB, not 0 */
+       if (start == 0)
+               start = 0x100000000UL;
+
        /* Check no VMAs are in the region */
        vma = find_vma(mm, start);
        if (vma && (vma->vm_start < end))
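
To see why the bump is needed, plug in the constants of the era (assumed here:
SID_SHIFT 28, so sixteen 256MB low segments cover [0, 4GB); HTLB_AREA_SHIFT 40,
so each high area spans 1TB). High area 0 nominally covers [0, 1TB), which
would also claim every low segment; bumping start to 4GB leaves each address
owned by exactly one bitmap:

  #include <stdio.h>

  #define HTLB_AREA_SHIFT 40   /* 1TB high areas (assumed) */

  int main(void)
  {
          unsigned long area;

          for (area = 0; area < 2; area++) {
                  unsigned long start = area << HTLB_AREA_SHIFT;
                  unsigned long end = (area + 1) << HTLB_AREA_SHIFT;

                  /* the patch's hack: [0, 4GB) belongs to the low bitmap */
                  if (start == 0)
                          start = 0x100000000UL;

                  printf("high area %lu: VMA check covers [%#lx, %#lx)\n",
                         area, start, end);
          }
          return 0;
  }
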
@@ -207,6 +249,7 @@ static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
 static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
        unsigned long i;
+       struct slb_flush_info fi;
 
        BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
        BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
@@ -222,19 +265,20 @@ static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 
        mm->context.low_htlb_areas |= newareas;
 
-       /* update the paca copy of the context struct */
-       get_paca()->context = mm->context;
-
        /* the context change must make it to memory before the flush,
         * so that further SLB misses do the right thing. */
        mb();
-       on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+       fi.mm = mm;
+       fi.newareas = newareas;
+       on_each_cpu(flush_low_segments, &fi, 0, 1);
 
        return 0;
 }
 
 static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
+       struct slb_flush_info fi;
        unsigned long i;
 
        BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
@@ -258,22 +302,25 @@ static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
        /* the context change must make it to memory before the flush,
         * so that further SLB misses do the right thing. */
        mb();
-       on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
+
+       fi.mm = mm;
+       fi.newareas = newareas;
+       on_each_cpu(flush_high_segments, &fi, 0, 1);
 
        return 0;
 }
 
 int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
-       int err;
+       int err = 0;
 
        if ( (addr+len) < addr )
                return -EINVAL;
 
-       if ((addr + len) < 0x100000000UL)
+       if (addr < 0x100000000UL)
                err = open_low_hpage_areas(current->mm,
                                          LOW_ESID_MASK(addr, len));
-       else
+       if ((addr + len) > 0x100000000UL)
                err = open_high_hpage_areas(current->mm,
                                            HTLB_AREA_MASK(addr, len));
        if (err) {
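
The old if/else opened only one kind of area per call, but nothing stops a
request from straddling the 4GB line; the rewrite makes the two tests
independent so both sides get opened. A quick check with a hypothetical
straddling range:

  #include <stdio.h>

  #define BOUNDARY 0x100000000UL   /* 4GB: low/high area split */

  int main(void)
  {
          unsigned long addr = 0xf0000000UL;   /* starts below 4GB ... */
          unsigned long len  = 0x20000000UL;   /* ... and ends above it */

          /* old logic: exactly one branch ran */
          if (addr + len < BOUNDARY)
                  printf("old: open low areas only\n");
          else
                  printf("old: open high areas only (low part missed)\n");

          /* new logic: independent tests, both sides open */
          if (addr < BOUNDARY)
                  printf("new: open low areas\n");
          if (addr + len > BOUNDARY)
                  printf("new: open high areas\n");
          return 0;
  }
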
@@ -563,6 +610,8 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        int lastshift;
        u16 areamask, curareas;
 
+       if (HPAGE_SHIFT == 0)
+               return -EINVAL;
        if (len & ~HPAGE_MASK)
                return -EINVAL;
 
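
The new guard handles kernels where hugepage support was never set up at boot,
leaving HPAGE_SHIFT at 0 now that the huge page size is a runtime value; the
existing check then rejects lengths that are not a whole number of huge pages.
The mask arithmetic, worked through for 16MB huge pages (size assumed):

  #include <stdio.h>

  int main(void)
  {
          unsigned long hpage_shift = 24;   /* 16MB huge pages (assumed) */
          unsigned long hpage_mask = ~((1UL << hpage_shift) - 1);
          unsigned long lens[] = { 0x2000000UL /* 32MB */,
                                   0x2100000UL /* 33MB */ };
          int i;

          for (i = 0; i < 2; i++)
                  printf("len %#lx -> %s\n", lens[i],
                         (lens[i] & ~hpage_mask) ? "-EINVAL" : "ok");
          return 0;
  }
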
@@ -615,23 +664,47 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        return -ENOMEM;
 }
 
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
+                                                 pte_t pte, int trap)
+{
+       struct page *page;
+       int i;
+
+       if (!pfn_valid(pte_pfn(pte)))
+               return rflags;
+
+       page = pte_page(pte);
+
+       /* page is dirty */
+       if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+               if (trap == 0x400) {
+                       for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
+                               __flush_dcache_icache(page_address(page+i));
+                       set_bit(PG_arch_1, &page->flags);
+               } else {
+                       rflags |= HPTE_R_N;
+               }
+       }
+       return rflags;
+}
+
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
-                  unsigned long ea, unsigned long vsid, int local)
+                  unsigned long ea, unsigned long vsid, int local,
+                  unsigned long trap)
 {
        pte_t *ptep;
-       unsigned long va, vpn;
-       pte_t old_pte, new_pte;
-       unsigned long rflags, prpn;
+       unsigned long old_pte, new_pte;
+       unsigned long va, rflags, pa;
        long slot;
        int err = 1;
 
-       spin_lock(&mm->page_table_lock);
-
        ptep = huge_pte_offset(mm, ea);
 
        /* Search the Linux page table for a match with va */
        va = (vsid << 28) | (ea & 0x0fffffff);
-       vpn = va >> HPAGE_SHIFT;
 
        /*
         * If no pte found or not present, send the problem up to
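
hash_huge_page_do_lazy_icache above defers icache maintenance until a page is
actually executed from: PG_arch_1 doubles as an "icache is clean" bit, an
instruction fault (trap 0x400) flushes the huge page in PAGE_SIZE chunks and
sets the bit, and any other fault just maps the page no-execute so a later
execute attempt faults back in. A userspace model of the policy (bit values
illustrative):

  #include <stdio.h>

  #define PG_ARCH_1 0x1u   /* "icache clean" flag */
  #define HPTE_R_N  0x4u   /* no-execute (illustrative value) */

  struct page { unsigned int flags; };

  static unsigned int lazy_icache(unsigned int rflags, struct page *page,
                                  int trap, int subpages)
  {
          int i;

          if (page->flags & PG_ARCH_1)     /* icache already clean */
                  return rflags;

          if (trap == 0x400) {             /* instruction fault: flush now */
                  for (i = 0; i < subpages; i++)
                          ;                /* __flush_dcache_icache() here */
                  page->flags |= PG_ARCH_1;
          } else {
                  rflags |= HPTE_R_N;      /* otherwise map no-exec for now */
          }
          return rflags;
  }

  int main(void)
  {
          struct page pg = { 0 };          /* 4096 = 16MB / 4k subpages */

          printf("data fault: rflags %#x\n", lazy_icache(0, &pg, 0x300, 4096));
          printf("inst fault: rflags %#x\n", lazy_icache(0, &pg, 0x400, 4096));
          printf("inst again: rflags %#x\n", lazy_icache(0, &pg, 0x400, 4096));
          return 0;
  }
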
@@ -640,8 +713,6 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
        if (unlikely(!ptep || pte_none(*ptep)))
                goto out;
 
-/*     BUG_ON(pte_bad(*ptep)); */
-
        /* 
         * Check the user's access rights to the page.  If access should be
         * prevented then send the problem up to do_page_fault.
@@ -661,58 +732,70 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
         */
 
 
-       old_pte = *ptep;
-       new_pte = old_pte;
-
-       rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
+       do {
+               old_pte = pte_val(*ptep);
+               if (old_pte & _PAGE_BUSY)
+                       goto out;
+               new_pte = old_pte | _PAGE_BUSY |
+                       _PAGE_ACCESSED | _PAGE_HASHPTE;
+       } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
+                                        old_pte, new_pte));
+
+       rflags = 0x2 | (!(new_pte & _PAGE_RW));
        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-       rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
+       rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+       if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+               /* No CPU that supports hugepages lacks no-execute, so we
+                * don't need to worry about that case */
+               rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
+                                                      trap);
 
        /* Check if pte already has an hpte (case 2) */
-       if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
+       if (unlikely(old_pte & _PAGE_HASHPTE)) {
                /* There MIGHT be an HPTE for this pte */
                unsigned long hash, slot;
 
-               hash = hpt_hash(vpn, 1);
-               if (pte_val(old_pte) & _PAGE_SECONDARY)
+               hash = hpt_hash(va, HPAGE_SHIFT);
+               if (old_pte & _PAGE_F_SECOND)
                        hash = ~hash;
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
+               slot += (old_pte & _PAGE_F_GIX) >> 12;
 
-               if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
-                       pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
+               if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
+                                        local) == -1)
+                       old_pte &= ~_PAGE_HPTEFLAGS;
        }
 
-       if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
-               unsigned long hash = hpt_hash(vpn, 1);
+       if (likely(!(old_pte & _PAGE_HASHPTE))) {
+               unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
                unsigned long hpte_group;
 
-               prpn = pte_pfn(old_pte);
+               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 
 repeat:
                hpte_group = ((hash & htab_hash_mask) *
                              HPTES_PER_GROUP) & ~0x7UL;
 
-               /* Update the linux pte with the HPTE slot */
-               pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
-               pte_val(new_pte) |= _PAGE_HASHPTE;
+               /* clear HPTE slot information in new PTE */
+               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 
                /* Add in WIMG bits */
                /* XXX We should store these in the pte */
+               /* --BenH: I think they are ... */
                rflags |= _PAGE_COHERENT;
 
-               slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-                                         HPTE_V_LARGE, rflags);
+               /* Insert into the hash table, primary slot */
+               slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
+                                         mmu_huge_psize);
 
                /* Primary is full, try the secondary */
                if (unlikely(slot == -1)) {
-                       pte_val(new_pte) |= _PAGE_SECONDARY;
+                       new_pte |= _PAGE_F_SECOND;
                        hpte_group = ((~hash & htab_hash_mask) *
                                      HPTES_PER_GROUP) & ~0x7UL; 
-                       slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-                                                 HPTE_V_LARGE |
+                       slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
                                                  HPTE_V_SECONDARY,
-                                                 rflags);
+                                                 mmu_huge_psize);
                        if (slot == -1) {
                                if (mftb() & 0x1)
                                        hpte_group = ((hash & htab_hash_mask) *
@@ -726,20 +809,16 @@ repeat:
                if (unlikely(slot == -2))
                        panic("hash_huge_page: pte_insert failed\n");
 
-               pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
-
-               /* 
-                * No need to use ldarx/stdcx here because all who
-                * might be updating the pte will hold the
-                * page_table_lock
-                */
-               *ptep = new_pte;
+               new_pte |= (slot << 12) & _PAGE_F_GIX;
        }
 
+       /*
+        * No need to use ldarx/stdcx here
+        */
+       *ptep = __pte(new_pte & ~_PAGE_BUSY);
+
        err = 0;
 
  out:
-       spin_unlock(&mm->page_table_lock);
-
        return err;
 }
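
Last, the slot bookkeeping that lets a later fault find the HPTE again:
_PAGE_F_SECOND records that the secondary bucket (~hash) was used, and
_PAGE_F_GIX holds the 3-bit index within the 8-entry HPTE group. The
(slot << 12) packing and the >> 12 extraction earlier in the patch imply the
field sits at bits 12-14; a small check of that arithmetic (bit positions as
implied, not taken from the headers):

  #include <stdio.h>

  #define _PAGE_F_GIX    0x7000UL   /* bits 12-14, per the >> 12 above */
  #define _PAGE_F_SECOND 0x8000UL   /* illustrative position */

  int main(void)
  {
          unsigned long pte = 0, slot = 5;   /* entry 5 of the HPTE group */

          pte |= (slot << 12) & _PAGE_F_GIX; /* remember the slot in the PTE */
          printf("stored gix = %lu\n", (pte & _PAGE_F_GIX) >> 12);

          pte |= _PAGE_F_SECOND;             /* secondary bucket was used */
          printf("lookup recomputes %s\n",
                 (pte & _PAGE_F_SECOND) ? "~hash (secondary)" : "hash (primary)");
          return 0;
  }
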