X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=arch%2Fpowerpc%2Fmm%2Fhugetlbpage.c;h=54131b877da36cb3a827000d802ca76ce82507ad;hb=ac67c6247361b3b8644b34e5301a46d5069c1373;hp=0ea0994ed974e0185bad810cc1757cc4f20f5063;hpb=b6ec995a21a9428aef620b5adf46d047a18d88b8;p=powerpc.git

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0ea0994ed9..54131b877d 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -47,10 +47,25 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 		pu = pud_offset(pg, addr);
 		if (!pud_none(*pu)) {
 			pm = pmd_offset(pu, addr);
+#ifdef CONFIG_PPC_64K_PAGES
+			/* Currently, we use the normal PTE offset within full
+			 * size PTE pages, thus our huge PTEs are scattered in
+			 * the PTE page and we do waste some. We may change
+			 * that in the future, but the current mecanism keeps
+			 * things much simpler
+			 */
+			if (!pmd_none(*pm)) {
+				/* Note: pte_offset_* are all equivalent on
+				 * ppc64 as we don't have HIGHMEM
+				 */
+				pt = pte_offset_kernel(pm, addr);
+				return pt;
+			}
+#else /* CONFIG_PPC_64K_PAGES */
+			/* On 4k pages, we put huge PTEs in the PMD page */
 			pt = (pte_t *)pm;
-			BUG_ON(!pmd_none(*pm)
-			       && !(pte_present(*pt) && pte_huge(*pt)));
 			return pt;
+#endif /* CONFIG_PPC_64K_PAGES */
 		}
 	}
 
@@ -74,9 +89,16 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	if (pu) {
 		pm = pmd_alloc(mm, pu, addr);
 		if (pm) {
+#ifdef CONFIG_PPC_64K_PAGES
+			/* See comment in huge_pte_offset. Note that if we ever
+			 * want to put the page size in the PMD, we would have
+			 * to open code our own pte_alloc* function in order
+			 * to populate and set the size atomically
+			 */
+			pt = pte_alloc_map(mm, pm, addr);
+#else /* CONFIG_PPC_64K_PAGES */
 			pt = (pte_t *)pm;
-			BUG_ON(!pmd_none(*pm)
-			       && !(pte_present(*pt) && pte_huge(*pt)));
+#endif /* CONFIG_PPC_64K_PAGES */
 			return pt;
 		}
 	}
@@ -84,35 +106,29 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	return NULL;
 }
 
-#define HUGEPTE_BATCH_SIZE	(HPAGE_SIZE / PMD_SIZE)
-
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t pte)
 {
-	int i;
-
 	if (pte_present(*ptep)) {
-		pte_clear(mm, addr, ptep);
+		/* We open-code pte_clear because we need to pass the right
+		 * argument to hpte_update (huge / !huge)
+		 */
+		unsigned long old = pte_update(ptep, ~0UL);
+		if (old & _PAGE_HASHPTE)
+			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
 		flush_tlb_pending();
 	}
-
-	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
-		*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-		ptep++;
-	}
+	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 }
 
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep)
 {
 	unsigned long old = pte_update(ptep, ~0UL);
-	int i;
 
 	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr, old, 0);
-
-	for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
-		ptep[i] = __pte(0);
+		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
+	*ptep = __pte(0);
 
 	return __pte(old);
 }
@@ -132,43 +148,63 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
+struct slb_flush_info {
+	struct mm_struct *mm;
+	u16 newareas;
+};
+
 static void flush_low_segments(void *parm)
 {
-	u16 areas = (unsigned long) parm;
+	struct slb_flush_info *fi = parm;
 	unsigned long i;
 
-	asm volatile("isync" : : : "memory");
+	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);
+
+	if (current->active_mm != fi->mm)
+		return;
 
-	BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+	/* Only need to do anything if this CPU is working in the same
+	 * mm as the one which has changed */
 
+	/* update the paca copy of the context struct */
+	get_paca()->context = current->active_mm->context;
+
+	asm volatile("isync" : : : "memory");
 	for (i = 0; i < NUM_LOW_AREAS; i++) {
-		if (! (areas & (1U << i)))
+		if (! (fi->newareas & (1U << i)))
 			continue;
 		asm volatile("slbie %0"
 			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
 	}
-
 	asm volatile("isync" : : : "memory");
 }
 
 static void flush_high_segments(void *parm)
 {
-	u16 areas = (unsigned long) parm;
+	struct slb_flush_info *fi = parm;
 	unsigned long i, j;
 
-	asm volatile("isync" : : : "memory");
 
-	BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);
+
+	if (current->active_mm != fi->mm)
+		return;
+
+	/* Only need to do anything if this CPU is working in the same
+	 * mm as the one which has changed */
 
+	/* update the paca copy of the context struct */
+	get_paca()->context = current->active_mm->context;
+
+	asm volatile("isync" : : : "memory");
 	for (i = 0; i < NUM_HIGH_AREAS; i++) {
-		if (! (areas & (1U << i)))
+		if (! (fi->newareas & (1U << i)))
 			continue;
 		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
 			asm volatile("slbie %0"
 				     :: "r" (((i << HTLB_AREA_SHIFT)
-					     + (j << SID_SHIFT)) | SLBIE_C));
+					      + (j << SID_SHIFT)) | SLBIE_C));
 	}
-
 	asm volatile("isync" : : : "memory");
 }
 
@@ -196,6 +232,12 @@ static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
 
 	BUG_ON(area >= NUM_HIGH_AREAS);
 
+	/* Hack, so that each addresses is controlled by exactly one
+	 * of the high or low area bitmaps, the first high area starts
+	 * at 4GB, not 0 */
+	if (start == 0)
+		start = 0x100000000UL;
+
 	/* Check no VMAs are in the region */
 	vma = find_vma(mm, start);
 	if (vma && (vma->vm_start < end))
@@ -207,6 +249,7 @@ static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
 static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
 	unsigned long i;
+	struct slb_flush_info fi;
 
 	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
 	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
@@ -222,19 +265,20 @@ static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 
 	mm->context.low_htlb_areas |= newareas;
 
-	/* update the paca copy of the context struct */
-	get_paca()->context = mm->context;
-
 	/* the context change must make it to memory before the flush,
 	 * so that further SLB misses do the right thing. */
 	mb();
-	on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+	fi.mm = mm;
+	fi.newareas = newareas;
+	on_each_cpu(flush_low_segments, &fi, 0, 1);
 
 	return 0;
 }
 
 static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
+	struct slb_flush_info fi;
 	unsigned long i;
 
 	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
@@ -258,22 +302,25 @@ static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
 	/* the context change must make it to memory before the flush,
 	 * so that further SLB misses do the right thing. */
 	mb();
-	on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
+
+	fi.mm = mm;
+	fi.newareas = newareas;
+	on_each_cpu(flush_high_segments, &fi, 0, 1);
 
 	return 0;
 }
 
 int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
-	int err;
+	int err = 0;
 
 	if ( (addr+len) < addr )
 		return -EINVAL;
 
-	if ((addr + len) < 0x100000000UL)
+	if (addr < 0x100000000UL)
 		err = open_low_hpage_areas(current->mm,
 					  LOW_ESID_MASK(addr, len));
-	else
+	if ((addr + len) > 0x100000000UL)
 		err = open_high_hpage_areas(current->mm,
 					    HTLB_AREA_MASK(addr, len));
 	if (err) {
@@ -563,6 +610,8 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	int lastshift;
 	u16 areamask, curareas;
 
+	if (HPAGE_SHIFT == 0)
+		return -EINVAL;
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 
@@ -615,23 +664,47 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	return -ENOMEM;
 }
 
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
+						  pte_t pte, int trap)
+{
+	struct page *page;
+	int i;
+
+	if (!pfn_valid(pte_pfn(pte)))
+		return rflags;
+
+	page = pte_page(pte);
+
+	/* page is dirty */
+	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+		if (trap == 0x400) {
+			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
+				__flush_dcache_icache(page_address(page+i));
+			set_bit(PG_arch_1, &page->flags);
+		} else {
+			rflags |= HPTE_R_N;
+		}
+	}
+	return rflags;
+}
+
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
-		   unsigned long ea, unsigned long vsid, int local)
+		   unsigned long ea, unsigned long vsid, int local,
+		   unsigned long trap)
 {
 	pte_t *ptep;
-	unsigned long va, vpn;
-	pte_t old_pte, new_pte;
-	unsigned long rflags, prpn;
+	unsigned long old_pte, new_pte;
+	unsigned long va, rflags, pa;
 	long slot;
 	int err = 1;
 
-	spin_lock(&mm->page_table_lock);
-
 	ptep = huge_pte_offset(mm, ea);
 
 	/* Search the Linux page table for a match with va */
 	va = (vsid << 28) | (ea & 0x0fffffff);
-	vpn = va >> HPAGE_SHIFT;
 
 	/*
 	 * If no pte found or not present, send the problem up to
@@ -640,8 +713,6 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 	if (unlikely(!ptep || pte_none(*ptep)))
 		goto out;
 
-/* 	BUG_ON(pte_bad(*ptep)); */
-
 	/* 
 	 * Check the user's access rights to the page.  If access should be
 	 * prevented then send the problem up to do_page_fault.
@@ -661,58 +732,70 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 	 */
 
 
-	old_pte = *ptep;
-	new_pte = old_pte;
-
-	rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
+	do {
+		old_pte = pte_val(*ptep);
+		if (old_pte & _PAGE_BUSY)
+			goto out;
+		new_pte = old_pte | _PAGE_BUSY |
+			_PAGE_ACCESSED | _PAGE_HASHPTE;
+	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
+					 old_pte, new_pte));
+
+	rflags = 0x2 | (!(new_pte & _PAGE_RW));
  	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-	rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
+	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		/* No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case */
+		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
+						       trap);
 
 	/* Check if pte already has an hpte (case 2) */
-	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
+	if (unlikely(old_pte & _PAGE_HASHPTE)) {
 		/* There MIGHT be an HPTE for this pte */
 		unsigned long hash, slot;
 
-		hash = hpt_hash(vpn, 1);
-		if (pte_val(old_pte) & _PAGE_SECONDARY)
+		hash = hpt_hash(va, HPAGE_SHIFT);
+		if (old_pte & _PAGE_F_SECOND)
 			hash = ~hash;
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
+		slot += (old_pte & _PAGE_F_GIX) >> 12;
 
-		if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
-			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
+		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
+					 local) == -1)
+			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
-	if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
-		unsigned long hash = hpt_hash(vpn, 1);
+	if (likely(!(old_pte & _PAGE_HASHPTE))) {
+		unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
 		unsigned long hpte_group;
 
-		prpn = pte_pfn(old_pte);
+		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 
 repeat:
 		hpte_group = ((hash & htab_hash_mask) *
 			      HPTES_PER_GROUP) & ~0x7UL;
 
-		/* Update the linux pte with the HPTE slot */
-		pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
-		pte_val(new_pte) |= _PAGE_HASHPTE;
+		/* clear HPTE slot informations in new PTE */
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 
 		/* Add in WIMG bits */
 		/* XXX We should store these in the pte */
+		/* --BenH: I think they are ... */
 		rflags |= _PAGE_COHERENT;
 
-		slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-					  HPTE_V_LARGE, rflags);
+		/* Insert into the hash table, primary slot */
+		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
+					  mmu_huge_psize);
 
 		/* Primary is full, try the secondary */
 		if (unlikely(slot == -1)) {
-			pte_val(new_pte) |= _PAGE_SECONDARY;
+			new_pte |= _PAGE_F_SECOND;
 			hpte_group = ((~hash & htab_hash_mask) *
 				      HPTES_PER_GROUP) & ~0x7UL; 
-			slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-						  HPTE_V_LARGE |
+			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
 						  HPTE_V_SECONDARY,
-						  rflags);
+						  mmu_huge_psize);
 			if (slot == -1) {
 				if (mftb() & 0x1)
 					hpte_group = ((hash & htab_hash_mask) *
@@ -726,20 +809,16 @@ repeat:
 		if (unlikely(slot == -2))
 			panic("hash_huge_page: pte_insert failed\n");
 
-		pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
-
-		/* 
-		 * No need to use ldarx/stdcx here because all who
-		 * might be updating the pte will hold the
-		 * page_table_lock
-		 */
-		*ptep = new_pte;
+		new_pte |= (slot << 12) & _PAGE_F_GIX;
 	}
 
+	/*
+	 * No need to use ldarx/stdcx here
+	 */
+	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+
 	err = 0;
 
  out:
-	spin_unlock(&mm->page_table_lock);
-
 	return err;
 }