Merge tag 'drm-next-2019-01-05' of git://anongit.freedesktop.org/drm/drm

[linux] / mm / huge_memory.c
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 55478ab..faf357e 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -62,6 +62,16 @@ static struct shrinker deferred_split_shrinker;
  static atomic_t huge_zero_refcount;
  struct page *huge_zero_page __read_mostly;
  
+bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+       if (vma_is_anonymous(vma))
+               return __transparent_hugepage_enabled(vma);
+       if (vma_is_shmem(vma) && shmem_huge_enabled(vma))
+               return __transparent_hugepage_enabled(vma);
+
+       return false;
+}
+
  static struct page *get_huge_zero_page(void)
  {
         struct page *zero_page;
@@ -420,7 +430,7 @@ static int __init hugepage_init(void)
          * where the extra memory used could hurt more than TLB overhead
          * is likely to save.  The admin can still enable it through /sys.
          */
-       if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
+       if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
                 transparent_hugepage_flags = 0;
                 return 0;
         }
@@ -558,7 +568,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                 return VM_FAULT_FALLBACK;
         }
  
-       pgtable = pte_alloc_one(vma->vm_mm, haddr);
+       pgtable = pte_alloc_one(vma->vm_mm);
         if (unlikely(!pgtable)) {
                 ret = VM_FAULT_OOM;
                 goto release;
@@ -629,40 +639,30 @@ release:
   *         available
   * never: never stall for any thp allocation
   */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
  {
         const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
-       gfp_t this_node = 0;
-
-#ifdef CONFIG_NUMA
-       struct mempolicy *pol;
-       /*
-        * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
-        * specified, to express a general desire to stay on the current
-        * node for optimistic allocation attempts. If the defrag mode
-        * and/or madvise hint requires the direct reclaim then we prefer
-        * to fallback to other node rather than node reclaim because that
-        * can lead to excessive reclaim even though there is free memory
-        * on other nodes. We expect that NUMA preferences are specified
-        * by memory policies.
-        */
-       pol = get_vma_policy(vma, addr);
-       if (pol->mode != MPOL_BIND)
-               this_node = __GFP_THISNODE;
-       mpol_cond_put(pol);
-#endif
  
+       /* Always do synchronous compaction */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+       /* Kick kcompactd and fail quickly */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
+               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+       /* Synchronous compaction if madvised, otherwise kick kcompactd */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            __GFP_KSWAPD_RECLAIM | this_node);
+               return GFP_TRANSHUGE_LIGHT |
+                       (vma_madvised ? __GFP_DIRECT_RECLAIM :
+                                       __GFP_KSWAPD_RECLAIM);
+
+       /* Only do synchronous compaction if madvised */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            this_node);
-       return GFP_TRANSHUGE_LIGHT | this_node;
+               return GFP_TRANSHUGE_LIGHT |
+                      (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+       return GFP_TRANSHUGE_LIGHT;
  }
  
  /* Caller must hold page table lock. */
@@ -702,7 +702,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                 struct page *zero_page;
                 bool set;
                 vm_fault_t ret;
-               pgtable = pte_alloc_one(vma->vm_mm, haddr);
+               pgtable = pte_alloc_one(vma->vm_mm);
                 if (unlikely(!pgtable))
                         return VM_FAULT_OOM;
                 zero_page = mm_get_huge_zero_page(vma->vm_mm);
@@ -734,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                         pte_free(vma->vm_mm, pgtable);
                 return ret;
         }
-       gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-       page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+       gfp = alloc_hugepage_direct_gfpmask(vma);
+       page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
         if (unlikely(!page)) {
                 count_vm_event(THP_FAULT_FALLBACK);
                 return VM_FAULT_FALLBACK;
@@ -791,7 +791,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                 return VM_FAULT_SIGBUS;
  
         if (arch_needs_pgtable_deposit()) {
-               pgtable = pte_alloc_one(vma->vm_mm, addr);
+               pgtable = pte_alloc_one(vma->vm_mm);
                 if (!pgtable)
                         return VM_FAULT_OOM;
         }
@@ -927,7 +927,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         if (!vma_is_anonymous(vma))
                 return 0;
  
-       pgtable = pte_alloc_one(dst_mm, addr);
+       pgtable = pte_alloc_one(dst_mm);
         if (unlikely(!pgtable))
                 goto out;
  
@@ -1144,8 +1144,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
         int i;
         vm_fault_t ret = 0;
         struct page **pages;
-       unsigned long mmun_start;       /* For mmu_notifiers */
-       unsigned long mmun_end;         /* For mmu_notifiers */
+       struct mmu_notifier_range range;
  
         pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
                               GFP_KERNEL);
@@ -1183,9 +1182,9 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
                 cond_resched();
         }
  
-       mmun_start = haddr;
-       mmun_end   = haddr + HPAGE_PMD_SIZE;
-       mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+       mmu_notifier_range_init(&range, vma->vm_mm, haddr,
+                               haddr + HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
  
         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1230,8 +1229,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
          * No need to double call mmu_notifier->invalidate_range() callback as
          * the above pmdp_huge_clear_flush_notify() did already call it.
          */
-       mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
-                                               mmun_end);
+       mmu_notifier_invalidate_range_only_end(&range);
  
         ret |= VM_FAULT_WRITE;
         put_page(page);
@@ -1241,7 +1239,7 @@ out:
  
  out_free_pages:
         spin_unlock(vmf->ptl);
-       mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+       mmu_notifier_invalidate_range_end(&range);
         for (i = 0; i < HPAGE_PMD_NR; i++) {
                 memcg = (void *)page_private(pages[i]);
                 set_page_private(pages[i], 0);
@@ -1258,8 +1256,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
         struct page *page = NULL, *new_page;
         struct mem_cgroup *memcg;
         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-       unsigned long mmun_start;       /* For mmu_notifiers */
-       unsigned long mmun_end;         /* For mmu_notifiers */
+       struct mmu_notifier_range range;
         gfp_t huge_gfp;                 /* for allocation and charge */
         vm_fault_t ret = 0;
  
@@ -1303,11 +1300,10 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
         get_page(page);
         spin_unlock(vmf->ptl);
  alloc:
-       if (transparent_hugepage_enabled(vma) &&
+       if (__transparent_hugepage_enabled(vma) &&
             !transparent_hugepage_debug_cow()) {
-               huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-               new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
-                               haddr, numa_node_id());
+               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
         } else
                 new_page = NULL;
  
@@ -1349,9 +1345,9 @@ alloc:
                                     vma, HPAGE_PMD_NR);
         __SetPageUptodate(new_page);
  
-       mmun_start = haddr;
-       mmun_end   = haddr + HPAGE_PMD_SIZE;
-       mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+       mmu_notifier_range_init(&range, vma->vm_mm, haddr,
+                               haddr + HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
  
         spin_lock(vmf->ptl);
         if (page)
@@ -1386,8 +1382,7 @@ out_mn:
          * No need to double call mmu_notifier->invalidate_range() callback as
          * the above pmdp_huge_clear_flush_notify() did already call it.
          */
-       mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
-                                              mmun_end);
+       mmu_notifier_invalidate_range_only_end(&range);
  out:
         return ret;
  out_unlock:
@@ -1501,8 +1496,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
                 if (!get_page_unless_zero(page))
                         goto out_unlock;
                 spin_unlock(vmf->ptl);
-               wait_on_page_locked(page);
-               put_page(page);
+               put_and_wait_on_page_locked(page);
                 goto out;
         }
  
@@ -1538,8 +1532,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
                 if (!get_page_unless_zero(page))
                         goto out_unlock;
                 spin_unlock(vmf->ptl);
-               wait_on_page_locked(page);
-               put_page(page);
+               put_and_wait_on_page_locked(page);
                 goto out;
         }
  
@@ -2028,14 +2021,15 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
                 unsigned long address)
  {
         spinlock_t *ptl;
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long haddr = address & HPAGE_PUD_MASK;
+       struct mmu_notifier_range range;
  
-       mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
-       ptl = pud_lock(mm, pud);
+       mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK,
+                               (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
+       ptl = pud_lock(vma->vm_mm, pud);
         if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
                 goto out;
-       __split_huge_pud_locked(vma, pud, haddr);
+       __split_huge_pud_locked(vma, pud, range.start);
  
  out:
         spin_unlock(ptl);
@@ -2043,8 +2037,7 @@ out:
          * No need to double call mmu_notifier->invalidate_range() callback as
          * the above pudp_huge_clear_flush_notify() did already call it.
          */
-       mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
-                                              HPAGE_PUD_SIZE);
+       mmu_notifier_invalidate_range_only_end(&range);
  }
  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  
@@ -2155,23 +2148,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
          */
         old_pmd = pmdp_invalidate(vma, haddr, pmd);
  
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
         pmd_migration = is_pmd_migration_entry(old_pmd);
-       if (pmd_migration) {
+       if (unlikely(pmd_migration)) {
                 swp_entry_t entry;
  
                 entry = pmd_to_swp_entry(old_pmd);
                 page = pfn_to_page(swp_offset(entry));
-       } else
-#endif
+               write = is_write_migration_entry(entry);
+               young = false;
+               soft_dirty = pmd_swp_soft_dirty(old_pmd);
+       } else {
                 page = pmd_page(old_pmd);
+               if (pmd_dirty(old_pmd))
+                       SetPageDirty(page);
+               write = pmd_write(old_pmd);
+               young = pmd_young(old_pmd);
+               soft_dirty = pmd_soft_dirty(old_pmd);
+       }
         VM_BUG_ON_PAGE(!page_count(page), page);
         page_ref_add(page, HPAGE_PMD_NR - 1);
-       if (pmd_dirty(old_pmd))
-               SetPageDirty(page);
-       write = pmd_write(old_pmd);
-       young = pmd_young(old_pmd);
-       soft_dirty = pmd_soft_dirty(old_pmd);
  
         /*
          * Withdraw the table only after we mark the pmd entry invalid.
@@ -2244,11 +2239,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                 unsigned long address, bool freeze, struct page *page)
  {
         spinlock_t *ptl;
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long haddr = address & HPAGE_PMD_MASK;
+       struct mmu_notifier_range range;
  
-       mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
-       ptl = pmd_lock(mm, pmd);
+       mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK,
+                               (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
+       ptl = pmd_lock(vma->vm_mm, pmd);
  
         /*
          * If caller asks to setup a migration entries, we need a page to check
@@ -2264,7 +2260,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                         clear_page_mlock(page);
         } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
                 goto out;
-       __split_huge_pmd_locked(vma, pmd, haddr, freeze);
+       __split_huge_pmd_locked(vma, pmd, range.start, freeze);
  out:
         spin_unlock(ptl);
         /*
@@ -2280,8 +2276,7 @@ out:
          *     any further changes to individual pte will notify. So no need
          *     to call mmu_notifier->invalidate_range()
          */
-       mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
-                                              HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_only_end(&range);
  }
  
  void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
@@ -2350,7 +2345,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
         }
  }
  
-static void freeze_page(struct page *page)
+static void unmap_page(struct page *page)
  {
         enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
                 TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
@@ -2365,7 +2360,7 @@ static void freeze_page(struct page *page)
         VM_BUG_ON_PAGE(!unmap_success, page);
  }
  
-static void unfreeze_page(struct page *page)
+static void remap_page(struct page *page)
  {
         int i;
         if (PageTransHuge(page)) {
@@ -2402,6 +2397,12 @@ static void __split_huge_page_tail(struct page *head, int tail,
                          (1L << PG_unevictable) |
                          (1L << PG_dirty)));
  
+       /* ->mapping in first tail page is compound_mapcount */
+       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+                       page_tail);
+       page_tail->mapping = head->mapping;
+       page_tail->index = head->index + tail;
+
         /* Page flags must be visible before we make the page non-compound. */
         smp_wmb();
  
@@ -2422,12 +2423,6 @@ static void __split_huge_page_tail(struct page *head, int tail,
         if (page_is_idle(head))
                 set_page_idle(page_tail);
  
-       /* ->mapping in first tail page is compound_mapcount */
-       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
-                       page_tail);
-       page_tail->mapping = head->mapping;
-
-       page_tail->index = head->index + tail;
         page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
  
         /*
@@ -2439,12 +2434,11 @@ static void __split_huge_page_tail(struct page *head, int tail,
  }
  
  static void __split_huge_page(struct page *page, struct list_head *list,
-               unsigned long flags)
+               pgoff_t end, unsigned long flags)
  {
         struct page *head = compound_head(page);
         struct zone *zone = page_zone(head);
         struct lruvec *lruvec;
-       pgoff_t end = -1;
         int i;
  
         lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
@@ -2452,9 +2446,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
         /* complete memcg works before add pages to LRU */
         mem_cgroup_split_huge_fixup(head);
  
-       if (!PageAnon(page))
-               end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
-
         for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
                 __split_huge_page_tail(head, i, lruvec, list);
                 /* Some pages can be beyond i_size: drop them from page cache */
@@ -2483,7 +2474,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
  
         spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
  
-       unfreeze_page(head);
+       remap_page(head);
  
         for (i = 0; i < HPAGE_PMD_NR; i++) {
                 struct page *subpage = head + i;
@@ -2626,6 +2617,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         int count, mapcount, extra_pins, ret;
         bool mlocked;
         unsigned long flags;
+       pgoff_t end;
  
         VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
         VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -2648,6 +2640,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                         ret = -EBUSY;
                         goto out;
                 }
+               end = -1;
                 mapping = NULL;
                 anon_vma_lock_write(anon_vma);
         } else {
@@ -2661,10 +2654,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
  
                 anon_vma = NULL;
                 i_mmap_lock_read(mapping);
+
+               /*
+                * __split_huge_page() may need to trim off pages beyond EOF:
+                * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
+                * which cannot be nested inside the page tree lock. So note
+                * end now: i_size itself may be changed at any moment, but
+                * head page lock is good enough to serialize the trimming.
+                */
+               end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
         }
  
         /*
-        * Racy check if we can split the page, before freeze_page() will
+        * Racy check if we can split the page, before unmap_page() will
          * split PMDs
          */
         if (!can_split_huge_page(head, &extra_pins)) {
@@ -2673,7 +2675,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         }
  
         mlocked = PageMlocked(page);
-       freeze_page(head);
+       unmap_page(head);
         VM_BUG_ON_PAGE(compound_mapcount(head), head);
  
         /* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -2707,7 +2709,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                 if (mapping)
                         __dec_node_page_state(page, NR_SHMEM_THPS);
                 spin_unlock(&pgdata->split_queue_lock);
-               __split_huge_page(page, list, flags);
+               __split_huge_page(page, list, end, flags);
                 if (PageSwapCache(head)) {
                         swp_entry_t entry = { .val = page_private(head) };
  
@@ -2727,7 +2729,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
  fail:          if (mapping)
                         xa_unlock(&mapping->i_pages);
                 spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
-               unfreeze_page(head);
+               remap_page(head);
                 ret = -EBUSY;
         }