Merge tag 'drm-next-2019-01-05' of git://anongit.freedesktop.org/drm/drm
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 55478ab..faf357e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -62,6 +62,16 @@ static struct shrinker deferred_split_shrinker;
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
+bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+       if (vma_is_anonymous(vma))
+               return __transparent_hugepage_enabled(vma);
+       if (vma_is_shmem(vma) && shmem_huge_enabled(vma))
+               return __transparent_hugepage_enabled(vma);
+
+       return false;
+}
+
 static struct page *get_huge_zero_page(void)
 {
        struct page *zero_page;
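The new helper gives code outside huge_memory.c a single predicate that covers both anonymous and shmem VMAs, while internal fast paths keep the double-underscore variant __transparent_hugepage_enabled() (see the do_huge_pmd_wp_page() hunk further down). A minimal, hedged sketch of a consumer; show_vma_thp_status() is hypothetical, something along the lines of a per-VMA "THPeligible" report:

/* Hedged sketch only: the reporting function below is hypothetical. */
static void show_vma_thp_status(struct seq_file *m, struct vm_area_struct *vma)
{
	/* true for anonymous VMAs and for shmem VMAs with huge pages enabled */
	seq_printf(m, "THPeligible:\t%d\n", transparent_hugepage_enabled(vma));
}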
@@ -420,7 +430,7 @@ static int __init hugepage_init(void)
         * where the extra memory used could hurt more than TLB overhead
         * is likely to save.  The admin can still enable it through /sys.
         */
-       if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
+       if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
                transparent_hugepage_flags = 0;
                return 0;
        }
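The conversion to the totalram_pages() accessor does not change the threshold itself: 512 << (20 - PAGE_SHIFT) is just 512 MiB expressed in pages. A stand-alone sketch of the arithmetic, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumption: 4 KiB pages */

int main(void)
{
	unsigned long threshold = 512UL << (20 - PAGE_SHIFT);

	/* 512 << 8 == 131072 pages == 512 MiB */
	printf("%lu pages (%lu MiB)\n", threshold,
	       (threshold << PAGE_SHIFT) >> 20);
	return 0;
}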
@@ -558,7 +568,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                return VM_FAULT_FALLBACK;
        }
 
-       pgtable = pte_alloc_one(vma->vm_mm, haddr);
+       pgtable = pte_alloc_one(vma->vm_mm);
        if (unlikely(!pgtable)) {
                ret = VM_FAULT_OOM;
                goto release;
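This is part of a treewide change that drops the unused address argument from pte_alloc_one(); callers now pass only the mm, as in this and the later hunks. Roughly, the prototype goes from the first form to the second (per-architecture details vary):

/* before */ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address);
/* after  */ pgtable_t pte_alloc_one(struct mm_struct *mm);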
@@ -629,40 +639,30 @@ release:
  *         available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
        const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
-       gfp_t this_node = 0;
-
-#ifdef CONFIG_NUMA
-       struct mempolicy *pol;
-       /*
-        * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
-        * specified, to express a general desire to stay on the current
-        * node for optimistic allocation attempts. If the defrag mode
-        * and/or madvise hint requires the direct reclaim then we prefer
-        * to fallback to other node rather than node reclaim because that
-        * can lead to excessive reclaim even though there is free memory
-        * on other nodes. We expect that NUMA preferences are specified
-        * by memory policies.
-        */
-       pol = get_vma_policy(vma, addr);
-       if (pol->mode != MPOL_BIND)
-               this_node = __GFP_THISNODE;
-       mpol_cond_put(pol);
-#endif
 
+       /* Always do synchronous compaction */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+       /* Kick kcompactd and fail quickly */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
+               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+       /* Synchronous compaction if madvised, otherwise kick kcompactd */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            __GFP_KSWAPD_RECLAIM | this_node);
+               return GFP_TRANSHUGE_LIGHT |
+                       (vma_madvised ? __GFP_DIRECT_RECLAIM :
+                                       __GFP_KSWAPD_RECLAIM);
+
+       /* Only do synchronous compaction if madvised */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            this_node);
-       return GFP_TRANSHUGE_LIGHT | this_node;
+               return GFP_TRANSHUGE_LIGHT |
+                      (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+       return GFP_TRANSHUGE_LIGHT;
 }
 
 /* Caller must hold page table lock. */
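Taken together, the branches above reduce to the following gfp choices, where "madvised" means VM_HUGEPAGE is set on the VMA (a summary derived from the code above, not an extra hunk):

/*
 * always:         GFP_TRANSHUGE                                (madvised)
 *                 GFP_TRANSHUGE | __GFP_NORETRY                (not madvised)
 * defer:          GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM
 * defer+madvise:  GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM   (madvised)
 *                 GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM   (not madvised)
 * madvise:        GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM   (madvised)
 *                 GFP_TRANSHUGE_LIGHT                          (not madvised)
 * never:          GFP_TRANSHUGE_LIGHT
 */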
@@ -702,7 +702,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                struct page *zero_page;
                bool set;
                vm_fault_t ret;
-               pgtable = pte_alloc_one(vma->vm_mm, haddr);
+               pgtable = pte_alloc_one(vma->vm_mm);
                if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
                zero_page = mm_get_huge_zero_page(vma->vm_mm);
@@ -734,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                        pte_free(vma->vm_mm, pgtable);
                return ret;
        }
-       gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-       page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+       gfp = alloc_hugepage_direct_gfpmask(vma);
+       page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
        if (unlikely(!page)) {
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
@@ -791,7 +791,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                return VM_FAULT_SIGBUS;
 
        if (arch_needs_pgtable_deposit()) {
-               pgtable = pte_alloc_one(vma->vm_mm, addr);
+               pgtable = pte_alloc_one(vma->vm_mm);
                if (!pgtable)
                        return VM_FAULT_OOM;
        }
@@ -927,7 +927,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        if (!vma_is_anonymous(vma))
                return 0;
 
-       pgtable = pte_alloc_one(dst_mm, addr);
+       pgtable = pte_alloc_one(dst_mm);
        if (unlikely(!pgtable))
                goto out;
 
@@ -1144,8 +1144,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
        int i;
        vm_fault_t ret = 0;
        struct page **pages;
-       unsigned long mmun_start;       /* For mmu_notifiers */
-       unsigned long mmun_end;         /* For mmu_notifiers */
+       struct mmu_notifier_range range;
 
        pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
                              GFP_KERNEL);
@@ -1183,9 +1182,9 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
                cond_resched();
        }
 
-       mmun_start = haddr;
-       mmun_end   = haddr + HPAGE_PMD_SIZE;
-       mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+       mmu_notifier_range_init(&range, vma->vm_mm, haddr,
+                               haddr + HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
 
        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1230,8 +1229,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
         * No need to double call mmu_notifier->invalidate_range() callback as
         * the above pmdp_huge_clear_flush_notify() did already call it.
         */
-       mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
-                                               mmun_end);
+       mmu_notifier_invalidate_range_only_end(&range);
 
        ret |= VM_FAULT_WRITE;
        put_page(page);
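The same mmu_notifier conversion recurs through the rest of this file: the (mm, start, end) triplet becomes a struct mmu_notifier_range that is initialized once and then passed by pointer. The general shape, sketched with the signatures used in this patch:

struct mmu_notifier_range range;

mmu_notifier_range_init(&range, vma->vm_mm, haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
/* ... pmdp_huge_clear_flush_notify() already calls ->invalidate_range() ... */
mmu_notifier_invalidate_range_only_end(&range);	/* so skip the second callback */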
@@ -1241,7 +1239,7 @@ out:
 
 out_free_pages:
        spin_unlock(vmf->ptl);
-       mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+       mmu_notifier_invalidate_range_end(&range);
        for (i = 0; i < HPAGE_PMD_NR; i++) {
                memcg = (void *)page_private(pages[i]);
                set_page_private(pages[i], 0);
@@ -1258,8 +1256,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
        struct page *page = NULL, *new_page;
        struct mem_cgroup *memcg;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-       unsigned long mmun_start;       /* For mmu_notifiers */
-       unsigned long mmun_end;         /* For mmu_notifiers */
+       struct mmu_notifier_range range;
        gfp_t huge_gfp;                 /* for allocation and charge */
        vm_fault_t ret = 0;
 
@@ -1303,11 +1300,10 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
        get_page(page);
        spin_unlock(vmf->ptl);
 alloc:
-       if (transparent_hugepage_enabled(vma) &&
+       if (__transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow()) {
-               huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-               new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
-                               haddr, numa_node_id());
+               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
        } else
                new_page = NULL;
 
@@ -1349,9 +1345,9 @@ alloc:
                                    vma, HPAGE_PMD_NR);
        __SetPageUptodate(new_page);
 
-       mmun_start = haddr;
-       mmun_end   = haddr + HPAGE_PMD_SIZE;
-       mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+       mmu_notifier_range_init(&range, vma->vm_mm, haddr,
+                               haddr + HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
 
        spin_lock(vmf->ptl);
        if (page)
@@ -1386,8 +1382,7 @@ out_mn:
         * No need to double call mmu_notifier->invalidate_range() callback as
         * the above pmdp_huge_clear_flush_notify() did already call it.
         */
-       mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
-                                              mmun_end);
+       mmu_notifier_invalidate_range_only_end(&range);
 out:
        return ret;
 out_unlock:
@@ -1501,8 +1496,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
                if (!get_page_unless_zero(page))
                        goto out_unlock;
                spin_unlock(vmf->ptl);
-               wait_on_page_locked(page);
-               put_page(page);
+               put_and_wait_on_page_locked(page);
                goto out;
        }
 
@@ -1538,8 +1532,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
                if (!get_page_unless_zero(page))
                        goto out_unlock;
                spin_unlock(vmf->ptl);
-               wait_on_page_locked(page);
-               put_page(page);
+               put_and_wait_on_page_locked(page);
                goto out;
        }
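In both NUMA-fault call sites above, put_and_wait_on_page_locked() drops the caller's page reference before sleeping on the page lock, so the waiter no longer holds an extra pin that could keep migration from completing; the old wait_on_page_locked() + put_page() pair only released the pin after waking up. In outline:

	/* old: pin held across the sleep */
	wait_on_page_locked(page);
	put_page(page);

	/* new: reference dropped before sleeping */
	put_and_wait_on_page_locked(page);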
 
@@ -2028,14 +2021,15 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
                unsigned long address)
 {
        spinlock_t *ptl;
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long haddr = address & HPAGE_PUD_MASK;
+       struct mmu_notifier_range range;
 
-       mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
-       ptl = pud_lock(mm, pud);
+       mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK,
+                               (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
+       ptl = pud_lock(vma->vm_mm, pud);
        if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
                goto out;
-       __split_huge_pud_locked(vma, pud, haddr);
+       __split_huge_pud_locked(vma, pud, range.start);
 
 out:
        spin_unlock(ptl);
@@ -2043,8 +2037,7 @@ out:
         * No need to double call mmu_notifier->invalidate_range() callback as
         * the above pudp_huge_clear_flush_notify() did already call it.
         */
-       mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
-                                              HPAGE_PUD_SIZE);
+       mmu_notifier_invalidate_range_only_end(&range);
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
@@ -2155,23 +2148,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
         */
        old_pmd = pmdp_invalidate(vma, haddr, pmd);
 
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        pmd_migration = is_pmd_migration_entry(old_pmd);
-       if (pmd_migration) {
+       if (unlikely(pmd_migration)) {
                swp_entry_t entry;
 
                entry = pmd_to_swp_entry(old_pmd);
                page = pfn_to_page(swp_offset(entry));
-       } else
-#endif
+               write = is_write_migration_entry(entry);
+               young = false;
+               soft_dirty = pmd_swp_soft_dirty(old_pmd);
+       } else {
                page = pmd_page(old_pmd);
+               if (pmd_dirty(old_pmd))
+                       SetPageDirty(page);
+               write = pmd_write(old_pmd);
+               young = pmd_young(old_pmd);
+               soft_dirty = pmd_soft_dirty(old_pmd);
+       }
        VM_BUG_ON_PAGE(!page_count(page), page);
        page_ref_add(page, HPAGE_PMD_NR - 1);
-       if (pmd_dirty(old_pmd))
-               SetPageDirty(page);
-       write = pmd_write(old_pmd);
-       young = pmd_young(old_pmd);
-       soft_dirty = pmd_soft_dirty(old_pmd);
 
        /*
         * Withdraw the table only after we mark the pmd entry invalid.
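The CONFIG_ARCH_ENABLE_THP_MIGRATION #ifdef can go because is_pmd_migration_entry() already compiles to a constant 0 when that option is off, so the migration branch above becomes compile-time dead code. The stub in swapops.h is roughly:

static inline int is_pmd_migration_entry(pmd_t pmd)
{
	return 0;
}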
@@ -2244,11 +2239,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct page *page)
 {
        spinlock_t *ptl;
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long haddr = address & HPAGE_PMD_MASK;
+       struct mmu_notifier_range range;
 
-       mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
-       ptl = pmd_lock(mm, pmd);
+       mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK,
+                               (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
+       ptl = pmd_lock(vma->vm_mm, pmd);
 
        /*
         * If caller asks to setup a migration entries, we need a page to check
@@ -2264,7 +2260,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        clear_page_mlock(page);
        } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
                goto out;
-       __split_huge_pmd_locked(vma, pmd, haddr, freeze);
+       __split_huge_pmd_locked(vma, pmd, range.start, freeze);
 out:
        spin_unlock(ptl);
        /*
@@ -2280,8 +2276,7 @@ out:
         *     any further changes to individual pte will notify. So no need
         *     to call mmu_notifier->invalidate_range()
         */
-       mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
-                                              HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_only_end(&range);
 }
 
 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
@@ -2350,7 +2345,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
        }
 }
 
-static void freeze_page(struct page *page)
+static void unmap_page(struct page *page)
 {
        enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
                TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
@@ -2365,7 +2360,7 @@ static void freeze_page(struct page *page)
        VM_BUG_ON_PAGE(!unmap_success, page);
 }
 
-static void unfreeze_page(struct page *page)
+static void remap_page(struct page *page)
 {
        int i;
        if (PageTransHuge(page)) {
@@ -2402,6 +2397,12 @@ static void __split_huge_page_tail(struct page *head, int tail,
                         (1L << PG_unevictable) |
                         (1L << PG_dirty)));
 
+       /* ->mapping in first tail page is compound_mapcount */
+       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+                       page_tail);
+       page_tail->mapping = head->mapping;
+       page_tail->index = head->index + tail;
+
        /* Page flags must be visible before we make the page non-compound. */
        smp_wmb();
 
@@ -2422,12 +2423,6 @@ static void __split_huge_page_tail(struct page *head, int tail,
        if (page_is_idle(head))
                set_page_idle(page_tail);
 
-       /* ->mapping in first tail page is compound_mapcount */
-       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
-                       page_tail);
-       page_tail->mapping = head->mapping;
-
-       page_tail->index = head->index + tail;
        page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
 
        /*
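The two hunks above move each tail page's ->mapping and ->index setup to before the smp_wmb(), so the fields are published before the page stops being compound and no lookup can observe a tail page whose mapping still holds TAIL_MAPPING. It is the usual init-then-publish ordering; in outline (drawn from the surrounding function):

	page_tail->mapping = head->mapping;
	page_tail->index = head->index + tail;
	/* make the writes visible before exposing the page as non-compound */
	smp_wmb();
	clear_compound_head(page_tail);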
@@ -2439,12 +2434,11 @@ static void __split_huge_page_tail(struct page *head, int tail,
 }
 
 static void __split_huge_page(struct page *page, struct list_head *list,
-               unsigned long flags)
+               pgoff_t end, unsigned long flags)
 {
        struct page *head = compound_head(page);
        struct zone *zone = page_zone(head);
        struct lruvec *lruvec;
-       pgoff_t end = -1;
        int i;
 
        lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
@@ -2452,9 +2446,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
        /* complete memcg works before add pages to LRU */
        mem_cgroup_split_huge_fixup(head);
 
-       if (!PageAnon(page))
-               end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
-
        for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
                __split_huge_page_tail(head, i, lruvec, list);
                /* Some pages can be beyond i_size: drop them from page cache */
@@ -2483,7 +2474,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
        spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
 
-       unfreeze_page(head);
+       remap_page(head);
 
        for (i = 0; i < HPAGE_PMD_NR; i++) {
                struct page *subpage = head + i;
@@ -2626,6 +2617,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        int count, mapcount, extra_pins, ret;
        bool mlocked;
        unsigned long flags;
+       pgoff_t end;
 
        VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -2648,6 +2640,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                        ret = -EBUSY;
                        goto out;
                }
+               end = -1;
                mapping = NULL;
                anon_vma_lock_write(anon_vma);
        } else {
@@ -2661,10 +2654,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 
                anon_vma = NULL;
                i_mmap_lock_read(mapping);
+
+               /*
+                *__split_huge_page() may need to trim off pages beyond EOF:
+                * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
+                * which cannot be nested inside the page tree lock. So note
+                * end now: i_size itself may be changed at any moment, but
+                * head page lock is good enough to serialize the trimming.
+                */
+               end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
        }
 
        /*
-        * Racy check if we can split the page, before freeze_page() will
+        * Racy check if we can split the page, before unmap_page() will
         * split PMDs
         */
        if (!can_split_huge_page(head, &extra_pins)) {
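With end computed up front under the head page lock (and outside the irq-safe i_pages lock), __split_huge_page() can later drop tail pages beyond EOF without calling i_size_read() in atomic context. The value is just a ceiling division to the first page index past EOF; a stand-alone example:

#include <stdio.h>

#define PAGE_SIZE	4096UL		/* assumption: 4 KiB pages */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long i_size = 5000;	/* example file size in bytes */
	unsigned long end = DIV_ROUND_UP(i_size, PAGE_SIZE);

	/* tail pages with index >= end lie beyond EOF; here end == 2 */
	printf("end = %lu\n", end);
	return 0;
}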
@@ -2673,7 +2675,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        }
 
        mlocked = PageMlocked(page);
-       freeze_page(head);
+       unmap_page(head);
        VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
        /* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -2707,7 +2709,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                if (mapping)
                        __dec_node_page_state(page, NR_SHMEM_THPS);
                spin_unlock(&pgdata->split_queue_lock);
-               __split_huge_page(page, list, flags);
+               __split_huge_page(page, list, end, flags);
                if (PageSwapCache(head)) {
                        swp_entry_t entry = { .val = page_private(head) };
 
@@ -2727,7 +2729,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 fail:          if (mapping)
                        xa_unlock(&mapping->i_pages);
                spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
-               unfreeze_page(head);
+               remap_page(head);
                ret = -EBUSY;
        }