Merge tag 'powerpc-4.19-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc...
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 24 Aug 2018 16:34:23 +0000 (09:34 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 24 Aug 2018 16:34:23 +0000 (09:34 -0700)
Pull powerpc fixes from Michael Ellerman:

 - An implementation of the newly added hv_ops->flush() for the OPAL
   hvc console driver backends. I forgot to apply this after merging the
   hvc driver changes before the merge window.

 - Enable all PCI bridges at boot on powernv, to avoid races when
   multiple children of a bridge try to enable it simultaneously. This
   is a workaround until the PCI core can be enhanced to fix the races.

 - A fix to query PowerVM for the correct system topology at boot before
   initialising sched domains, seen in some configurations to cause
   broken scheduling etc.

 - A fix for pte_access_permitted() on "nohash" platforms.

 - Two commits to fix SIGBUS when using remap_pfn_range() seen on Power9
   due to a workaround when using the nest MMU (GPUs, accelerators).

 - Another fix to the VFIO code used by KVM; the previous fix had some
   bugs which caused guests to not start in some configurations.

 - A handful of other minor fixes.

Thanks to: Aneesh Kumar K.V, Benjamin Herrenschmidt, Christophe Leroy,
Hari Bathini, Luke Dashjr, Mahesh Salgaonkar, Nicholas Piggin, Paul
Mackerras, Srikar Dronamraju.

* tag 'powerpc-4.19-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux:
  powerpc/mce: Fix SLB rebolting during MCE recovery path.
  KVM: PPC: Book3S: Fix guest DMA when guest partially backed by THP pages
  powerpc/mm/radix: Only need the Nest MMU workaround for R -> RW transition
  powerpc/mm/books3s: Add new pte bit to mark pte temporarily invalid.
  powerpc/nohash: fix pte_access_permitted()
  powerpc/topology: Get topology for shared processors at boot
  powerpc64/ftrace: Include ftrace.h needed for enable/disable calls
  powerpc/powernv/pci: Work around races in PCI bridge enabling
  powerpc/fadump: cleanup crash memory ranges support
  powerpc/powernv: provide a console flush operation for opal hvc driver
  powerpc/traps: Avoid rate limit messages from show unhandled signals
  powerpc/64s: Fix PACA_IRQ_HARD_DIS accounting in idle_power4()

16 files changed:
arch/powerpc/include/asm/book3s/64/pgtable.h
arch/powerpc/include/asm/nohash/pgtable.h
arch/powerpc/include/asm/opal.h
arch/powerpc/include/asm/topology.h
arch/powerpc/kernel/fadump.c
arch/powerpc/kernel/idle_power4.S
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/traps.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/mm/mmu_context_iommu.c
arch/powerpc/mm/numa.c
arch/powerpc/mm/pgtable-radix.c
arch/powerpc/mm/slb.c
arch/powerpc/platforms/powernv/opal.c
arch/powerpc/platforms/powernv/pci-ioda.c
drivers/tty/hvc/hvc_opal.c

index 6761187..13a688f 100644 (file)
 
 #define _PAGE_PTE              0x4000000000000000UL    /* distinguishes PTEs from pointers */
 #define _PAGE_PRESENT          0x8000000000000000UL    /* pte contains a translation */
+/*
+ * We need to mark a pmd pte invalid while splitting. We can do that by clearing
+ * the _PAGE_PRESENT bit. But then that will be taken as a swap pte. In order to
+ * differentiate between the two, use a SW field when invalidating.
+ *
+ * We do that temporary invalidation for a regular pte entry in ptep_set_access_flags
+ *
+ * This is used only when _PAGE_PRESENT is cleared.
+ */
+#define _PAGE_INVALID          _RPAGE_SW0
 
 /*
  * Top and bottom bits of RPN which can be used by hash
@@ -568,7 +578,13 @@ static inline pte_t pte_clear_savedwrite(pte_t pte)
 
 static inline int pte_present(pte_t pte)
 {
-       return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT));
+       /*
+        * A pte is considered present if _PAGE_PRESENT is set.
+        * We also need to treat as present a pte that is temporarily marked
+        * invalid during ptep_set_access_flags. Hence we look for _PAGE_INVALID
+        * if we find _PAGE_PRESENT cleared.
+        */
+       return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
 }
 
 #ifdef CONFIG_PPC_MEM_KEYS
index 2160be2..b321c82 100644 (file)
@@ -51,17 +51,14 @@ static inline int pte_present(pte_t pte)
 #define pte_access_permitted pte_access_permitted
 static inline bool pte_access_permitted(pte_t pte, bool write)
 {
-       unsigned long pteval = pte_val(pte);
        /*
         * A read-only access is controlled by _PAGE_USER bit.
         * We have _PAGE_READ set for WRITE and EXECUTE
         */
-       unsigned long need_pte_bits = _PAGE_PRESENT | _PAGE_USER;
-
-       if (write)
-               need_pte_bits |= _PAGE_WRITE;
+       if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
+               return false;
 
-       if ((pteval & need_pte_bits) != need_pte_bits)
+       if (write && !pte_write(pte))
                return false;
 
        return true;
index 834e7e2..ff38664 100644 (file)
@@ -308,6 +308,7 @@ extern void opal_configure_cores(void);
 extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
 extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
 extern int opal_put_chars_atomic(uint32_t vtermno, const char *buf, int total_len);
+extern int opal_flush_chars(uint32_t vtermno, bool wait);
 extern int opal_flush_console(uint32_t vtermno);
 
 extern void hvc_opal_init_early(void);
index 16b0778..a4a718d 100644 (file)
@@ -92,6 +92,7 @@ extern int stop_topology_update(void);
 extern int prrn_is_enabled(void);
 extern int find_and_online_cpu_nid(int cpu);
 extern int timed_topology_update(int nsecs);
+extern void __init shared_proc_topology_init(void);
 #else
 static inline int start_topology_update(void)
 {
@@ -113,6 +114,10 @@ static inline int timed_topology_update(int nsecs)
 {
        return 0;
 }
+
+#ifdef CONFIG_SMP
+static inline void shared_proc_topology_init(void) {}
+#endif
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include <asm-generic/topology.h>
index 986ec47..a711d22 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/crash_dump.h>
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
+#include <linux/slab.h>
 
 #include <asm/debugfs.h>
 #include <asm/page.h>
@@ -1019,13 +1020,6 @@ static int fadump_setup_crash_memory_ranges(void)
        pr_debug("Setup crash memory ranges.\n");
        crash_mem_ranges = 0;
 
-       /* allocate memory for crash memory ranges for the first time */
-       if (!max_crash_mem_ranges) {
-               ret = allocate_crash_memory_ranges();
-               if (ret)
-                       return ret;
-       }
-
        /*
         * add the first memory chunk (RMA_START through boot_memory_size) as
         * a separate memory chunk. The reason is, at the time crash firmware
index dd7471f..a09b3c7 100644 (file)
@@ -32,6 +32,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
        cmpwi   0,r4,0
        beqlr
 
+       /* This sequence is similar to prep_irq_for_idle() */
+
        /* Hard disable interrupts */
        mfmsr   r7
        rldicl  r0,r7,48,1
@@ -41,10 +43,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
        /* Check if something happened while soft-disabled */
        lbz     r0,PACAIRQHAPPENED(r13)
        cmpwi   cr0,r0,0
-       bnelr
+       bne-    2f
 
-       /* Soft-enable interrupts */
+       /*
+        * Soft-enable interrupts. This will make power4_fixup_nap return
+        * to our caller with interrupts enabled (soft and hard). The caller
+        * can cope with either interrupts disabled or enabled upon return.
+        */
 #ifdef CONFIG_TRACE_IRQFLAGS
+       /* Tell the tracer interrupts are on, because idle responds to them. */
        mflr    r0
        std     r0,16(r1)
        stdu    r1,-128(r1)
@@ -73,3 +80,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
        isync
        b       1b
 
+2:     /* Return if an interrupt had happened while soft disabled */
+       /* Set the HARD_DIS flag because interrupts are now hard disabled */
+       ori     r0,r0,PACA_IRQ_HARD_DIS
+       stb     r0,PACAIRQHAPPENED(r13)
+       blr
index b19d832..61c1fad 100644 (file)
@@ -1160,6 +1160,11 @@ void __init smp_cpus_done(unsigned int max_cpus)
        if (smp_ops && smp_ops->bringup_done)
                smp_ops->bringup_done();
 
+       /*
+        * On a shared LPAR, associativity needs to be requested.
+        * Hence, get numa topology before dumping cpu topology
+        */
+       shared_proc_topology_init();
        dump_numa_cpu_topology();
 
        /*
index 070e96f..c85adb8 100644 (file)
@@ -315,22 +315,21 @@ void user_single_step_siginfo(struct task_struct *tsk,
        info->si_addr = (void __user *)regs->nip;
 }
 
-static bool show_unhandled_signals_ratelimited(void)
+static void show_signal_msg(int signr, struct pt_regs *regs, int code,
+                           unsigned long addr)
 {
        static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);
-       return show_unhandled_signals && __ratelimit(&rs);
-}
 
-static void show_signal_msg(int signr, struct pt_regs *regs, int code,
-                           unsigned long addr)
-{
-       if (!show_unhandled_signals_ratelimited())
+       if (!show_unhandled_signals)
                return;
 
        if (!unhandled_signal(current, signr))
                return;
 
+       if (!__ratelimit(&rs))
+               return;
+
        pr_info("%s[%d]: %s (%d) at %lx nip %lx lr %lx code %x",
                current->comm, current->pid, signame(signr), signr,
                addr, regs->nip, regs->link, code);
index 574fc1d..3e3a715 100644 (file)
@@ -46,6 +46,7 @@
 #include <linux/compiler.h>
 #include <linux/of.h>
 
+#include <asm/ftrace.h>
 #include <asm/reg.h>
 #include <asm/ppc-opcode.h>
 #include <asm/asm-prototypes.h>
index a4ca576..c9ee9e2 100644 (file)
@@ -129,6 +129,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
        long i, j, ret = 0, locked_entries = 0;
        unsigned int pageshift;
        unsigned long flags;
+       unsigned long cur_ua;
        struct page *page = NULL;
 
        mutex_lock(&mem_list_mutex);
@@ -177,7 +178,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
        }
 
        for (i = 0; i < entries; ++i) {
-               if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+               cur_ua = ua + (i << PAGE_SHIFT);
+               if (1 != get_user_pages_fast(cur_ua,
                                        1/* pages */, 1/* iswrite */, &page)) {
                        ret = -EFAULT;
                        for (j = 0; j < i; ++j)
@@ -196,7 +198,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
                if (is_migrate_cma_page(page)) {
                        if (mm_iommu_move_page_from_cma(page))
                                goto populate;
-                       if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+                       if (1 != get_user_pages_fast(cur_ua,
                                                1/* pages */, 1/* iswrite */,
                                                &page)) {
                                ret = -EFAULT;
@@ -210,20 +212,21 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
                }
 populate:
                pageshift = PAGE_SHIFT;
-               if (PageCompound(page)) {
+               if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
                        pte_t *pte;
                        struct page *head = compound_head(page);
                        unsigned int compshift = compound_order(head);
+                       unsigned int pteshift;
 
                        local_irq_save(flags); /* disables as well */
-                       pte = find_linux_pte(mm->pgd, ua, NULL, &pageshift);
-                       local_irq_restore(flags);
+                       pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
 
                        /* Double check it is still the same pinned page */
                        if (pte && pte_page(*pte) == head &&
-                                       pageshift == compshift)
-                               pageshift = max_t(unsigned int, pageshift,
+                           pteshift == compshift + PAGE_SHIFT)
+                               pageshift = max_t(unsigned int, pteshift,
                                                PAGE_SHIFT);
+                       local_irq_restore(flags);
                }
                mem->pageshift = min(mem->pageshift, pageshift);
                mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
index 0c7e05d..35ac542 100644 (file)
@@ -1078,7 +1078,6 @@ static int prrn_enabled;
 static void reset_topology_timer(void);
 static int topology_timer_secs = 1;
 static int topology_inited;
-static int topology_update_needed;
 
 /*
  * Change polling interval for associativity changes.
@@ -1306,11 +1305,8 @@ int numa_update_cpu_topology(bool cpus_locked)
        struct device *dev;
        int weight, new_nid, i = 0;
 
-       if (!prrn_enabled && !vphn_enabled) {
-               if (!topology_inited)
-                       topology_update_needed = 1;
+       if (!prrn_enabled && !vphn_enabled && topology_inited)
                return 0;
-       }
 
        weight = cpumask_weight(&cpu_associativity_changes_mask);
        if (!weight)
@@ -1423,7 +1419,6 @@ int numa_update_cpu_topology(bool cpus_locked)
 
 out:
        kfree(updates);
-       topology_update_needed = 0;
        return changed;
 }
 
@@ -1551,6 +1546,15 @@ int prrn_is_enabled(void)
        return prrn_enabled;
 }
 
+void __init shared_proc_topology_init(void)
+{
+       if (lppaca_shared_proc(get_lppaca())) {
+               bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
+                           nr_cpumask_bits);
+               numa_update_cpu_topology(false);
+       }
+}
+
 static int topology_read(struct seq_file *file, void *v)
 {
        if (vphn_enabled || prrn_enabled)
@@ -1608,10 +1612,6 @@ static int topology_update_init(void)
                return -ENOMEM;
 
        topology_inited = 1;
-       if (topology_update_needed)
-               bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
-                                       nr_cpumask_bits);
-
        return 0;
 }
 device_initcall(topology_update_init);
index 7be99fd..c879979 100644 (file)
@@ -1045,20 +1045,22 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
        struct mm_struct *mm = vma->vm_mm;
        unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
                                              _PAGE_RW | _PAGE_EXEC);
+
+       unsigned long change = pte_val(entry) ^ pte_val(*ptep);
        /*
         * To avoid NMMU hang while relaxing access, we need mark
         * the pte invalid in between.
         */
-       if (atomic_read(&mm->context.copros) > 0) {
+       if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
                unsigned long old_pte, new_pte;
 
-               old_pte = __radix_pte_update(ptep, ~0, 0);
+               old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
                /*
                 * new value of pte
                 */
                new_pte = old_pte | set;
                radix__flush_tlb_page_psize(mm, address, psize);
-               __radix_pte_update(ptep, 0, new_pte);
+               __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
        } else {
                __radix_pte_update(ptep, 0, set);
                /*
index 0b095fa..9f574e5 100644 (file)
@@ -70,7 +70,7 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
 
 static inline void slb_shadow_clear(enum slb_index index)
 {
-       WRITE_ONCE(get_slb_shadow()->save_area[index].esid, 0);
+       WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
 }
 
 static inline void create_shadowed_slbe(unsigned long ea, int ssize,
index 404c379..38fe408 100644 (file)
@@ -370,12 +370,8 @@ static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, b
        olen = cpu_to_be64(total_len);
        rc = opal_console_write(vtermno, &olen, data);
        if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
-               if (rc == OPAL_BUSY_EVENT) {
-                       mdelay(OPAL_BUSY_DELAY_MS);
+               if (rc == OPAL_BUSY_EVENT)
                        opal_poll_events(NULL);
-               } else if (rc == OPAL_BUSY_EVENT) {
-                       mdelay(OPAL_BUSY_DELAY_MS);
-               }
                written = -EAGAIN;
                goto out;
        }
@@ -401,15 +397,6 @@ out:
        if (atomic)
                spin_unlock_irqrestore(&opal_write_lock, flags);
 
-       /* In the -EAGAIN case, callers loop, so we have to flush the console
-        * here in case they have interrupts off (and we don't want to wait
-        * for async flushing if we can make immediate progress here). If
-        * necessary the API could be made entirely non-flushing if the
-        * callers had a ->flush API to use.
-        */
-       if (written == -EAGAIN)
-               opal_flush_console(vtermno);
-
        return written;
 }
 
@@ -429,40 +416,74 @@ int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len)
        return __opal_put_chars(vtermno, data, total_len, true);
 }
 
-int opal_flush_console(uint32_t vtermno)
+static s64 __opal_flush_console(uint32_t vtermno)
 {
        s64 rc;
 
        if (!opal_check_token(OPAL_CONSOLE_FLUSH)) {
                __be64 evt;
 
-               WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
                /*
                 * If OPAL_CONSOLE_FLUSH is not implemented in the firmware,
                 * the console can still be flushed by calling the polling
                 * function while it has OPAL_EVENT_CONSOLE_OUTPUT events.
                 */
-               do {
-                       opal_poll_events(&evt);
-               } while (be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT);
+               WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
+
+               opal_poll_events(&evt);
+               if (!(be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT))
+                       return OPAL_SUCCESS;
+               return OPAL_BUSY;
 
-               return OPAL_SUCCESS;
+       } else {
+               rc = opal_console_flush(vtermno);
+               if (rc == OPAL_BUSY_EVENT) {
+                       opal_poll_events(NULL);
+                       rc = OPAL_BUSY;
+               }
+               return rc;
        }
 
-       do  {
-               rc = OPAL_BUSY;
-               while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
-                       rc = opal_console_flush(vtermno);
-                       if (rc == OPAL_BUSY_EVENT) {
-                               mdelay(OPAL_BUSY_DELAY_MS);
-                               opal_poll_events(NULL);
-                       } else if (rc == OPAL_BUSY) {
-                               mdelay(OPAL_BUSY_DELAY_MS);
+}
+
+/*
+ * opal_flush_console spins until the console is flushed
+ */
+int opal_flush_console(uint32_t vtermno)
+{
+       for (;;) {
+               s64 rc = __opal_flush_console(vtermno);
+
+               if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
+                       mdelay(1);
+                       continue;
+               }
+
+               return opal_error_code(rc);
+       }
+}
+
+/*
+ * opal_flush_chars is an hvc interface that sleeps until the console is
+ * flushed if wait is set; otherwise it returns -EBUSY if the console has data,
+ * or -EAGAIN if it has data and some of it was flushed.
+ */
+int opal_flush_chars(uint32_t vtermno, bool wait)
+{
+       for (;;) {
+               s64 rc = __opal_flush_console(vtermno);
+
+               if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
+                       if (wait) {
+                               msleep(OPAL_BUSY_DELAY_MS);
+                               continue;
                        }
+                       if (rc == OPAL_PARTIAL)
+                               return -EAGAIN;
                }
-       } while (rc == OPAL_PARTIAL); /* More to flush */
 
-       return opal_error_code(rc);
+               return opal_error_code(rc);
+       }
 }
 
 static int opal_recover_mce(struct pt_regs *regs,
index 4e6302b..cde7102 100644 (file)
@@ -3228,12 +3228,49 @@ static void pnv_pci_ioda_create_dbgfs(void)
 #endif /* CONFIG_DEBUG_FS */
 }
 
+static void pnv_pci_enable_bridge(struct pci_bus *bus)
+{
+       struct pci_dev *dev = bus->self;
+       struct pci_bus *child;
+
+       /* Empty bus ? bail */
+       if (list_empty(&bus->devices))
+               return;
+
+       /*
+        * If there's a bridge associated with that bus enable it. This works
+        * around races in the generic code if the enabling is done during
+        * parallel probing. This can be removed once those races have been
+        * fixed.
+        */
+       if (dev) {
+               int rc = pci_enable_device(dev);
+               if (rc)
+                       pci_err(dev, "Error enabling bridge (%d)\n", rc);
+               pci_set_master(dev);
+       }
+
+       /* Perform the same to child busses */
+       list_for_each_entry(child, &bus->children, node)
+               pnv_pci_enable_bridge(child);
+}
+
+static void pnv_pci_enable_bridges(void)
+{
+       struct pci_controller *hose;
+
+       list_for_each_entry(hose, &hose_list, list_node)
+               pnv_pci_enable_bridge(hose->bus);
+}
+
 static void pnv_pci_ioda_fixup(void)
 {
        pnv_pci_ioda_setup_PEs();
        pnv_pci_ioda_setup_iommu_api();
        pnv_pci_ioda_create_dbgfs();
 
+       pnv_pci_enable_bridges();
+
 #ifdef CONFIG_EEH
        pnv_eeh_post_init();
 #endif
index f631f8b..77baf89 100644 (file)
@@ -52,6 +52,7 @@ static u32 hvc_opal_boot_termno;
 static const struct hv_ops hvc_opal_raw_ops = {
        .get_chars = opal_get_chars,
        .put_chars = opal_put_chars,
+       .flush = opal_flush_chars,
        .notifier_add = notifier_add_irq,
        .notifier_del = notifier_del_irq,
        .notifier_hangup = notifier_hangup_irq,
@@ -141,6 +142,7 @@ static int hvc_opal_hvsi_tiocmset(struct hvc_struct *hp, unsigned int set,
 static const struct hv_ops hvc_opal_hvsi_ops = {
        .get_chars = hvc_opal_hvsi_get_chars,
        .put_chars = hvc_opal_hvsi_put_chars,
+       .flush = opal_flush_chars,
        .notifier_add = hvc_opal_hvsi_open,
        .notifier_del = hvc_opal_hvsi_close,
        .notifier_hangup = hvc_opal_hvsi_hangup,