Merge tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 14 Aug 2018 17:23:25 +0000 (10:23 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 14 Aug 2018 17:23:25 +0000 (10:23 -0700)
Pull block updates from Jens Axboe:
 "First pull request for this merge window, there will also be a
  followup request with some stragglers.

  This pull request contains:

   - Fix for a thundering herd issue in the wbt block code (Anchal
     Agarwal)

   - A few NVMe pull requests:
      * Improved tracepoints (Keith)
      * Larger inline data support for RDMA (Steve Wise)
      * RDMA setup/teardown fixes (Sagi)
      * Effects log support for NVMe target (Chaitanya Kulkarni)
      * Buffered IO support for NVMe target (Chaitanya Kulkarni)
      * TP4004 (ANA) support (Christoph)
      * Various NVMe fixes

   - Block io-latency controller support. Much needed support for
     properly containing block devices. (Josef)

   - Series improving how we handle sense information on the stack
     (Kees)

   - Lightnvm fixes and updates/improvements (Mathias/Javier et al)

   - Zoned device support for null_blk (Matias)

   - AIX partition fixes (Mauricio Faria de Oliveira)

   - DIF checksum code made generic (Max Gurtovoy)

   - Add support for discard in iostats (Michael Callahan / Tejun)

   - Set of updates for BFQ (Paolo)

   - Removal of async write support for bsg (Christoph)

   - Bio page dirtying and clone fixups (Christoph)

   - Set of bcache fix/changes (via Coly)

   - Series improving blk-mq queue setup/teardown speed (Ming)

   - Series improving merging performance on blk-mq (Ming)

   - Lots of other fixes and cleanups from a slew of folks"

* tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block: (190 commits)
  blkcg: Make blkg_root_lookup() work for queues in bypass mode
  bcache: fix error setting writeback_rate through sysfs interface
  null_blk: add lock drop/acquire annotation
  Blk-throttle: reduce tail io latency when iops limit is enforced
  block: paride: pd: mark expected switch fall-throughs
  block: Ensure that a request queue is dissociated from the cgroup controller
  block: Introduce blk_exit_queue()
  blkcg: Introduce blkg_root_lookup()
  block: Remove two superfluous #include directives
  blk-mq: count the hctx as active before allocating tag
  block: bvec_nr_vecs() returns value for wrong slab
  bcache: trivial - remove tailing backslash in macro BTREE_FLAG
  bcache: make the pr_err statement used for ENOENT only in sysfs_attatch section
  bcache: set max writeback rate when I/O request is idle
  bcache: add code comments for bset.c
  bcache: fix mistaken comments in request.c
  bcache: fix mistaken code comments in bcache.h
  bcache: add a comment in super.c
  bcache: avoid unncessary cache prefetch bch_btree_node_get()
  bcache: display rate debug parameters to 0 when writeback is not running
  ...

21 files changed:
block/bio.c
block/blk-core.c
block/blk-mq-tag.c
block/blk-mq.c
drivers/block/zram/zram_drv.c
drivers/nvme/host/fabrics.c
drivers/nvme/host/fc.c
drivers/nvme/host/rdma.c
drivers/nvme/target/configfs.c
drivers/nvme/target/core.c
drivers/nvme/target/loop.c
fs/block_dev.c
fs/ext4/super.c
fs/ext4/sysfs.c
include/linux/blk-mq.h
include/linux/sched.h
kernel/fork.c
mm/memcontrol.c
mm/memory.c
mm/shmem.c
mm/swapfile.c

diff --combined block/bio.c
  #include <linux/mempool.h>
  #include <linux/workqueue.h>
  #include <linux/cgroup.h>
+ #include <linux/blk-cgroup.h>
  
  #include <trace/events/block.h>
  #include "blk.h"
+ #include "blk-rq-qos.h"
  
  /*
   * Test patch to inline a certain number of bi_io_vec's inside the bio
@@@ -156,7 -158,7 +158,7 @@@ out
  
  unsigned int bvec_nr_vecs(unsigned short idx)
  {
-       return bvec_slabs[idx].nr_vecs;
+       return bvec_slabs[--idx].nr_vecs;
  }
  
  void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
@@@ -644,83 -646,6 +646,6 @@@ struct bio *bio_clone_fast(struct bio *
  }
  EXPORT_SYMBOL(bio_clone_fast);
  
- /**
-  *    bio_clone_bioset - clone a bio
-  *    @bio_src: bio to clone
-  *    @gfp_mask: allocation priority
-  *    @bs: bio_set to allocate from
-  *
-  *    Clone bio. Caller will own the returned bio, but not the actual data it
-  *    points to. Reference count of returned bio will be one.
-  */
- struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-                            struct bio_set *bs)
- {
-       struct bvec_iter iter;
-       struct bio_vec bv;
-       struct bio *bio;
-       /*
-        * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
-        * bio_src->bi_io_vec to bio->bi_io_vec.
-        *
-        * We can't do that anymore, because:
-        *
-        *  - The point of cloning the biovec is to produce a bio with a biovec
-        *    the caller can modify: bi_idx and bi_bvec_done should be 0.
-        *
-        *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if
-        *    we tried to clone the whole thing bio_alloc_bioset() would fail.
-        *    But the clone should succeed as long as the number of biovecs we
-        *    actually need to allocate is fewer than BIO_MAX_PAGES.
-        *
-        *  - Lastly, bi_vcnt should not be looked at or relied upon by code
-        *    that does not own the bio - reason being drivers don't use it for
-        *    iterating over the biovec anymore, so expecting it to be kept up
-        *    to date (i.e. for clones that share the parent biovec) is just
-        *    asking for trouble and would force extra work on
-        *    __bio_clone_fast() anyways.
-        */
-       bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
-       if (!bio)
-               return NULL;
-       bio->bi_disk            = bio_src->bi_disk;
-       bio->bi_opf             = bio_src->bi_opf;
-       bio->bi_write_hint      = bio_src->bi_write_hint;
-       bio->bi_iter.bi_sector  = bio_src->bi_iter.bi_sector;
-       bio->bi_iter.bi_size    = bio_src->bi_iter.bi_size;
-       switch (bio_op(bio)) {
-       case REQ_OP_DISCARD:
-       case REQ_OP_SECURE_ERASE:
-       case REQ_OP_WRITE_ZEROES:
-               break;
-       case REQ_OP_WRITE_SAME:
-               bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
-               break;
-       default:
-               bio_for_each_segment(bv, bio_src, iter)
-                       bio->bi_io_vec[bio->bi_vcnt++] = bv;
-               break;
-       }
-       if (bio_integrity(bio_src)) {
-               int ret;
-               ret = bio_integrity_clone(bio, bio_src, gfp_mask);
-               if (ret < 0) {
-                       bio_put(bio);
-                       return NULL;
-               }
-       }
-       bio_clone_blkcg_association(bio, bio_src);
-       return bio;
- }
- EXPORT_SYMBOL(bio_clone_bioset);
  /**
   *    bio_add_pc_page -       attempt to add page to bio
   *    @q: the target queue
@@@ -903,27 -828,25 +828,27 @@@ int bio_add_page(struct bio *bio, struc
  EXPORT_SYMBOL(bio_add_page);
  
  /**
 - * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
 + * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
   * @bio: bio to add pages to
   * @iter: iov iterator describing the region to be mapped
   *
 - * Pins as many pages from *iter and appends them to @bio's bvec array. The
 + * Pins pages from *iter and appends them to @bio's bvec array. The
   * pages will have to be released using put_page() when done.
 + * For multi-segment *iter, this function only adds pages from the
 + * next non-empty segment of the iov iterator.
   */
 -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 +static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
  {
 -      unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
 +      unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx;
        struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
        struct page **pages = (struct page **)bv;
 -      size_t offset, diff;
 +      size_t offset;
        ssize_t size;
  
        size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
        if (unlikely(size <= 0))
                return size ? size : -EFAULT;
 -      nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
 +      idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
  
        /*
         * Deep magic below:  We need to walk the pinned pages backwards
        bio->bi_iter.bi_size += size;
        bio->bi_vcnt += nr_pages;
  
 -      diff = (nr_pages * PAGE_SIZE - offset) - size;
 -      while (nr_pages--) {
 -              bv[nr_pages].bv_page = pages[nr_pages];
 -              bv[nr_pages].bv_len = PAGE_SIZE;
 -              bv[nr_pages].bv_offset = 0;
 +      while (idx--) {
 +              bv[idx].bv_page = pages[idx];
 +              bv[idx].bv_len = PAGE_SIZE;
 +              bv[idx].bv_offset = 0;
        }
  
        bv[0].bv_offset += offset;
        bv[0].bv_len -= offset;
 -      if (diff)
 -              bv[bio->bi_vcnt - 1].bv_len -= diff;
 +      bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size;
  
        iov_iter_advance(iter, size);
        return 0;
  }
 +
 +/**
 + * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
 + * @bio: bio to add pages to
 + * @iter: iov iterator describing the region to be mapped
 + *
 + * Pins pages from *iter and appends them to @bio's bvec array. The
 + * pages will have to be released using put_page() when done.
 + * The function tries, but does not guarantee, to pin as many pages as
 + * fit into the bio, or are requested in *iter, whatever is smaller.
 + * If MM encounters an error pinning the requested pages, it stops.
 + * Error is returned only if 0 pages could be pinned.
 + */
 +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 +{
 +      unsigned short orig_vcnt = bio->bi_vcnt;
 +
 +      do {
 +              int ret = __bio_iov_iter_get_pages(bio, iter);
 +
 +              if (unlikely(ret))
 +                      return bio->bi_vcnt > orig_vcnt ? 0 : ret;
 +
 +      } while (iov_iter_count(iter) && !bio_full(bio));
 +
 +      return 0;
 +}
  EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
  
  static void submit_bio_wait_endio(struct bio *bio)
@@@ -1661,10 -1559,8 +1586,8 @@@ void bio_set_pages_dirty(struct bio *bi
        int i;
  
        bio_for_each_segment_all(bvec, bio, i) {
-               struct page *page = bvec->bv_page;
-               if (page && !PageCompound(page))
-                       set_page_dirty_lock(page);
+               if (!PageCompound(bvec->bv_page))
+                       set_page_dirty_lock(bvec->bv_page);
        }
  }
  EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
@@@ -1674,19 -1570,15 +1597,15 @@@ static void bio_release_pages(struct bi
        struct bio_vec *bvec;
        int i;
  
-       bio_for_each_segment_all(bvec, bio, i) {
-               struct page *page = bvec->bv_page;
-               if (page)
-                       put_page(page);
-       }
+       bio_for_each_segment_all(bvec, bio, i)
+               put_page(bvec->bv_page);
  }
  
  /*
   * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
   * If they are, then fine.  If, however, some pages are clean then they must
   * have been written out during the direct-IO read.  So we take another ref on
-  * the BIO and the offending pages and re-dirty the pages in process context.
+  * the BIO and re-dirty the pages in process context.
   *
   * It is expected that bio_check_pages_dirty() will wholly own the BIO from
   * here on.  It will run one put_page() against each page and will run one
@@@ -1704,78 -1596,70 +1623,70 @@@ static struct bio *bio_dirty_list
   */
  static void bio_dirty_fn(struct work_struct *work)
  {
-       unsigned long flags;
-       struct bio *bio;
+       struct bio *bio, *next;
  
-       spin_lock_irqsave(&bio_dirty_lock, flags);
-       bio = bio_dirty_list;
+       spin_lock_irq(&bio_dirty_lock);
+       next = bio_dirty_list;
        bio_dirty_list = NULL;
-       spin_unlock_irqrestore(&bio_dirty_lock, flags);
+       spin_unlock_irq(&bio_dirty_lock);
  
-       while (bio) {
-               struct bio *next = bio->bi_private;
+       while ((bio = next) != NULL) {
+               next = bio->bi_private;
  
                bio_set_pages_dirty(bio);
                bio_release_pages(bio);
                bio_put(bio);
-               bio = next;
        }
  }
  
  void bio_check_pages_dirty(struct bio *bio)
  {
        struct bio_vec *bvec;
-       int nr_clean_pages = 0;
+       unsigned long flags;
        int i;
  
        bio_for_each_segment_all(bvec, bio, i) {
-               struct page *page = bvec->bv_page;
-               if (PageDirty(page) || PageCompound(page)) {
-                       put_page(page);
-                       bvec->bv_page = NULL;
-               } else {
-                       nr_clean_pages++;
-               }
+               if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
+                       goto defer;
        }
  
-       if (nr_clean_pages) {
-               unsigned long flags;
-               spin_lock_irqsave(&bio_dirty_lock, flags);
-               bio->bi_private = bio_dirty_list;
-               bio_dirty_list = bio;
-               spin_unlock_irqrestore(&bio_dirty_lock, flags);
-               schedule_work(&bio_dirty_work);
-       } else {
-               bio_put(bio);
-       }
+       bio_release_pages(bio);
+       bio_put(bio);
+       return;
+ defer:
+       spin_lock_irqsave(&bio_dirty_lock, flags);
+       bio->bi_private = bio_dirty_list;
+       bio_dirty_list = bio;
+       spin_unlock_irqrestore(&bio_dirty_lock, flags);
+       schedule_work(&bio_dirty_work);
  }
  EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
  
- void generic_start_io_acct(struct request_queue *q, int rw,
+ void generic_start_io_acct(struct request_queue *q, int op,
                           unsigned long sectors, struct hd_struct *part)
  {
+       const int sgrp = op_stat_group(op);
        int cpu = part_stat_lock();
  
        part_round_stats(q, cpu, part);
-       part_stat_inc(cpu, part, ios[rw]);
-       part_stat_add(cpu, part, sectors[rw], sectors);
-       part_inc_in_flight(q, part, rw);
+       part_stat_inc(cpu, part, ios[sgrp]);
+       part_stat_add(cpu, part, sectors[sgrp], sectors);
+       part_inc_in_flight(q, part, op_is_write(op));
  
        part_stat_unlock();
  }
  EXPORT_SYMBOL(generic_start_io_acct);
  
- void generic_end_io_acct(struct request_queue *q, int rw,
+ void generic_end_io_acct(struct request_queue *q, int req_op,
                         struct hd_struct *part, unsigned long start_time)
  {
        unsigned long duration = jiffies - start_time;
+       const int sgrp = op_stat_group(req_op);
        int cpu = part_stat_lock();
  
-       part_stat_add(cpu, part, ticks[rw], duration);
+       part_stat_add(cpu, part, ticks[sgrp], duration);
        part_round_stats(q, cpu, part);
-       part_dec_in_flight(q, part, rw);
+       part_dec_in_flight(q, part, op_is_write(req_op));
  
        part_stat_unlock();
  }
@@@ -1834,6 -1718,9 +1745,9 @@@ again
        if (!bio_integrity_endio(bio))
                return;
  
+       if (bio->bi_disk)
+               rq_qos_done_bio(bio->bi_disk->queue, bio);
        /*
         * Need to have a real endio function for chained bios, otherwise
         * various corner cases will break (like stacking block devices that
@@@ -1893,7 -1780,6 +1807,7 @@@ struct bio *bio_split(struct bio *bio, 
                bio_integrity_trim(split);
  
        bio_advance(bio, split->bi_iter.bi_size);
 +      bio->bi_iter.bi_done = 0;
  
        if (bio_flagged(bio, BIO_TRACE_COMPLETION))
                bio_set_flag(split, BIO_TRACE_COMPLETION);
@@@ -2042,6 -1928,30 +1956,30 @@@ EXPORT_SYMBOL(bioset_init_from_src)
  
  #ifdef CONFIG_BLK_CGROUP
  
+ #ifdef CONFIG_MEMCG
+ /**
+  * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
+  * @bio: target bio
+  * @page: the page to lookup the blkcg from
+  *
+  * Associate @bio with the blkcg from @page's owning memcg.  This works like
+  * every other associate function wrt references.
+  */
+ int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
+ {
+       struct cgroup_subsys_state *blkcg_css;
+       if (unlikely(bio->bi_css))
+               return -EBUSY;
+       if (!page->mem_cgroup)
+               return 0;
+       blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
+                                    &io_cgrp_subsys);
+       bio->bi_css = blkcg_css;
+       return 0;
+ }
+ #endif /* CONFIG_MEMCG */
  /**
   * bio_associate_blkcg - associate a bio with the specified blkcg
   * @bio: target bio
@@@ -2064,6 -1974,24 +2002,24 @@@ int bio_associate_blkcg(struct bio *bio
  }
  EXPORT_SYMBOL_GPL(bio_associate_blkcg);
  
+ /**
+  * bio_associate_blkg - associate a bio with the specified blkg
+  * @bio: target bio
+  * @blkg: the blkg to associate
+  *
+  * Associate @bio with the blkg specified by @blkg.  This is the queue specific
+  * blkcg information associated with the @bio, a reference will be taken on the
+  * @blkg and will be freed when the bio is freed.
+  */
+ int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+ {
+       if (unlikely(bio->bi_blkg))
+               return -EBUSY;
+       blkg_get(blkg);
+       bio->bi_blkg = blkg;
+       return 0;
+ }
  /**
   * bio_disassociate_task - undo bio_associate_current()
   * @bio: target bio
@@@ -2078,6 -2006,10 +2034,10 @@@ void bio_disassociate_task(struct bio *
                css_put(bio->bi_css);
                bio->bi_css = NULL;
        }
+       if (bio->bi_blkg) {
+               blkg_put(bio->bi_blkg);
+               bio->bi_blkg = NULL;
+       }
  }
  
  /**
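
The generic_start_io_acct()/generic_end_io_acct() rework above switches the helpers from a read/write flag to the request op, so op_stat_group() can steer discards into their own ios/sectors/ticks buckets instead of folding them into the write numbers. As a hedged sketch only (my_dev and my_make_request() are invented names and the actual data transfer is elided), a bio-based driver would account a bio roughly like this; zram's zram_bvec_rw() later in this merge does the same thing with bio_op():

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/jiffies.h>

/* Hypothetical driver state; only ->disk matters for the accounting calls. */
struct my_dev {
	struct gendisk *disk;
};

static blk_qc_t my_make_request(struct request_queue *q, struct bio *bio)
{
	struct my_dev *dev = q->queuedata;	/* assumed set at probe time */
	unsigned int op = bio_op(bio);
	unsigned long start = jiffies;

	/* Stats are bumped per op_stat_group(op), so discards get their own
	 * counters rather than being counted as writes. */
	generic_start_io_acct(q, op, bio_sectors(bio), &dev->disk->part0);

	/* ... move the data ... */

	generic_end_io_acct(q, op, &dev->disk->part0, start);
	bio_endio(bio);
	return BLK_QC_T_NONE;
}
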
diff --combined block/blk-core.c
@@@ -42,7 -42,7 +42,7 @@@
  #include "blk.h"
  #include "blk-mq.h"
  #include "blk-mq-sched.h"
- #include "blk-wbt.h"
+ #include "blk-rq-qos.h"
  
  #ifdef CONFIG_DEBUG_FS
  struct dentry *blk_debugfs_root;
@@@ -715,6 -715,35 +715,35 @@@ void blk_set_queue_dying(struct request
  }
  EXPORT_SYMBOL_GPL(blk_set_queue_dying);
  
+ /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
+ void blk_exit_queue(struct request_queue *q)
+ {
+       /*
+        * Since the I/O scheduler exit code may access cgroup information,
+        * perform I/O scheduler exit before disassociating from the block
+        * cgroup controller.
+        */
+       if (q->elevator) {
+               ioc_clear_queue(q);
+               elevator_exit(q, q->elevator);
+               q->elevator = NULL;
+       }
+       /*
+        * Remove all references to @q from the block cgroup controller before
+        * restoring @q->queue_lock to avoid that restoring this pointer causes
+        * e.g. blkcg_print_blkgs() to crash.
+        */
+       blkcg_exit_queue(q);
+       /*
+        * Since the cgroup code may dereference the @q->backing_dev_info
+        * pointer, only decrease its reference count after having removed the
+        * association with the block cgroup controller.
+        */
+       bdi_put(q->backing_dev_info);
+ }
  /**
   * blk_cleanup_queue - shutdown a request queue
   * @q: request queue to shutdown
@@@ -762,9 -791,13 +791,13 @@@ void blk_cleanup_queue(struct request_q
         * make sure all in-progress dispatch are completed because
         * blk_freeze_queue() can only complete all requests, and
         * dispatch may still be in-progress since we dispatch requests
-        * from more than one contexts
+        * from more than one contexts.
+        *
+        * No need to quiesce queue if it isn't initialized yet since
+        * blk_freeze_queue() should be enough for cases of passthrough
+        * request.
         */
-       if (q->mq_ops)
+       if (q->mq_ops && blk_queue_init_done(q))
                blk_mq_quiesce_queue(q);
  
        /* for synchronous bio-based driver finish in-flight integrity i/o */
         */
        WARN_ON_ONCE(q->kobj.state_in_sysfs);
  
-       /*
-        * Since the I/O scheduler exit code may access cgroup information,
-        * perform I/O scheduler exit before disassociating from the block
-        * cgroup controller.
-        */
-       if (q->elevator) {
-               ioc_clear_queue(q);
-               elevator_exit(q, q->elevator);
-               q->elevator = NULL;
-       }
-       /*
-        * Remove all references to @q from the block cgroup controller before
-        * restoring @q->queue_lock to avoid that restoring this pointer causes
-        * e.g. blkcg_print_blkgs() to crash.
-        */
-       blkcg_exit_queue(q);
-       /*
-        * Since the cgroup code may dereference the @q->backing_dev_info
-        * pointer, only decrease its reference count after having removed the
-        * association with the block cgroup controller.
-        */
-       bdi_put(q->backing_dev_info);
+       blk_exit_queue(q);
  
        if (q->mq_ops)
                blk_mq_free_queue(q);
@@@ -1180,6 -1190,7 +1190,7 @@@ out_exit_flush_rq
                q->exit_rq_fn(q, q->fq->flush_rq);
  out_free_flush_queue:
        blk_free_flush_queue(q->fq);
+       q->fq = NULL;
        return -ENOMEM;
  }
  EXPORT_SYMBOL(blk_init_allocated_queue);
@@@ -1641,7 -1652,7 +1652,7 @@@ void blk_requeue_request(struct request
        blk_delete_timer(rq);
        blk_clear_rq_complete(rq);
        trace_block_rq_requeue(q, rq);
-       wbt_requeue(q->rq_wb, rq);
+       rq_qos_requeue(q, rq);
  
        if (rq->rq_flags & RQF_QUEUED)
                blk_queue_end_tag(q, rq);
@@@ -1748,7 -1759,7 +1759,7 @@@ void __blk_put_request(struct request_q
        /* this is a bio leak */
        WARN_ON(req->bio != NULL);
  
-       wbt_done(q->rq_wb, req);
+       rq_qos_done(q, req);
  
        /*
         * Request may not have originated from ll_rw_blk. if not,
@@@ -1982,7 -1993,6 +1993,6 @@@ static blk_qc_t blk_queue_bio(struct re
        int where = ELEVATOR_INSERT_SORT;
        struct request *req, *free;
        unsigned int request_count = 0;
-       unsigned int wb_acct;
  
        /*
         * low level driver can indicate that it wants pages above a
        }
  
  get_rq:
-       wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
+       rq_qos_throttle(q, bio, q->queue_lock);
  
        /*
         * Grab a free request. This is might sleep but can not fail.
        req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
        if (IS_ERR(req)) {
                blk_queue_exit(q);
-               __wbt_done(q->rq_wb, wb_acct);
+               rq_qos_cleanup(q, bio);
                if (PTR_ERR(req) == -ENOMEM)
                        bio->bi_status = BLK_STS_RESOURCE;
                else
                goto out_unlock;
        }
  
-       wbt_track(req, wb_acct);
+       rq_qos_track(q, req, bio);
  
        /*
         * After dropping the lock and possibly sleeping here, our request
@@@ -2155,12 -2165,11 +2165,12 @@@ static inline bool bio_check_ro(struct 
        if (part->policy && op_is_write(bio_op(bio))) {
                char b[BDEVNAME_SIZE];
  
 -              printk(KERN_ERR
 +              WARN_ONCE(1,
                       "generic_make_request: Trying to write "
                        "to read-only block-device %s (partno %d)\n",
                        bio_devname(bio, b), part->partno);
 -              return true;
 +              /* Older lvm-tools actually trigger this */
 +              return false;
        }
  
        return false;
@@@ -2700,13 -2709,13 +2710,13 @@@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes)
  void blk_account_io_completion(struct request *req, unsigned int bytes)
  {
        if (blk_do_io_stat(req)) {
-               const int rw = rq_data_dir(req);
+               const int sgrp = op_stat_group(req_op(req));
                struct hd_struct *part;
                int cpu;
  
                cpu = part_stat_lock();
                part = req->part;
-               part_stat_add(cpu, part, sectors[rw], bytes >> 9);
+               part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
                part_stat_unlock();
        }
  }
@@@ -2720,7 -2729,7 +2730,7 @@@ void blk_account_io_done(struct reques
         */
        if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
                unsigned long duration;
-               const int rw = rq_data_dir(req);
+               const int sgrp = op_stat_group(req_op(req));
                struct hd_struct *part;
                int cpu;
  
                cpu = part_stat_lock();
                part = req->part;
  
-               part_stat_inc(cpu, part, ios[rw]);
-               part_stat_add(cpu, part, ticks[rw], duration);
+               part_stat_inc(cpu, part, ios[sgrp]);
+               part_stat_add(cpu, part, ticks[sgrp], duration);
                part_round_stats(req->q, cpu, part);
-               part_dec_in_flight(req->q, part, rw);
+               part_dec_in_flight(req->q, part, rq_data_dir(req));
  
                hd_struct_put(part);
                part_stat_unlock();
@@@ -2751,9 -2760,9 +2761,9 @@@ static bool blk_pm_allow_request(struc
                return rq->rq_flags & RQF_PM;
        case RPM_SUSPENDED:
                return false;
+       default:
+               return true;
        }
-       return true;
  }
  #else
  static bool blk_pm_allow_request(struct request *rq)
@@@ -2980,7 -2989,7 +2990,7 @@@ void blk_start_request(struct request *
                req->throtl_size = blk_rq_sectors(req);
  #endif
                req->rq_flags |= RQF_STATS;
-               wbt_issue(req->q->rq_wb, req);
+               rq_qos_issue(req->q, req);
        }
  
        BUG_ON(blk_rq_is_complete(req));
@@@ -3053,6 -3062,10 +3063,10 @@@ EXPORT_SYMBOL_GPL(blk_steal_bios)
   *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
   *     %false return from this function.
   *
+  * Note:
+  *    The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
+  *    blk_rq_bytes() and in blk_update_request().
+  *
   * Return:
   *     %false - this request doesn't have any more data
   *     %true  - this request has more data
@@@ -3200,7 -3213,7 +3214,7 @@@ void blk_finish_request(struct request 
        blk_account_io_done(req, now);
  
        if (req->end_io) {
-               wbt_done(req->q->rq_wb, req);
+               rq_qos_done(q, req);
                req->end_io(req, error);
        } else {
                if (blk_bidi_rq(req))
@@@ -3763,9 -3776,11 +3777,11 @@@ EXPORT_SYMBOL(blk_finish_plug)
   */
  void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
  {
-       /* not support for RQF_PM and ->rpm_status in blk-mq yet */
-       if (q->mq_ops)
+       /* Don't enable runtime PM for blk-mq until it is ready */
+       if (q->mq_ops) {
+               pm_runtime_disable(dev);
                return;
+       }
  
        q->dev = dev;
        q->rpm_status = RPM_ACTIVE;
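
The wbt_*() call sites in this file are all converted to rq_qos_*() wrappers, which is what lets writeback throttling and the new io-latency controller share one set of per-queue hook points. The snippet below is only a paraphrase of that design with invented example_* names; the real definitions live in block/blk-rq-qos.h and may differ in detail:

#include <linux/blkdev.h>

struct example_rq_qos;

/* Each policy provides optional callbacks; only ->done is shown here. */
struct example_rq_qos_ops {
	void (*done)(struct example_rq_qos *rqos, struct request *rq);
	/* ...throttle, track, issue, requeue, cleanup, exit... */
};

/* Policies are chained per queue, e.g. wbt and io-latency. */
struct example_rq_qos {
	const struct example_rq_qos_ops *ops;
	struct example_rq_qos *next;
};

/* A wrapper such as rq_qos_done(q, rq) then amounts to walking the chain so
 * that every registered policy sees the completed request. */
static void example_rq_qos_done(struct example_rq_qos *chain, struct request *rq)
{
	struct example_rq_qos *rqos;

	for (rqos = chain; rqos; rqos = rqos->next)
		if (rqos->ops->done)
			rqos->ops->done(rqos, rq);
}

That indirection is why the hot paths above need only one call per event rather than one call per policy.
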
diff --combined block/blk-mq-tag.c
@@@ -23,6 -23,9 +23,9 @@@ bool blk_mq_has_free_tags(struct blk_mq
  
  /*
   * If a previously inactive queue goes active, bump the active user count.
+  * We need to do this before trying to allocate a driver tag, so that even
+  * if the first attempt to get a tag fails, the other shared-tag users will
+  * still reserve budget for this queue.
   */
  bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
  {
@@@ -271,7 -274,7 +274,7 @@@ static bool bt_tags_iter(struct sbitma
        * test and set the bit before assigning ->rqs[].
         */
        rq = tags->rqs[bitnr];
 -      if (rq && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
 +      if (rq && blk_mq_request_started(rq))
                iter_data->fn(rq, iter_data->data, reserved);
  
        return true;
@@@ -399,8 -402,6 +402,6 @@@ int blk_mq_tag_update_depth(struct blk_
        if (tdepth <= tags->nr_reserved_tags)
                return -EINVAL;
  
-       tdepth -= tags->nr_reserved_tags;
        /*
         * If we are allowed to grow beyond the original size, allocate
         * a new set of tags before freeing the old one.
                if (tdepth > 16 * BLKDEV_MAX_RQ)
                        return -EINVAL;
  
-               new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0);
+               new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
+                               tags->nr_reserved_tags);
                if (!new)
                        return -ENOMEM;
                ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
                 * Don't need (or can't) update reserved tags here, they
                 * remain static and should never need resizing.
                 */
-               sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+               sbitmap_queue_resize(&tags->bitmap_tags,
+                               tdepth - tags->nr_reserved_tags);
        }
  
        return 0;
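
A quick worked example of the blk_mq_tag_update_depth() fix above, assuming a hardware queue with 2 reserved tags being resized to tdepth = 66: the old code first subtracted the reserved tags (leaving 64) and then, on the grow path, allocated the new map with 64 tags and 0 reserved, silently dropping the reserved tags; the reworked code allocates the full 66 tags with the 2 reserved ones preserved, and the in-place resize path sizes only the normal tag bitmap, to 66 - 2 = 64.
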
diff --combined block/blk-mq.c
@@@ -34,8 -34,8 +34,8 @@@
  #include "blk-mq-debugfs.h"
  #include "blk-mq-tag.h"
  #include "blk-stat.h"
- #include "blk-wbt.h"
  #include "blk-mq-sched.h"
+ #include "blk-rq-qos.h"
  
  static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
  static void blk_mq_poll_stats_start(struct request_queue *q);
@@@ -285,7 -285,7 +285,7 @@@ static struct request *blk_mq_rq_ctx_in
                rq->tag = -1;
                rq->internal_tag = tag;
        } else {
-               if (blk_mq_tag_busy(data->hctx)) {
+               if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
                        rq_flags = RQF_MQ_INFLIGHT;
                        atomic_inc(&data->hctx->nr_active);
                }
@@@ -367,6 -367,8 +367,8 @@@ static struct request *blk_mq_get_reque
                if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
                    !(data->flags & BLK_MQ_REQ_RESERVED))
                        e->type->ops.mq.limit_depth(op, data);
+       } else {
+               blk_mq_tag_busy(data->hctx);
        }
  
        tag = blk_mq_get_tag(data);
@@@ -504,7 -506,7 +506,7 @@@ void blk_mq_free_request(struct reques
        if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
                laptop_io_completion(q->backing_dev_info);
  
-       wbt_done(q->rq_wb, rq);
+       rq_qos_done(q, rq);
  
        if (blk_rq_rl(rq))
                blk_put_rl(blk_rq_rl(rq));
@@@ -527,7 -529,7 +529,7 @@@ inline void __blk_mq_end_request(struc
        blk_account_io_done(rq, now);
  
        if (rq->end_io) {
-               wbt_done(rq->q->rq_wb, rq);
+               rq_qos_done(rq->q, rq);
                rq->end_io(rq, error);
        } else {
                if (unlikely(blk_bidi_rq(rq)))
@@@ -558,8 -560,10 +560,8 @@@ static void __blk_mq_complete_request(s
        bool shared = false;
        int cpu;
  
 -      if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) !=
 -                      MQ_RQ_IN_FLIGHT)
 +      if (!blk_mq_mark_complete(rq))
                return;
 -
        if (rq->internal_tag != -1)
                blk_mq_sched_completed_request(rq);
  
@@@ -639,7 -643,7 +641,7 @@@ void blk_mq_start_request(struct reques
                rq->throtl_size = blk_rq_sectors(rq);
  #endif
                rq->rq_flags |= RQF_STATS;
-               wbt_issue(q->rq_wb, rq);
+               rq_qos_issue(q, rq);
        }
  
        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
@@@ -665,7 -669,7 +667,7 @@@ static void __blk_mq_requeue_request(st
        blk_mq_put_driver_tag(rq);
  
        trace_block_rq_requeue(q, rq);
-       wbt_requeue(q->rq_wb, rq);
+       rq_qos_requeue(q, rq);
  
        if (blk_mq_request_started(rq)) {
                WRITE_ONCE(rq->state, MQ_RQ_IDLE);
@@@ -962,16 -966,14 +964,14 @@@ static inline unsigned int queued_to_in
        return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
  }
  
- bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
-                          bool wait)
+ bool blk_mq_get_driver_tag(struct request *rq)
  {
        struct blk_mq_alloc_data data = {
                .q = rq->q,
                .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
-               .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+               .flags = BLK_MQ_REQ_NOWAIT,
        };
-       might_sleep_if(wait);
+       bool shared;
  
        if (rq->tag != -1)
                goto done;
        if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
                data.flags |= BLK_MQ_REQ_RESERVED;
  
+       shared = blk_mq_tag_busy(data.hctx);
        rq->tag = blk_mq_get_tag(&data);
        if (rq->tag >= 0) {
-               if (blk_mq_tag_busy(data.hctx)) {
+               if (shared) {
                        rq->rq_flags |= RQF_MQ_INFLIGHT;
                        atomic_inc(&data.hctx->nr_active);
                }
        }
  
  done:
-       if (hctx)
-               *hctx = data.hctx;
        return rq->tag != -1;
  }
  
@@@ -1001,7 -1002,10 +1000,10 @@@ static int blk_mq_dispatch_wake(wait_qu
  
        hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
  
+       spin_lock(&hctx->dispatch_wait_lock);
        list_del_init(&wait->entry);
+       spin_unlock(&hctx->dispatch_wait_lock);
        blk_mq_run_hw_queue(hctx, true);
        return 1;
  }
   * restart. For both cases, take care to check the condition again after
   * marking us as waiting.
   */
- static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
+ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
                                 struct request *rq)
  {
-       struct blk_mq_hw_ctx *this_hctx = *hctx;
-       struct sbq_wait_state *ws;
+       struct wait_queue_head *wq;
        wait_queue_entry_t *wait;
        bool ret;
  
-       if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
-               if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
-                       set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
+       if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
+               if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+                       set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
  
                /*
                 * It's possible that a tag was freed in the window between the
                 * Don't clear RESTART here, someone else could have set it.
                 * At most this will cost an extra queue run.
                 */
-               return blk_mq_get_driver_tag(rq, hctx, false);
+               return blk_mq_get_driver_tag(rq);
        }
  
-       wait = &this_hctx->dispatch_wait;
+       wait = &hctx->dispatch_wait;
        if (!list_empty_careful(&wait->entry))
                return false;
  
-       spin_lock(&this_hctx->lock);
+       wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
+       spin_lock_irq(&wq->lock);
+       spin_lock(&hctx->dispatch_wait_lock);
        if (!list_empty(&wait->entry)) {
-               spin_unlock(&this_hctx->lock);
+               spin_unlock(&hctx->dispatch_wait_lock);
+               spin_unlock_irq(&wq->lock);
                return false;
        }
  
-       ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
-       add_wait_queue(&ws->wait, wait);
+       wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+       __add_wait_queue(wq, wait);
  
        /*
         * It's possible that a tag was freed in the window between the
         * allocation failure and adding the hardware queue to the wait
         * queue.
         */
-       ret = blk_mq_get_driver_tag(rq, hctx, false);
+       ret = blk_mq_get_driver_tag(rq);
        if (!ret) {
-               spin_unlock(&this_hctx->lock);
+               spin_unlock(&hctx->dispatch_wait_lock);
+               spin_unlock_irq(&wq->lock);
                return false;
        }
  
         * We got a tag, remove ourselves from the wait queue to ensure
         * someone else gets the wakeup.
         */
-       spin_lock_irq(&ws->wait.lock);
        list_del_init(&wait->entry);
-       spin_unlock_irq(&ws->wait.lock);
-       spin_unlock(&this_hctx->lock);
+       spin_unlock(&hctx->dispatch_wait_lock);
+       spin_unlock_irq(&wq->lock);
  
        return true;
  }
  
+ #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
+ #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
+ /*
+  * Update dispatch busy with the Exponential Weighted Moving Average (EWMA):
+  * - EWMA is a simple way to compute a running average value
+  * - a weight of 7/8 vs 1/8 is applied so that the value decays exponentially
+  * - a factor of 4 keeps the result from rounding down to 0 too quickly; the
+  *   exact factor doesn't matter because the EWMA decays exponentially
+  */
+ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
+ {
+       unsigned int ewma;
+       if (hctx->queue->elevator)
+               return;
+       ewma = hctx->dispatch_busy;
+       if (!ewma && !busy)
+               return;
+       ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
+       if (busy)
+               ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
+       ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
+       hctx->dispatch_busy = ewma;
+ }
  #define BLK_MQ_RESOURCE_DELAY 3               /* ms units */
  
  /*
@@@ -1103,7 -1139,7 +1137,7 @@@ bool blk_mq_dispatch_rq_list(struct req
                if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
                        break;
  
-               if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+               if (!blk_mq_get_driver_tag(rq)) {
                        /*
                         * The initial allocation attempt failed, so we need to
                         * rerun the hardware queue when a tag is freed. The
                         * before we add this entry back on the dispatch list,
                         * we'll re-run it below.
                         */
-                       if (!blk_mq_mark_tag_wait(&hctx, rq)) {
+                       if (!blk_mq_mark_tag_wait(hctx, rq)) {
                                blk_mq_put_dispatch_budget(hctx);
                                /*
                                 * For non-shared tags, the RESTART check
                        bd.last = true;
                else {
                        nxt = list_first_entry(list, struct request, queuelist);
-                       bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
+                       bd.last = !blk_mq_get_driver_tag(nxt);
                }
  
                ret = q->mq_ops->queue_rq(hctx, &bd);
                else if (needs_restart && (ret == BLK_STS_RESOURCE))
                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
  
+               blk_mq_update_dispatch_busy(hctx, true);
                return false;
-       }
+       } else
+               blk_mq_update_dispatch_busy(hctx, false);
  
        /*
         * If the host/device is unable to accept more work, inform the
@@@ -1542,19 -1580,19 +1578,19 @@@ void blk_mq_insert_requests(struct blk_
                            struct list_head *list)
  
  {
+       struct request *rq;
        /*
         * preemption doesn't flush plug list, so it's possible ctx->cpu is
         * offline now
         */
-       spin_lock(&ctx->lock);
-       while (!list_empty(list)) {
-               struct request *rq;
-               rq = list_first_entry(list, struct request, queuelist);
+       list_for_each_entry(rq, list, queuelist) {
                BUG_ON(rq->mq_ctx != ctx);
-               list_del_init(&rq->queuelist);
-               __blk_mq_insert_req_list(hctx, rq, false);
+               trace_block_rq_insert(hctx->queue, rq);
        }
+       spin_lock(&ctx->lock);
+       list_splice_tail_init(list, &ctx->rq_list);
        blk_mq_hctx_mark_pending(hctx, ctx);
        spin_unlock(&ctx->lock);
  }
@@@ -1657,13 -1695,16 +1693,16 @@@ static blk_status_t __blk_mq_issue_dire
        ret = q->mq_ops->queue_rq(hctx, &bd);
        switch (ret) {
        case BLK_STS_OK:
+               blk_mq_update_dispatch_busy(hctx, false);
                *cookie = new_cookie;
                break;
        case BLK_STS_RESOURCE:
        case BLK_STS_DEV_RESOURCE:
+               blk_mq_update_dispatch_busy(hctx, true);
                __blk_mq_requeue_request(rq);
                break;
        default:
+               blk_mq_update_dispatch_busy(hctx, false);
                *cookie = BLK_QC_T_NONE;
                break;
        }
@@@ -1698,7 -1739,7 +1737,7 @@@ static blk_status_t __blk_mq_try_issue_
        if (!blk_mq_get_dispatch_budget(hctx))
                goto insert;
  
-       if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+       if (!blk_mq_get_driver_tag(rq)) {
                blk_mq_put_dispatch_budget(hctx);
                goto insert;
        }
@@@ -1746,6 -1787,27 +1785,27 @@@ blk_status_t blk_mq_request_issue_direc
        return ret;
  }
  
+ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+               struct list_head *list)
+ {
+       while (!list_empty(list)) {
+               blk_status_t ret;
+               struct request *rq = list_first_entry(list, struct request,
+                               queuelist);
+               list_del_init(&rq->queuelist);
+               ret = blk_mq_request_issue_directly(rq);
+               if (ret != BLK_STS_OK) {
+                       if (ret == BLK_STS_RESOURCE ||
+                                       ret == BLK_STS_DEV_RESOURCE) {
+                               list_add(&rq->queuelist, list);
+                               break;
+                       }
+                       blk_mq_end_request(rq, ret);
+               }
+       }
+ }
  static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
  {
        const int is_sync = op_is_sync(bio->bi_opf);
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
        blk_qc_t cookie;
-       unsigned int wb_acct;
  
        blk_queue_bounce(q, &bio);
  
        if (blk_mq_sched_bio_merge(q, bio))
                return BLK_QC_T_NONE;
  
-       wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+       rq_qos_throttle(q, bio, NULL);
  
        trace_block_getrq(q, bio, bio->bi_opf);
  
        rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
        if (unlikely(!rq)) {
-               __wbt_done(q->rq_wb, wb_acct);
+               rq_qos_cleanup(q, bio);
                if (bio->bi_opf & REQ_NOWAIT)
                        bio_wouldblock_error(bio);
                return BLK_QC_T_NONE;
        }
  
-       wbt_track(rq, wb_acct);
+       rq_qos_track(q, rq, bio);
  
        cookie = request_to_qc_t(data.hctx, rq);
  
                        blk_mq_try_issue_directly(data.hctx, same_queue_rq,
                                        &cookie);
                }
-       } else if (q->nr_hw_queues > 1 && is_sync) {
+       } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
+                       !data.hctx->dispatch_busy)) {
                blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
                blk_mq_try_issue_directly(data.hctx, rq, &cookie);
@@@ -2146,6 -2208,7 +2206,7 @@@ static int blk_mq_init_hctx(struct requ
  
        hctx->nr_ctx = 0;
  
+       spin_lock_init(&hctx->dispatch_wait_lock);
        init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
        INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
  
@@@ -2331,15 -2394,10 +2392,10 @@@ static void queue_set_hctx_shared(struc
        int i;
  
        queue_for_each_hw_ctx(q, hctx, i) {
-               if (shared) {
-                       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-                               atomic_inc(&q->shared_hctx_restart);
+               if (shared)
                        hctx->flags |= BLK_MQ_F_TAG_SHARED;
-               } else {
-                       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-                               atomic_dec(&q->shared_hctx_restart);
+               else
                        hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
-               }
        }
  }
  
@@@ -2370,7 -2428,6 +2426,6 @@@ static void blk_mq_del_queue_tag_set(st
                blk_mq_update_tag_set_depth(set, false);
        }
        mutex_unlock(&set->tag_list_lock);
-       synchronize_rcu();
        INIT_LIST_HEAD(&q->tag_set_list);
  }
  
@@@ -2685,7 -2742,6 +2740,6 @@@ static int blk_mq_alloc_rq_maps(struct 
  static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
  {
        if (set->ops->map_queues) {
-               int cpu;
                /*
                 * transport .map_queues is usually done in the following
                 * way:
                 * killing stale mapping since one CPU may not be mapped
                 * to any hw queue.
                 */
-               for_each_possible_cpu(cpu)
-                       set->mq_map[cpu] = 0;
+               blk_mq_clear_mq_map(set);
  
                return set->ops->map_queues(set);
        } else
  /*
   * Alloc a tag set to be associated with one or more request queues.
   * May fail with EINVAL for various error conditions. May adjust the
-  * requested depth down, if if it too large. In that case, the set
+  * requested depth down, if it's too large. In that case, the set
   * value will be stored in set->queue_depth.
   */
  int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
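
To get a feel for the new hctx->dispatch_busy value, here is a standalone userspace sketch that simply mirrors the arithmetic of blk_mq_update_dispatch_busy() above (weight 8, factor 4); it is an illustration only and omits the kernel helper's early return when an I/O scheduler is attached. Starting from idle, repeated busy updates climb towards the fixed point of 16, while idle updates decay the value by roughly 1/8 per step; blk_mq_make_request() then treats a zero dispatch_busy (with no elevator) as a green light to issue the request directly.

#include <stdio.h>

/* Same update rule as blk_mq_update_dispatch_busy():
 * ewma = (ewma * (WEIGHT - 1) + (busy ? 1 << FACTOR : 0)) / WEIGHT */
#define EWMA_WEIGHT	8
#define EWMA_FACTOR	4

static unsigned int ewma_update(unsigned int ewma, int busy)
{
	if (!ewma && !busy)
		return 0;
	ewma *= EWMA_WEIGHT - 1;
	if (busy)
		ewma += 1 << EWMA_FACTOR;
	return ewma / EWMA_WEIGHT;
}

int main(void)
{
	unsigned int e = 0;
	int i;

	for (i = 0; i < 5; i++) {
		e = ewma_update(e, 1);
		printf("busy update %d -> dispatch_busy = %u\n", i + 1, e);
	}
	for (i = 0; i < 5; i++) {
		e = ewma_update(e, 0);
		printf("idle update %d -> dispatch_busy = %u\n", i + 1, e);
	}
	return 0;
}
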
@@@ -298,8 -298,7 +298,8 @@@ static void reset_bdev(struct zram *zra
        zram->backing_dev = NULL;
        zram->old_block_size = 0;
        zram->bdev = NULL;
 -
 +      zram->disk->queue->backing_dev_info->capabilities |=
 +                              BDI_CAP_SYNCHRONOUS_IO;
        kvfree(zram->bitmap);
        zram->bitmap = NULL;
  }
@@@ -401,18 -400,6 +401,18 @@@ static ssize_t backing_dev_store(struc
        zram->backing_dev = backing_dev;
        zram->bitmap = bitmap;
        zram->nr_pages = nr_pages;
 +      /*
 +       * With the writeback feature, zram does asynchronous IO, so it is no
 +       * longer a synchronous device and the synchronous io flag must be
 +       * cleared. Otherwise, the upper layer (e.g., swap) could wait for IO
 +       * completion instead of just submitting and returning, which makes
 +       * the system sluggish.
 +       * Furthermore, when the IO function returns (e.g., swap_readpage), the
 +       * upper layer assumes the IO is done and may free the page while the
 +       * IO is still in flight, which can end in a use-after-free once the IO
 +       * really completes.
 +       */
 +      zram->disk->queue->backing_dev_info->capabilities &=
 +                      ~BDI_CAP_SYNCHRONOUS_IO;
        up_write(&zram->init_lock);
  
        pr_info("setup backing device %s\n", file_name);
@@@ -1287,17 -1274,16 +1287,16 @@@ static void zram_bio_discard(struct zra
   * Returns 1 if IO request was successfully submitted.
   */
  static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
-                       int offset, bool is_write, struct bio *bio)
+                       int offset, unsigned int op, struct bio *bio)
  {
        unsigned long start_time = jiffies;
-       int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
        struct request_queue *q = zram->disk->queue;
        int ret;
  
-       generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
+       generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT,
                        &zram->disk->part0);
  
-       if (!is_write) {
+       if (!op_is_write(op)) {
                atomic64_inc(&zram->stats.num_reads);
                ret = zram_bvec_read(zram, bvec, index, offset, bio);
                flush_dcache_page(bvec->bv_page);
                ret = zram_bvec_write(zram, bvec, index, offset, bio);
        }
  
-       generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
+       generic_end_io_acct(q, op, &zram->disk->part0, start_time);
  
        zram_slot_lock(zram, index);
        zram_accessed(zram, index);
        zram_slot_unlock(zram, index);
  
        if (unlikely(ret < 0)) {
-               if (!is_write)
+               if (!op_is_write(op))
                        atomic64_inc(&zram->stats.failed_reads);
                else
                        atomic64_inc(&zram->stats.failed_writes);
@@@ -1351,7 -1337,7 +1350,7 @@@ static void __zram_make_request(struct 
                        bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
                                                        unwritten);
                        if (zram_bvec_rw(zram, &bv, index, offset,
-                                       op_is_write(bio_op(bio)), bio) < 0)
+                                        bio_op(bio), bio) < 0)
                                goto out;
  
                        bv.bv_offset += bv.bv_len;
@@@ -1403,7 -1389,7 +1402,7 @@@ static void zram_slot_free_notify(struc
  }
  
  static int zram_rw_page(struct block_device *bdev, sector_t sector,
-                      struct page *page, bool is_write)
+                      struct page *page, unsigned int op)
  {
        int offset, ret;
        u32 index;
        bv.bv_len = PAGE_SIZE;
        bv.bv_offset = 0;
  
-       ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
+       ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
  out:
        /*
         * If I/O fails, just return error(ie, non-zero) without
  
        switch (ret) {
        case 0:
-               page_endio(page, is_write, 0);
+               page_endio(page, op_is_write(op), 0);
                break;
        case 1:
                ret = 0;
@@@ -474,7 -474,7 +474,7 @@@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue
  
  bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
  {
-       if (ctrl->opts->max_reconnects != -1 &&
+       if (ctrl->opts->max_reconnects == -1 ||
            ctrl->nr_reconnects < ctrl->opts->max_reconnects)
                return true;
  
@@@ -539,18 -539,14 +539,18 @@@ static struct nvmf_transport_ops *nvmf_
  /*
   * For something we're not in a state to send to the device the default action
   * is to busy it and retry it after the controller state is recovered.  However,
 - * anything marked for failfast or nvme multipath is immediately failed.
 + * if the controller is deleting or if anything is marked for failfast or
 + * nvme multipath it is immediately failed.
   *
   * Note: commands used to initialize the controller will be marked for failfast.
   * Note: nvme cli/ioctl commands are marked for failfast.
   */
 -blk_status_t nvmf_fail_nonready_command(struct request *rq)
 +blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
 +              struct request *rq)
  {
 -      if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
 +      if (ctrl->state != NVME_CTRL_DELETING &&
 +          ctrl->state != NVME_CTRL_DEAD &&
 +          !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
                return BLK_STS_RESOURCE;
        nvme_req(rq)->status = NVME_SC_ABORT_REQ;
        return BLK_STS_IOERR;
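
For the nvmf_should_reconnect() change above, the interesting case is max_reconnects == -1, which the fabrics option parsing uses when ctrl_loss_tmo is negative, i.e. "never give up reconnecting": with the old '!= -1 &&' test that case evaluated to false and the controller was never reconnected, whereas the new '== -1 ||' test returns true immediately and only compares nr_reconnects against the limit when a finite one was actually configured.
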
diff --combined drivers/nvme/host/fc.c
@@@ -1737,6 -1737,7 +1737,7 @@@ nvme_fc_init_request(struct blk_mq_tag_
        int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
        struct nvme_fc_queue *queue = &ctrl->queues[queue_idx];
  
+       nvme_req(rq)->ctrl = &ctrl->ctrl;
        return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
  }
  
@@@ -2272,7 -2273,7 +2273,7 @@@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *
  
        if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE ||
            !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
 -              return nvmf_fail_nonready_command(rq);
 +              return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
  
        ret = nvme_setup_cmd(ns, rq, sqe);
        if (ret)
diff --combined drivers/nvme/host/rdma.c
  
  #define NVME_RDMA_MAX_SEGMENTS                256
  
- #define NVME_RDMA_MAX_INLINE_SEGMENTS 1
+ #define NVME_RDMA_MAX_INLINE_SEGMENTS 4
  
  struct nvme_rdma_device {
        struct ib_device        *dev;
        struct ib_pd            *pd;
        struct kref             ref;
        struct list_head        entry;
+       unsigned int            num_inline_segments;
  };
  
  struct nvme_rdma_qe {
@@@ -117,6 -118,7 +118,7 @@@ struct nvme_rdma_ctrl 
        struct sockaddr_storage src_addr;
  
        struct nvme_ctrl        ctrl;
+       bool                    use_inline_data;
  };
  
  static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
@@@ -249,7 -251,7 +251,7 @@@ static int nvme_rdma_create_qp(struct n
        /* +1 for drain */
        init_attr.cap.max_recv_wr = queue->queue_size + 1;
        init_attr.cap.max_recv_sge = 1;
-       init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
+       init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
        init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        init_attr.qp_type = IB_QPT_RC;
        init_attr.send_cq = queue->ib_cq;
@@@ -286,6 -288,7 +288,7 @@@ static int nvme_rdma_init_request(struc
        struct ib_device *ibdev = dev->dev;
        int ret;
  
+       nvme_req(rq)->ctrl = &ctrl->ctrl;
        ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
                        DMA_TO_DEVICE);
        if (ret)
@@@ -374,6 -377,8 +377,8 @@@ nvme_rdma_find_get_device(struct rdma_c
                goto out_free_pd;
        }
  
+       ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
+                                       ndev->dev->attrs.max_sge - 1);
        list_add(&ndev->entry, &device_list);
  out_unlock:
        mutex_unlock(&device_list_mutex);
@@@ -868,6 -873,31 +873,31 @@@ out_free_io_queues
        return ret;
  }
  
+ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
+               bool remove)
+ {
+       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_rdma_stop_queue(&ctrl->queues[0]);
+       blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
+                       &ctrl->ctrl);
+       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_rdma_destroy_admin_queue(ctrl, remove);
+ }
+ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
+               bool remove)
+ {
+       if (ctrl->ctrl.queue_count > 1) {
+               nvme_stop_queues(&ctrl->ctrl);
+               nvme_rdma_stop_io_queues(ctrl);
+               blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
+                               &ctrl->ctrl);
+               if (remove)
+                       nvme_start_queues(&ctrl->ctrl);
+               nvme_rdma_destroy_io_queues(ctrl, remove);
+       }
+ }
  static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
  {
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@@ -912,21 -942,44 +942,44 @@@ static void nvme_rdma_reconnect_or_remo
        }
  }
  
- static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
  {
-       struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
-                       struct nvme_rdma_ctrl, reconnect_work);
+       int ret = -EINVAL;
        bool changed;
-       int ret;
  
-       ++ctrl->ctrl.nr_reconnects;
-       ret = nvme_rdma_configure_admin_queue(ctrl, false);
+       ret = nvme_rdma_configure_admin_queue(ctrl, new);
        if (ret)
-               goto requeue;
+               return ret;
+       if (ctrl->ctrl.icdoff) {
+               dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
+               goto destroy_admin;
+       }
+       if (!(ctrl->ctrl.sgls & (1 << 2))) {
+               dev_err(ctrl->ctrl.device,
+                       "Mandatory keyed sgls are not supported!\n");
+               goto destroy_admin;
+       }
+       if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
+               dev_warn(ctrl->ctrl.device,
+                       "queue_size %zu > ctrl sqsize %u, clamping down\n",
+                       ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
+       }
+       if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
+               dev_warn(ctrl->ctrl.device,
+                       "sqsize %u > ctrl maxcmd %u, clamping down\n",
+                       ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
+               ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
+       }
+       if (ctrl->ctrl.sgls & (1 << 20))
+               ctrl->use_inline_data = true;
  
        if (ctrl->ctrl.queue_count > 1) {
-               ret = nvme_rdma_configure_io_queues(ctrl, false);
+               ret = nvme_rdma_configure_io_queues(ctrl, new);
                if (ret)
                        goto destroy_admin;
        }
        if (!changed) {
                /* state change failure is ok if we're in DELETING state */
                WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
-               return;
+               ret = -EINVAL;
+               goto destroy_io;
        }
  
        nvme_start_ctrl(&ctrl->ctrl);
+       return 0;
+ destroy_io:
+       if (ctrl->ctrl.queue_count > 1)
+               nvme_rdma_destroy_io_queues(ctrl, new);
+ destroy_admin:
+       nvme_rdma_stop_queue(&ctrl->queues[0]);
+       nvme_rdma_destroy_admin_queue(ctrl, new);
+       return ret;
+ }
+ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+ {
+       struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
+                       struct nvme_rdma_ctrl, reconnect_work);
+       ++ctrl->ctrl.nr_reconnects;
+       if (nvme_rdma_setup_ctrl(ctrl, false))
+               goto requeue;
  
        dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
                        ctrl->ctrl.nr_reconnects);
  
        return;
  
- destroy_admin:
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       nvme_rdma_destroy_admin_queue(ctrl, false);
  requeue:
        dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
                        ctrl->ctrl.nr_reconnects);
@@@ -962,27 -1033,9 +1033,9 @@@ static void nvme_rdma_error_recovery_wo
                        struct nvme_rdma_ctrl, err_work);
  
        nvme_stop_keep_alive(&ctrl->ctrl);
-       if (ctrl->ctrl.queue_count > 1) {
-               nvme_stop_queues(&ctrl->ctrl);
-               nvme_rdma_stop_io_queues(ctrl);
-               blk_mq_tagset_busy_iter(&ctrl->tag_set,
-                                       nvme_cancel_request, &ctrl->ctrl);
-               nvme_rdma_destroy_io_queues(ctrl, false);
-       }
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
-                               nvme_cancel_request, &ctrl->ctrl);
-       nvme_rdma_destroy_admin_queue(ctrl, false);
-       /*
-        * queues are not a live anymore, so restart the queues to fail fast
-        * new IO
-        */
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_rdma_teardown_io_queues(ctrl, false);
        nvme_start_queues(&ctrl->ctrl);
+       nvme_rdma_teardown_admin_queue(ctrl, false);
  
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we're in DELETING state */
@@@ -1090,19 -1143,27 +1143,27 @@@ static int nvme_rdma_set_sg_null(struc
  }
  
  static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
-               struct nvme_rdma_request *req, struct nvme_command *c)
+               struct nvme_rdma_request *req, struct nvme_command *c,
+               int count)
  {
        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+       struct scatterlist *sgl = req->sg_table.sgl;
+       struct ib_sge *sge = &req->sge[1];
+       u32 len = 0;
+       int i;
  
-       req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
-       req->sge[1].length = sg_dma_len(req->sg_table.sgl);
-       req->sge[1].lkey = queue->device->pd->local_dma_lkey;
+       for (i = 0; i < count; i++, sgl++, sge++) {
+               sge->addr = sg_dma_address(sgl);
+               sge->length = sg_dma_len(sgl);
+               sge->lkey = queue->device->pd->local_dma_lkey;
+               len += sge->length;
+       }
  
        sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
-       sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
+       sg->length = cpu_to_le32(len);
        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
  
-       req->num_sge++;
+       req->num_sge += count;
        return 0;
  }
  
@@@ -1195,15 -1256,16 +1256,16 @@@ static int nvme_rdma_map_data(struct nv
                goto out_free_table;
        }
  
-       if (count == 1) {
+       if (count <= dev->num_inline_segments) {
                if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
+                   queue->ctrl->use_inline_data &&
                    blk_rq_payload_bytes(rq) <=
                                nvme_rdma_inline_data_size(queue)) {
-                       ret = nvme_rdma_map_sg_inline(queue, req, c);
+                       ret = nvme_rdma_map_sg_inline(queue, req, c, count);
                        goto out;
                }
  
-               if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
+               if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
                        ret = nvme_rdma_map_sg_single(queue, req, c);
                        goto out;
                }
@@@ -1574,6 -1636,7 +1636,7 @@@ static int nvme_rdma_cm_handler(struct 
        case RDMA_CM_EVENT_CONNECT_ERROR:
        case RDMA_CM_EVENT_UNREACHABLE:
                nvme_rdma_destroy_queue_ib(queue);
+               /* fall through */
        case RDMA_CM_EVENT_ADDR_ERROR:
                dev_dbg(queue->ctrl->ctrl.device,
                        "CM error event %d\n", ev->event);
@@@ -1639,7 -1702,7 +1702,7 @@@ static blk_status_t nvme_rdma_queue_rq(
        WARN_ON_ONCE(rq->tag < 0);
  
        if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
 -              return nvmf_fail_nonready_command(rq);
 +              return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
  
        dev = queue->device->dev;
        ib_dma_sync_single_for_cpu(dev, sqe->dma,
@@@ -1736,25 -1799,12 +1799,12 @@@ static const struct blk_mq_ops nvme_rdm
  
  static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
  {
-       if (ctrl->ctrl.queue_count > 1) {
-               nvme_stop_queues(&ctrl->ctrl);
-               nvme_rdma_stop_io_queues(ctrl);
-               blk_mq_tagset_busy_iter(&ctrl->tag_set,
-                                       nvme_cancel_request, &ctrl->ctrl);
-               nvme_rdma_destroy_io_queues(ctrl, shutdown);
-       }
+       nvme_rdma_teardown_io_queues(ctrl, shutdown);
        if (shutdown)
                nvme_shutdown_ctrl(&ctrl->ctrl);
        else
                nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
-                               nvme_cancel_request, &ctrl->ctrl);
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
-       nvme_rdma_destroy_admin_queue(ctrl, shutdown);
+       nvme_rdma_teardown_admin_queue(ctrl, shutdown);
  }
  
  static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
@@@ -1766,8 -1816,6 +1816,6 @@@ static void nvme_rdma_reset_ctrl_work(s
  {
        struct nvme_rdma_ctrl *ctrl =
                container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
-       int ret;
-       bool changed;
  
        nvme_stop_ctrl(&ctrl->ctrl);
        nvme_rdma_shutdown_ctrl(ctrl, false);
                return;
        }
  
-       ret = nvme_rdma_configure_admin_queue(ctrl, false);
-       if (ret)
+       if (nvme_rdma_setup_ctrl(ctrl, false))
                goto out_fail;
  
-       if (ctrl->ctrl.queue_count > 1) {
-               ret = nvme_rdma_configure_io_queues(ctrl, false);
-               if (ret)
-                       goto out_fail;
-       }
-       changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-       if (!changed) {
-               /* state change failure is ok if we're in DELETING state */
-               WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
-               return;
-       }
-       nvme_start_ctrl(&ctrl->ctrl);
        return;
  
  out_fail:
@@@ -1959,49 -1991,10 +1991,10 @@@ static struct nvme_ctrl *nvme_rdma_crea
        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
        WARN_ON_ONCE(!changed);
  
-       ret = nvme_rdma_configure_admin_queue(ctrl, true);
+       ret = nvme_rdma_setup_ctrl(ctrl, true);
        if (ret)
                goto out_uninit_ctrl;
  
-       /* sanity check icdoff */
-       if (ctrl->ctrl.icdoff) {
-               dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
-               ret = -EINVAL;
-               goto out_remove_admin_queue;
-       }
-       /* sanity check keyed sgls */
-       if (!(ctrl->ctrl.sgls & (1 << 2))) {
-               dev_err(ctrl->ctrl.device,
-                       "Mandatory keyed sgls are not supported!\n");
-               ret = -EINVAL;
-               goto out_remove_admin_queue;
-       }
-       /* only warn if argument is too large here, will clamp later */
-       if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
-               dev_warn(ctrl->ctrl.device,
-                       "queue_size %zu > ctrl sqsize %u, clamping down\n",
-                       opts->queue_size, ctrl->ctrl.sqsize + 1);
-       }
-       /* warn if maxcmd is lower than sqsize+1 */
-       if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
-               dev_warn(ctrl->ctrl.device,
-                       "sqsize %u > ctrl maxcmd %u, clamping down\n",
-                       ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
-               ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
-       }
-       if (opts->nr_io_queues) {
-               ret = nvme_rdma_configure_io_queues(ctrl, true);
-               if (ret)
-                       goto out_remove_admin_queue;
-       }
-       changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-       WARN_ON_ONCE(!changed);
        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
  
        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
        mutex_unlock(&nvme_rdma_ctrl_mutex);
  
-       nvme_start_ctrl(&ctrl->ctrl);
        return &ctrl->ctrl;
  
- out_remove_admin_queue:
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       nvme_rdma_destroy_admin_queue(ctrl, true);
  out_uninit_ctrl:
        nvme_uninit_ctrl(&ctrl->ctrl);
        nvme_put_ctrl(&ctrl->ctrl);
diff --combined drivers/nvme/target/configfs.c
@@@ -218,6 -218,35 +218,35 @@@ static ssize_t nvmet_addr_trsvcid_store
  
  CONFIGFS_ATTR(nvmet_, addr_trsvcid);
  
+ static ssize_t nvmet_param_inline_data_size_show(struct config_item *item,
+               char *page)
+ {
+       struct nvmet_port *port = to_nvmet_port(item);
+       return snprintf(page, PAGE_SIZE, "%d\n", port->inline_data_size);
+ }
+ static ssize_t nvmet_param_inline_data_size_store(struct config_item *item,
+               const char *page, size_t count)
+ {
+       struct nvmet_port *port = to_nvmet_port(item);
+       int ret;
+       if (port->enabled) {
+               pr_err("Cannot modify inline_data_size while port enabled\n");
+               pr_err("Disable the port before modifying\n");
+               return -EACCES;
+       }
+       ret = kstrtoint(page, 0, &port->inline_data_size);
+       if (ret) {
+               pr_err("Invalid value '%s' for inline_data_size\n", page);
+               return -EINVAL;
+       }
+       return count;
+ }
+ CONFIGFS_ATTR(nvmet_, param_inline_data_size);
  static ssize_t nvmet_addr_trtype_show(struct config_item *item,
                char *page)
  {
@@@ -282,7 -311,6 +311,7 @@@ static ssize_t nvmet_ns_device_path_sto
  {
        struct nvmet_ns *ns = to_nvmet_ns(item);
        struct nvmet_subsys *subsys = ns->subsys;
 +      size_t len;
        int ret;
  
        mutex_lock(&subsys->lock);
        if (ns->enabled)
                goto out_unlock;
  
 -      kfree(ns->device_path);
 +      ret = -EINVAL;
 +      len = strcspn(page, "\n");
 +      if (!len)
 +              goto out_unlock;
  
 +      kfree(ns->device_path);
        ret = -ENOMEM;
 -      ns->device_path = kstrndup(page, strcspn(page, "\n"), GFP_KERNEL);
 +      ns->device_path = kstrndup(page, len, GFP_KERNEL);
        if (!ns->device_path)
                goto out_unlock;
  
@@@ -387,6 -411,39 +416,39 @@@ out_unlock
  
  CONFIGFS_ATTR(nvmet_ns_, device_nguid);
  
+ static ssize_t nvmet_ns_ana_grpid_show(struct config_item *item, char *page)
+ {
+       return sprintf(page, "%u\n", to_nvmet_ns(item)->anagrpid);
+ }
+ static ssize_t nvmet_ns_ana_grpid_store(struct config_item *item,
+               const char *page, size_t count)
+ {
+       struct nvmet_ns *ns = to_nvmet_ns(item);
+       u32 oldgrpid, newgrpid;
+       int ret;
+       ret = kstrtou32(page, 0, &newgrpid);
+       if (ret)
+               return ret;
+       if (newgrpid < 1 || newgrpid > NVMET_MAX_ANAGRPS)
+               return -EINVAL;
+       down_write(&nvmet_ana_sem);
+       oldgrpid = ns->anagrpid;
+       nvmet_ana_group_enabled[newgrpid]++;
+       ns->anagrpid = newgrpid;
+       nvmet_ana_group_enabled[oldgrpid]--;
+       nvmet_ana_chgcnt++;
+       up_write(&nvmet_ana_sem);
+       nvmet_send_ana_event(ns->subsys, NULL);
+       return count;
+ }
+ CONFIGFS_ATTR(nvmet_ns_, ana_grpid);
  static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page)
  {
        return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled);
@@@ -412,11 -469,41 +474,41 @@@ static ssize_t nvmet_ns_enable_store(st
  
  CONFIGFS_ATTR(nvmet_ns_, enable);
  
+ static ssize_t nvmet_ns_buffered_io_show(struct config_item *item, char *page)
+ {
+       return sprintf(page, "%d\n", to_nvmet_ns(item)->buffered_io);
+ }
+ static ssize_t nvmet_ns_buffered_io_store(struct config_item *item,
+               const char *page, size_t count)
+ {
+       struct nvmet_ns *ns = to_nvmet_ns(item);
+       bool val;
+       if (strtobool(page, &val))
+               return -EINVAL;
+       mutex_lock(&ns->subsys->lock);
+       if (ns->enabled) {
+               pr_err("disable ns before setting buffered_io value.\n");
+               mutex_unlock(&ns->subsys->lock);
+               return -EINVAL;
+       }
+       ns->buffered_io = val;
+       mutex_unlock(&ns->subsys->lock);
+       return count;
+ }
+ CONFIGFS_ATTR(nvmet_ns_, buffered_io);
  static struct configfs_attribute *nvmet_ns_attrs[] = {
        &nvmet_ns_attr_device_path,
        &nvmet_ns_attr_device_nguid,
        &nvmet_ns_attr_device_uuid,
+       &nvmet_ns_attr_ana_grpid,
        &nvmet_ns_attr_enable,
+       &nvmet_ns_attr_buffered_io,
        NULL,
  };
  
@@@ -863,6 -950,134 +955,134 @@@ static const struct config_item_type nv
        .ct_group_ops   = &nvmet_referral_group_ops,
  };
  
+ static struct {
+       enum nvme_ana_state     state;
+       const char              *name;
+ } nvmet_ana_state_names[] = {
+       { NVME_ANA_OPTIMIZED,           "optimized" },
+       { NVME_ANA_NONOPTIMIZED,        "non-optimized" },
+       { NVME_ANA_INACCESSIBLE,        "inaccessible" },
+       { NVME_ANA_PERSISTENT_LOSS,     "persistent-loss" },
+       { NVME_ANA_CHANGE,              "change" },
+ };
+ static ssize_t nvmet_ana_group_ana_state_show(struct config_item *item,
+               char *page)
+ {
+       struct nvmet_ana_group *grp = to_ana_group(item);
+       enum nvme_ana_state state = grp->port->ana_state[grp->grpid];
+       int i;
+       for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) {
+               if (state != nvmet_ana_state_names[i].state)
+                       continue;
+               return sprintf(page, "%s\n", nvmet_ana_state_names[i].name);
+       }
+       return sprintf(page, "\n");
+ }
+ static ssize_t nvmet_ana_group_ana_state_store(struct config_item *item,
+               const char *page, size_t count)
+ {
+       struct nvmet_ana_group *grp = to_ana_group(item);
+       int i;
+       for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) {
+               if (sysfs_streq(page, nvmet_ana_state_names[i].name))
+                       goto found;
+       }
+       pr_err("Invalid value '%s' for ana_state\n", page);
+       return -EINVAL;
+ found:
+       down_write(&nvmet_ana_sem);
+       grp->port->ana_state[grp->grpid] = nvmet_ana_state_names[i].state;
+       nvmet_ana_chgcnt++;
+       up_write(&nvmet_ana_sem);
+       nvmet_port_send_ana_event(grp->port);
+       return count;
+ }
+ CONFIGFS_ATTR(nvmet_ana_group_, ana_state);
+ static struct configfs_attribute *nvmet_ana_group_attrs[] = {
+       &nvmet_ana_group_attr_ana_state,
+       NULL,
+ };
+ static void nvmet_ana_group_release(struct config_item *item)
+ {
+       struct nvmet_ana_group *grp = to_ana_group(item);
+       if (grp == &grp->port->ana_default_group)
+               return;
+       down_write(&nvmet_ana_sem);
+       grp->port->ana_state[grp->grpid] = NVME_ANA_INACCESSIBLE;
+       nvmet_ana_group_enabled[grp->grpid]--;
+       up_write(&nvmet_ana_sem);
+       nvmet_port_send_ana_event(grp->port);
+       kfree(grp);
+ }
+ static struct configfs_item_operations nvmet_ana_group_item_ops = {
+       .release                = nvmet_ana_group_release,
+ };
+ static const struct config_item_type nvmet_ana_group_type = {
+       .ct_item_ops            = &nvmet_ana_group_item_ops,
+       .ct_attrs               = nvmet_ana_group_attrs,
+       .ct_owner               = THIS_MODULE,
+ };
+ static struct config_group *nvmet_ana_groups_make_group(
+               struct config_group *group, const char *name)
+ {
+       struct nvmet_port *port = ana_groups_to_port(&group->cg_item);
+       struct nvmet_ana_group *grp;
+       u32 grpid;
+       int ret;
+       ret = kstrtou32(name, 0, &grpid);
+       if (ret)
+               goto out;
+       ret = -EINVAL;
+       if (grpid <= 1 || grpid > NVMET_MAX_ANAGRPS)
+               goto out;
+       ret = -ENOMEM;
+       grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+       if (!grp)
+               goto out;
+       grp->port = port;
+       grp->grpid = grpid;
+       down_write(&nvmet_ana_sem);
+       nvmet_ana_group_enabled[grpid]++;
+       up_write(&nvmet_ana_sem);
+       nvmet_port_send_ana_event(grp->port);
+       config_group_init_type_name(&grp->group, name, &nvmet_ana_group_type);
+       return &grp->group;
+ out:
+       return ERR_PTR(ret);
+ }
+ static struct configfs_group_operations nvmet_ana_groups_group_ops = {
+       .make_group             = nvmet_ana_groups_make_group,
+ };
+ static const struct config_item_type nvmet_ana_groups_type = {
+       .ct_group_ops           = &nvmet_ana_groups_group_ops,
+       .ct_owner               = THIS_MODULE,
+ };
  /*
   * Ports definitions.
   */
@@@ -870,6 -1085,7 +1090,7 @@@ static void nvmet_port_release(struct c
  {
        struct nvmet_port *port = to_nvmet_port(item);
  
+       kfree(port->ana_state);
        kfree(port);
  }
  
@@@ -879,6 -1095,7 +1100,7 @@@ static struct configfs_attribute *nvmet
        &nvmet_attr_addr_traddr,
        &nvmet_attr_addr_trsvcid,
        &nvmet_attr_addr_trtype,
+       &nvmet_attr_param_inline_data_size,
        NULL,
  };
  
@@@ -897,6 -1114,7 +1119,7 @@@ static struct config_group *nvmet_ports
  {
        struct nvmet_port *port;
        u16 portid;
+       u32 i;
  
        if (kstrtou16(name, 0, &portid))
                return ERR_PTR(-EINVAL);
        if (!port)
                return ERR_PTR(-ENOMEM);
  
+       port->ana_state = kcalloc(NVMET_MAX_ANAGRPS + 1,
+                       sizeof(*port->ana_state), GFP_KERNEL);
+       if (!port->ana_state) {
+               kfree(port);
+               return ERR_PTR(-ENOMEM);
+       }
+       for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) {
+               if (i == NVMET_DEFAULT_ANA_GRPID)
+                       port->ana_state[1] = NVME_ANA_OPTIMIZED;
+               else
+                       port->ana_state[i] = NVME_ANA_INACCESSIBLE;
+       }
        INIT_LIST_HEAD(&port->entry);
        INIT_LIST_HEAD(&port->subsystems);
        INIT_LIST_HEAD(&port->referrals);
+       port->inline_data_size = -1;    /* < 0 == let the transport choose */
  
        port->disc_addr.portid = cpu_to_le16(portid);
        config_group_init_type_name(&port->group, name, &nvmet_port_type);
                        "referrals", &nvmet_referrals_type);
        configfs_add_default_group(&port->referrals_group, &port->group);
  
+       config_group_init_type_name(&port->ana_groups_group,
+                       "ana_groups", &nvmet_ana_groups_type);
+       configfs_add_default_group(&port->ana_groups_group, &port->group);
+       port->ana_default_group.port = port;
+       port->ana_default_group.grpid = NVMET_DEFAULT_ANA_GRPID;
+       config_group_init_type_name(&port->ana_default_group.group,
+                       __stringify(NVMET_DEFAULT_ANA_GRPID),
+                       &nvmet_ana_group_type);
+       configfs_add_default_group(&port->ana_default_group.group,
+                       &port->ana_groups_group);
        return &port->group;
  }
  
diff --combined drivers/nvme/target/core.c
@@@ -18,6 -18,7 +18,7 @@@
  
  #include "nvmet.h"
  
+ struct workqueue_struct *buffered_io_wq;
  static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
  static DEFINE_IDA(cntlid_ida);
  
   */
  DECLARE_RWSEM(nvmet_config_sem);
  
+ u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
+ u64 nvmet_ana_chgcnt;
+ DECLARE_RWSEM(nvmet_ana_sem);
  static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
                const char *subsysnqn);
  
@@@ -175,7 -180,7 +180,7 @@@ out_unlock
        mutex_unlock(&ctrl->lock);
  }
  
- static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
+ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
  {
        struct nvmet_ctrl *ctrl;
  
        }
  }
  
+ void nvmet_send_ana_event(struct nvmet_subsys *subsys,
+               struct nvmet_port *port)
+ {
+       struct nvmet_ctrl *ctrl;
+       mutex_lock(&subsys->lock);
+       list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+               if (port && ctrl->port != port)
+                       continue;
+               if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE))
+                       continue;
+               nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+                               NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
+       }
+       mutex_unlock(&subsys->lock);
+ }
+ void nvmet_port_send_ana_event(struct nvmet_port *port)
+ {
+       struct nvmet_subsys_link *p;
+       down_read(&nvmet_config_sem);
+       list_for_each_entry(p, &port->subsystems, entry)
+               nvmet_send_ana_event(p->subsys, port);
+       up_read(&nvmet_config_sem);
+ }
  int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
  {
        int ret = 0;
@@@ -241,6 -273,10 +273,10 @@@ int nvmet_enable_port(struct nvmet_por
                return ret;
        }
  
+       /* If the transport didn't set inline_data_size, then disable it. */
+       if (port->inline_data_size < 0)
+               port->inline_data_size = 0;
        port->enabled = true;
        return 0;
  }
@@@ -332,14 -368,18 +368,18 @@@ static void nvmet_ns_dev_disable(struc
  int nvmet_ns_enable(struct nvmet_ns *ns)
  {
        struct nvmet_subsys *subsys = ns->subsys;
-       int ret = 0;
+       int ret;
  
        mutex_lock(&subsys->lock);
+       ret = -EMFILE;
+       if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
+               goto out_unlock;
+       ret = 0;
        if (ns->enabled)
                goto out_unlock;
  
        ret = nvmet_bdev_ns_enable(ns);
 -      if (ret)
 +      if (ret == -ENOTBLK)
                ret = nvmet_file_ns_enable(ns);
        if (ret)
                goto out_unlock;
  
                list_add_tail_rcu(&ns->dev_link, &old->dev_link);
        }
+       subsys->nr_namespaces++;
  
        nvmet_ns_changed(subsys, ns->nsid);
        ns->enabled = true;
@@@ -409,6 -450,7 +450,7 @@@ void nvmet_ns_disable(struct nvmet_ns *
        percpu_ref_exit(&ns->ref);
  
        mutex_lock(&subsys->lock);
+       subsys->nr_namespaces--;
        nvmet_ns_changed(subsys, ns->nsid);
        nvmet_ns_dev_disable(ns);
  out_unlock:
@@@ -419,6 -461,10 +461,10 @@@ void nvmet_ns_free(struct nvmet_ns *ns
  {
        nvmet_ns_disable(ns);
  
+       down_write(&nvmet_ana_sem);
+       nvmet_ana_group_enabled[ns->anagrpid]--;
+       up_write(&nvmet_ana_sem);
        kfree(ns->device_path);
        kfree(ns);
  }
@@@ -436,7 -482,14 +482,14 @@@ struct nvmet_ns *nvmet_ns_alloc(struct 
  
        ns->nsid = nsid;
        ns->subsys = subsys;
+       down_write(&nvmet_ana_sem);
+       ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
+       nvmet_ana_group_enabled[ns->anagrpid]++;
+       up_write(&nvmet_ana_sem);
        uuid_gen(&ns->uuid);
+       ns->buffered_io = false;
  
        return ns;
  }
@@@ -542,6 -595,35 +595,35 @@@ int nvmet_sq_init(struct nvmet_sq *sq
  }
  EXPORT_SYMBOL_GPL(nvmet_sq_init);
  
+ static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
+               struct nvmet_ns *ns)
+ {
+       enum nvme_ana_state state = port->ana_state[ns->anagrpid];
+       if (unlikely(state == NVME_ANA_INACCESSIBLE))
+               return NVME_SC_ANA_INACCESSIBLE;
+       if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
+               return NVME_SC_ANA_PERSISTENT_LOSS;
+       if (unlikely(state == NVME_ANA_CHANGE))
+               return NVME_SC_ANA_TRANSITION;
+       return 0;
+ }
+ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
+ {
+       if (unlikely(req->ns->readonly)) {
+               switch (req->cmd->common.opcode) {
+               case nvme_cmd_read:
+               case nvme_cmd_flush:
+                       break;
+               default:
+                       return NVME_SC_NS_WRITE_PROTECTED;
+               }
+       }
+       return 0;
+ }
  static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
  {
        struct nvme_command *cmd = req->cmd;
        req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
        if (unlikely(!req->ns))
                return NVME_SC_INVALID_NS | NVME_SC_DNR;
+       ret = nvmet_check_ana_state(req->port, req->ns);
+       if (unlikely(ret))
+               return ret;
+       ret = nvmet_io_cmd_check_access(req);
+       if (unlikely(ret))
+               return ret;
  
        if (req->ns->file)
                return nvmet_file_parse_io_cmd(req);
@@@ -870,6 -958,8 +958,8 @@@ u16 nvmet_alloc_ctrl(const char *subsys
  
        nvmet_init_cap(ctrl);
  
+       ctrl->port = req->port;
        INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
        INIT_LIST_HEAD(&ctrl->async_events);
  
@@@ -1109,6 -1199,15 +1199,15 @@@ static int __init nvmet_init(void
  {
        int error;
  
+       nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
+       buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
+                       WQ_MEM_RECLAIM, 0);
+       if (!buffered_io_wq) {
+               error = -ENOMEM;
+               goto out;
+       }
        error = nvmet_init_discovery();
        if (error)
                goto out;
@@@ -1129,6 -1228,7 +1228,7 @@@ static void __exit nvmet_exit(void
        nvmet_exit_configfs();
        nvmet_exit_discovery();
        ida_destroy(&cntlid_ida);
+       destroy_workqueue(buffered_io_wq);
  
        BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
        BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
diff --combined drivers/nvme/target/loop.c
@@@ -162,7 -162,7 +162,7 @@@ static blk_status_t nvme_loop_queue_rq(
        blk_status_t ret;
  
        if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready))
 -              return nvmf_fail_nonready_command(req);
 +              return nvmf_fail_nonready_command(&queue->ctrl->ctrl, req);
  
        ret = nvme_setup_cmd(ns, req, &iod->cmd);
        if (ret)
@@@ -227,6 -227,7 +227,7 @@@ static int nvme_loop_init_request(struc
  {
        struct nvme_loop_ctrl *ctrl = set->driver_data;
  
+       nvme_req(req)->ctrl = &ctrl->ctrl;
        return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req),
                        (set == &ctrl->tag_set) ? hctx_idx + 1 : 0);
  }
diff --combined fs/block_dev.c
@@@ -221,7 -221,7 +221,7 @@@ __blkdev_direct_IO_simple(struct kiocb 
  
        ret = bio_iov_iter_get_pages(&bio, iter);
        if (unlikely(ret))
 -              return ret;
 +              goto out;
        ret = bio.bi_iter.bi_size;
  
        if (iov_iter_rw(iter) == READ) {
                put_page(bvec->bv_page);
        }
  
 -      if (vecs != inline_vecs)
 -              kfree(vecs);
 -
        if (unlikely(bio.bi_status))
                ret = blk_status_to_errno(bio.bi_status);
  
 +out:
 +      if (vecs != inline_vecs)
 +              kfree(vecs);
 +
        bio_uninit(&bio);
  
        return ret;
@@@ -666,7 -665,8 +666,8 @@@ int bdev_read_page(struct block_device 
        result = blk_queue_enter(bdev->bd_queue, 0);
        if (result)
                return result;
-       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
+       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+                             REQ_OP_READ);
        blk_queue_exit(bdev->bd_queue);
        return result;
  }
@@@ -704,7 -704,8 +705,8 @@@ int bdev_write_page(struct block_devic
                return result;
  
        set_page_writeback(page);
-       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
+       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+                             REQ_OP_WRITE);
        if (result) {
                end_page_writeback(page);
        } else {
diff --combined fs/ext4/super.c
@@@ -312,24 -312,6 +312,24 @@@ void ext4_itable_unused_set(struct supe
                bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
  }
  
 +static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
 +{
 +      time64_t now = ktime_get_real_seconds();
 +
 +      now = clamp_val(now, 0, (1ull << 40) - 1);
 +
 +      *lo = cpu_to_le32(lower_32_bits(now));
 +      *hi = upper_32_bits(now);
 +}
 +
 +static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
 +{
 +      return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
 +}
 +#define ext4_update_tstamp(es, tstamp) \
 +      __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
 +#define ext4_get_tstamp(es, tstamp) \
 +      __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
  
  static void __save_error_info(struct super_block *sb, const char *func,
                            unsigned int line)
        if (bdev_read_only(sb->s_bdev))
                return;
        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
 -      es->s_last_error_time = cpu_to_le32(get_seconds());
 +      ext4_update_tstamp(es, s_last_error_time);
        strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
        es->s_last_error_line = cpu_to_le32(line);
        if (!es->s_first_error_time) {
                es->s_first_error_time = es->s_last_error_time;
 +              es->s_first_error_time_hi = es->s_last_error_time_hi;
                strncpy(es->s_first_error_func, func,
                        sizeof(es->s_first_error_func));
                es->s_first_error_line = cpu_to_le32(line);
@@@ -795,26 -776,26 +795,26 @@@ void ext4_mark_group_bitmap_corrupted(s
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 +      int ret;
  
 -      if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) &&
 -          !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) {
 -              percpu_counter_sub(&sbi->s_freeclusters_counter,
 -                                      grp->bb_free);
 -              set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
 -                      &grp->bb_state);
 +      if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
 +              ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
 +                                          &grp->bb_state);
 +              if (!ret)
 +                      percpu_counter_sub(&sbi->s_freeclusters_counter,
 +                                         grp->bb_free);
        }
  
 -      if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) &&
 -          !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
 -              if (gdp) {
 +      if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
 +              ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
 +                                          &grp->bb_state);
 +              if (!ret && gdp) {
                        int count;
  
                        count = ext4_free_inodes_count(sb, gdp);
                        percpu_counter_sub(&sbi->s_freeinodes_counter,
                                           count);
                }
 -              set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
 -                      &grp->bb_state);
        }
  }
  
@@@ -2193,8 -2174,8 +2193,8 @@@ static int ext4_setup_super(struct supe
                         "warning: maximal mount count reached, "
                         "running e2fsck is recommended");
        else if (le32_to_cpu(es->s_checkinterval) &&
 -              (le32_to_cpu(es->s_lastcheck) +
 -                      le32_to_cpu(es->s_checkinterval) <= get_seconds()))
 +               (ext4_get_tstamp(es, s_lastcheck) +
 +                le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
                ext4_msg(sb, KERN_WARNING,
                         "warning: checktime reached, "
                         "running e2fsck is recommended");
        if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
                es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
        le16_add_cpu(&es->s_mnt_count, 1);
 -      es->s_mtime = cpu_to_le32(get_seconds());
 +      ext4_update_tstamp(es, s_mtime);
        ext4_update_dynamic_rev(sb);
        if (sbi->s_journal)
                ext4_set_feature_journal_needs_recovery(sb);
@@@ -2361,7 -2342,7 +2361,7 @@@ static int ext4_check_descriptors(struc
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
        ext4_fsblk_t last_block;
 -      ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0) + 1;
 +      ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
        ext4_fsblk_t block_bitmap;
        ext4_fsblk_t inode_bitmap;
        ext4_fsblk_t inode_table;
@@@ -2894,9 -2875,8 +2894,9 @@@ static void print_daily_error_info(stru
                ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
                         le32_to_cpu(es->s_error_count));
        if (es->s_first_error_time) {
 -              printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
 -                     sb->s_id, le32_to_cpu(es->s_first_error_time),
 +              printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
 +                     sb->s_id,
 +                     ext4_get_tstamp(es, s_first_error_time),
                       (int) sizeof(es->s_first_error_func),
                       es->s_first_error_func,
                       le32_to_cpu(es->s_first_error_line));
                printk(KERN_CONT "\n");
        }
        if (es->s_last_error_time) {
 -              printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
 -                     sb->s_id, le32_to_cpu(es->s_last_error_time),
 +              printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
 +                     sb->s_id,
 +                     ext4_get_tstamp(es, s_last_error_time),
                       (int) sizeof(es->s_last_error_func),
                       es->s_last_error_func,
                       le32_to_cpu(es->s_last_error_line));
@@@ -3162,8 -3141,14 +3162,8 @@@ static ext4_group_t ext4_has_uninit_ita
                if (!gdp)
                        continue;
  
 -              if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
 -                      continue;
 -              if (group != 0)
 +              if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
                        break;
 -              ext4_error(sb, "Inode table for bg 0 marked as "
 -                         "needing zeroing");
 -              if (sb_rdonly(sb))
 -                      return ngroups;
        }
  
        return group;
@@@ -3529,7 -3514,7 +3529,7 @@@ static int ext4_fill_super(struct super
        sbi->s_sb_block = sb_block;
        if (sb->s_bdev->bd_part)
                sbi->s_sectors_written_start =
-                       part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+                       part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
  
        /* Cleanup superblock name */
        strreplace(sb->s_id, '/', '!');
                        goto failed_mount2;
                }
        }
 +      sbi->s_gdb_count = db_count;
        if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                ret = -EFSCORRUPTED;
                goto failed_mount2;
        }
  
 -      sbi->s_gdb_count = db_count;
 -
        timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
  
        /* Register extent status tree shrinker */
@@@ -4834,11 -4820,12 +4834,12 @@@ static int ext4_commit_super(struct sup
         * to complain and force a full file system check.
         */
        if (!(sb->s_flags & SB_RDONLY))
 -              es->s_wtime = cpu_to_le32(get_seconds());
 +              ext4_update_tstamp(es, s_wtime);
        if (sb->s_bdev->bd_part)
                es->s_kbytes_written =
                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
-                           ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                           ((part_stat_read(sb->s_bdev->bd_part,
+                                            sectors[STAT_WRITE]) -
                              EXT4_SB(sb)->s_sectors_written_start) >> 1));
        else
                es->s_kbytes_written =
@@@ -5101,9 -5088,6 +5102,9 @@@ static int ext4_remount(struct super_bl
  #endif
        char *orig_data = kstrdup(data, GFP_KERNEL);
  
 +      if (data && !orig_data)
 +              return -ENOMEM;
 +
        /* Store the original options */
        old_sb_flags = sb->s_flags;
        old_opts.s_mount_opt = sbi->s_mount_opt;
  
                        if (sbi->s_journal)
                                ext4_mark_recovery_complete(sb, es);
 +                      if (sbi->s_mmp_tsk)
 +                              kthread_stop(sbi->s_mmp_tsk);
                } else {
                        /* Make sure we can mount this feature set readwrite */
                        if (ext4_has_feature_readonly(sb) ||
@@@ -5689,13 -5671,13 +5690,13 @@@ static int ext4_enable_quotas(struct su
                                DQUOT_USAGE_ENABLED |
                                (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
                        if (err) {
 -                              for (type--; type >= 0; type--)
 -                                      dquot_quota_off(sb, type);
 -
                                ext4_warning(sb,
                                        "Failed to enable quota tracking "
                                        "(type=%d, err=%d). Please run "
                                        "e2fsck to fix.", type, err);
 +                              for (type--; type >= 0; type--)
 +                                      dquot_quota_off(sb, type);
 +
                                return err;
                        }
                }
diff --combined fs/ext4/sysfs.c
@@@ -25,8 -25,6 +25,8 @@@ typedef enum 
        attr_reserved_clusters,
        attr_inode_readahead,
        attr_trigger_test_error,
 +      attr_first_error_time,
 +      attr_last_error_time,
        attr_feature,
        attr_pointer_ui,
        attr_pointer_atomic,
@@@ -58,7 -56,8 +58,8 @@@ static ssize_t session_write_kbytes_sho
        if (!sb->s_bdev->bd_part)
                return snprintf(buf, PAGE_SIZE, "0\n");
        return snprintf(buf, PAGE_SIZE, "%lu\n",
-                       (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                       (part_stat_read(sb->s_bdev->bd_part,
+                                       sectors[STAT_WRITE]) -
                         sbi->s_sectors_written_start) >> 1);
  }
  
@@@ -70,7 -69,8 +71,8 @@@ static ssize_t lifetime_write_kbytes_sh
                return snprintf(buf, PAGE_SIZE, "0\n");
        return snprintf(buf, PAGE_SIZE, "%llu\n",
                        (unsigned long long)(sbi->s_kbytes_written +
-                       ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                       ((part_stat_read(sb->s_bdev->bd_part,
+                                        sectors[STAT_WRITE]) -
                          EXT4_SB(sb)->s_sectors_written_start) >> 1)));
  }
  
@@@ -184,8 -184,8 +186,8 @@@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_b
  EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
  EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
  EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 -EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
 -EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
 +EXT4_ATTR(first_error_time, 0444, first_error_time);
 +EXT4_ATTR(last_error_time, 0444, last_error_time);
  
  static unsigned int old_bump_val = 128;
  EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@@ -251,15 -251,6 +253,15 @@@ static void *calc_ptr(struct ext4_attr 
        return NULL;
  }
  
 +static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
 +{
 +      return snprintf(buf, PAGE_SIZE, "%lld",
 +                      ((time64_t)hi << 32) + le32_to_cpu(lo));
 +}
 +
 +#define print_tstamp(buf, es, tstamp) \
 +      __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi)
 +
  static ssize_t ext4_attr_show(struct kobject *kobj,
                              struct attribute *attr, char *buf)
  {
        case attr_pointer_ui:
                if (!ptr)
                        return 0;
 -              return snprintf(buf, PAGE_SIZE, "%u\n",
 -                              *((unsigned int *) ptr));
 +              if (a->attr_ptr == ptr_ext4_super_block_offset)
 +                      return snprintf(buf, PAGE_SIZE, "%u\n",
 +                                      le32_to_cpup(ptr));
 +              else
 +                      return snprintf(buf, PAGE_SIZE, "%u\n",
 +                                      *((unsigned int *) ptr));
        case attr_pointer_atomic:
                if (!ptr)
                        return 0;
                                atomic_read((atomic_t *) ptr));
        case attr_feature:
                return snprintf(buf, PAGE_SIZE, "supported\n");
 +      case attr_first_error_time:
 +              return print_tstamp(buf, sbi->s_es, s_first_error_time);
 +      case attr_last_error_time:
 +              return print_tstamp(buf, sbi->s_es, s_last_error_time);
        }
  
        return 0;
@@@ -327,10 -310,7 +329,10 @@@ static ssize_t ext4_attr_store(struct k
                ret = kstrtoul(skip_spaces(buf), 0, &t);
                if (ret)
                        return ret;
 -              *((unsigned int *) ptr) = t;
 +              if (a->attr_ptr == ptr_ext4_super_block_offset)
 +                      *((__le32 *) ptr) = cpu_to_le32(t);
 +              else
 +                      *((unsigned int *) ptr) = t;
                return len;
        case attr_inode_readahead:
                return inode_readahead_blks_store(sbi, buf, len);
diff --combined include/linux/blk-mq.h
@@@ -35,10 -35,12 +35,12 @@@ struct blk_mq_hw_ctx 
        struct sbitmap          ctx_map;
  
        struct blk_mq_ctx       *dispatch_from;
+       unsigned int            dispatch_busy;
  
-       struct blk_mq_ctx       **ctxs;
        unsigned int            nr_ctx;
+       struct blk_mq_ctx       **ctxs;
  
+       spinlock_t              dispatch_wait_lock;
        wait_queue_entry_t      dispatch_wait;
        atomic_t                wait_index;
  
@@@ -287,20 -289,6 +289,20 @@@ void blk_mq_update_nr_hw_queues(struct 
  
  void blk_mq_quiesce_queue_nowait(struct request_queue *q);
  
 +/**
 + * blk_mq_mark_complete() - Set request state to complete
 + * @rq: request to set to complete state
 + *
 + * Returns true if request state was successfully set to complete. If
 + * successful, the caller is responsible for seeing this request is ended, as
 + * blk_mq_complete_request will not work again.
 + */
 +static inline bool blk_mq_mark_complete(struct request *rq)
 +{
 +      return cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) ==
 +                      MQ_RQ_IN_FLIGHT;
 +}
 +
  /*
   * Driver command data is immediately after the request. So subtract request
   * size to get back to the original request, add request size to get the PDU.
diff --combined include/linux/sched.h
@@@ -167,8 -167,8 +167,8 @@@ struct task_group
   *   need_sleep = false;
   *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
   *
 - * Where wake_up_state() (and all other wakeup primitives) imply enough
 - * barriers to order the store of the variable against wakeup.
 + * where wake_up_state() executes a full memory barrier before accessing the
 + * task state.
   *
   * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
   * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
@@@ -734,6 -734,10 +734,10 @@@ struct task_struct 
        /* disallow userland-initiated cgroup migration */
        unsigned                        no_cgroup_migration:1;
  #endif
+ #ifdef CONFIG_BLK_CGROUP
+       /* to be used once the psi infrastructure lands upstream. */
+       unsigned                        use_memdelay:1;
+ #endif
  
        unsigned long                   atomic_flags; /* Flags requiring atomic access. */
  
        u64                             last_sum_exec_runtime;
        struct callback_head            numa_work;
  
 -      struct list_head                numa_entry;
        struct numa_group               *numa_group;
  
        /*
        unsigned int                    memcg_nr_pages_over_high;
  #endif
  
+ #ifdef CONFIG_BLK_CGROUP
+       struct request_queue            *throttle_queue;
+ #endif
  #ifdef CONFIG_UPROBES
        struct uprobe_task              *utask;
  #endif
diff --combined kernel/fork.c
@@@ -312,8 -312,10 +312,8 @@@ struct vm_area_struct *vm_area_alloc(st
  {
        struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
  
 -      if (vma) {
 -              vma->vm_mm = mm;
 -              INIT_LIST_HEAD(&vma->anon_vma_chain);
 -      }
 +      if (vma)
 +              vma_init(vma, mm);
        return vma;
  }
  
@@@ -866,6 -868,11 +866,11 @@@ static struct task_struct *dup_task_str
        tsk->fail_nth = 0;
  #endif
  
+ #ifdef CONFIG_BLK_CGROUP
+       tsk->throttle_queue = NULL;
+       tsk->use_memdelay = 0;
+ #endif
        return tsk;
  
  free_stack:
@@@ -2276,8 -2283,6 +2281,8 @@@ static void sighand_ctor(void *data
  
  void __init proc_caches_init(void)
  {
 +      unsigned int mm_size;
 +
        sighand_cachep = kmem_cache_create("sighand_cache",
                        sizeof(struct sighand_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
                        sizeof(struct fs_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                        NULL);
 +
        /*
 -       * FIXME! The "sizeof(struct mm_struct)" currently includes the
 -       * whole struct cpumask for the OFFSTACK case. We could change
 -       * this to *only* allocate as much of it as required by the
 -       * maximum number of CPU's we can ever have.  The cpumask_allocation
 -       * is at the end of the structure, exactly for that reason.
 +       * The mm_cpumask is located at the end of mm_struct, and is
 +       * dynamically sized based on the maximum CPU number this system
 +       * can have, taking hotplug into account (nr_cpu_ids).
         */
 +      mm_size = sizeof(struct mm_struct) + cpumask_size();
 +
        mm_cachep = kmem_cache_create_usercopy("mm_struct",
 -                      sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 +                      mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                        offsetof(struct mm_struct, saved_auxv),
                        sizeof_field(struct mm_struct, saved_auxv),
diff --combined mm/memcontrol.c
@@@ -4037,14 -4037,6 +4037,14 @@@ static struct cftype mem_cgroup_legacy_
  
  static DEFINE_IDR(mem_cgroup_idr);
  
 +static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
 +{
 +      if (memcg->id.id > 0) {
 +              idr_remove(&mem_cgroup_idr, memcg->id.id);
 +              memcg->id.id = 0;
 +      }
 +}
 +
  static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
  {
        VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
@@@ -4055,7 -4047,8 +4055,7 @@@ static void mem_cgroup_id_put_many(stru
  {
        VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
        if (atomic_sub_and_test(n, &memcg->id.ref)) {
 -              idr_remove(&mem_cgroup_idr, memcg->id.id);
 -              memcg->id.id = 0;
 +              mem_cgroup_id_remove(memcg);
  
                /* Memcg ID pins CSS */
                css_put(&memcg->css);
@@@ -4192,7 -4185,8 +4192,7 @@@ static struct mem_cgroup *mem_cgroup_al
        idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
        return memcg;
  fail:
 -      if (memcg->id.id > 0)
 -              idr_remove(&mem_cgroup_idr, memcg->id.id);
 +      mem_cgroup_id_remove(memcg);
        __mem_cgroup_free(memcg);
        return NULL;
  }
@@@ -4251,7 -4245,6 +4251,7 @@@ mem_cgroup_css_alloc(struct cgroup_subs
  
        return &memcg->css;
  fail:
 +      mem_cgroup_id_remove(memcg);
        mem_cgroup_free(memcg);
        return ERR_PTR(-ENOMEM);
  }
        return ret;
  }
  
+ int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound)
+ {
+       struct mem_cgroup *memcg;
+       int ret;
+       ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
+       memcg = *memcgp;
+       mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
+       return ret;
+ }
  /**
   * mem_cgroup_commit_charge - commit a page charge
   * @page: page to charge
diff --combined mm/memory.c
@@@ -326,20 -326,16 +326,20 @@@ bool __tlb_remove_page_size(struct mmu_
  
  #ifdef CONFIG_HAVE_RCU_TABLE_FREE
  
 -/*
 - * See the comment near struct mmu_table_batch.
 - */
 -
  static void tlb_remove_table_smp_sync(void *arg)
  {
 -      /* Simply deliver the interrupt */
 +      struct mm_struct __maybe_unused *mm = arg;
 +      /*
 +       * On most architectures this does nothing. Simply delivering the
 +       * interrupt is enough to prevent races with software page table
 +       * walking like that done in get_user_pages_fast.
 +       *
 +       * See the comment near struct mmu_table_batch.
 +       */
 +      tlb_flush_remove_tables_local(mm);
  }
  
 -static void tlb_remove_table_one(void *table)
 +static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
  {
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
         * It is however sufficient for software page-table walkers that rely on
         * IRQ disabling. See the comment near struct mmu_table_batch.
         */
 -      smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
 +      smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
        __tlb_remove_table(table);
  }
  
@@@ -369,8 -365,6 +369,8 @@@ void tlb_table_flush(struct mmu_gather 
  {
        struct mmu_table_batch **batch = &tlb->batch;
  
 +      tlb_flush_remove_tables(tlb->mm);
 +
        if (*batch) {
                call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                *batch = NULL;
@@@ -393,7 -387,7 +393,7 @@@ void tlb_remove_table(struct mmu_gathe
        if (*batch == NULL) {
                *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                if (*batch == NULL) {
 -                      tlb_remove_table_one(table);
 +                      tlb_remove_table_one(table, tlb);
                        return;
                }
                (*batch)->nr = 0;
@@@ -1423,9 -1417,11 +1423,9 @@@ static inline unsigned long zap_pmd_ran
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
 -                      if (next - addr != HPAGE_PMD_SIZE) {
 -                              VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
 -                                  !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
 +                      if (next - addr != HPAGE_PMD_SIZE)
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
 -                      else if (zap_huge_pmd(tlb, vma, pmd, addr))
 +                      else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
                        /* fall through */
                }
@@@ -1890,9 -1886,6 +1890,9 @@@ int vm_insert_pfn_prot(struct vm_area_s
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
  
 +      if (!pfn_modify_allowed(pfn, pgprot))
 +              return -EACCES;
 +
        track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
  
        ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
@@@ -1928,9 -1921,6 +1928,9 @@@ static int __vm_insert_mixed(struct vm_
  
        track_pfn_insert(vma, &pgprot, pfn);
  
 +      if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
 +              return -EACCES;
 +
        /*
         * If we don't have pte special, then we have to use the pfn_valid()
         * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
@@@ -1992,7 -1982,6 +1992,7 @@@ static int remap_pte_range(struct mm_st
  {
        pte_t *pte;
        spinlock_t *ptl;
 +      int err = 0;
  
        pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
        arch_enter_lazy_mmu_mode();
        do {
                BUG_ON(!pte_none(*pte));
 +              if (!pfn_modify_allowed(pfn, prot)) {
 +                      err = -EACCES;
 +                      break;
 +              }
                set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
                pfn++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);
 -      return 0;
 +      return err;
  }
  
  static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
  {
        pmd_t *pmd;
        unsigned long next;
 +      int err;
  
        pfn -= addr >> PAGE_SHIFT;
        pmd = pmd_alloc(mm, pud, addr);
        VM_BUG_ON(pmd_trans_huge(*pmd));
        do {
                next = pmd_addr_end(addr, end);
 -              if (remap_pte_range(mm, pmd, addr, next,
 -                              pfn + (addr >> PAGE_SHIFT), prot))
 -                      return -ENOMEM;
 +              err = remap_pte_range(mm, pmd, addr, next,
 +                              pfn + (addr >> PAGE_SHIFT), prot);
 +              if (err)
 +                      return err;
        } while (pmd++, addr = next, addr != end);
        return 0;
  }
@@@ -2041,7 -2024,6 +2041,7 @@@ static inline int remap_pud_range(struc
  {
        pud_t *pud;
        unsigned long next;
 +      int err;
  
        pfn -= addr >> PAGE_SHIFT;
        pud = pud_alloc(mm, p4d, addr);
                return -ENOMEM;
        do {
                next = pud_addr_end(addr, end);
 -              if (remap_pmd_range(mm, pud, addr, next,
 -                              pfn + (addr >> PAGE_SHIFT), prot))
 -                      return -ENOMEM;
 +              err = remap_pmd_range(mm, pud, addr, next,
 +                              pfn + (addr >> PAGE_SHIFT), prot);
 +              if (err)
 +                      return err;
        } while (pud++, addr = next, addr != end);
        return 0;
  }
@@@ -2063,7 -2044,6 +2063,7 @@@ static inline int remap_p4d_range(struc
  {
        p4d_t *p4d;
        unsigned long next;
 +      int err;
  
        pfn -= addr >> PAGE_SHIFT;
        p4d = p4d_alloc(mm, pgd, addr);
                return -ENOMEM;
        do {
                next = p4d_addr_end(addr, end);
 -              if (remap_pud_range(mm, p4d, addr, next,
 -                              pfn + (addr >> PAGE_SHIFT), prot))
 -                      return -ENOMEM;
 +              err = remap_pud_range(mm, p4d, addr, next,
 +                              pfn + (addr >> PAGE_SHIFT), prot);
 +              if (err)
 +                      return err;
        } while (p4d++, addr = next, addr != end);
        return 0;
  }
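
The substance of the remap_*_range() rework above is that each level of the page-table walk now returns whatever error the level below produced, so the new -EACCES from pfn_modify_allowed() reaches remap_pfn_range() instead of being flattened into -ENOMEM. A minimal sketch of that propagation pattern, using hypothetical helpers (pfn_allowed(), map_leaf(), map_range()) rather than the kernel functions themselves:

#include <errno.h>

/* Hypothetical stand-in for pfn_modify_allowed(); the real policy is per-arch. */
static int pfn_allowed(unsigned long pfn)
{
        return pfn < 0x100000UL;
}

/* Innermost level: may refuse a pfn with -EACCES, like remap_pte_range(). */
static int map_leaf(unsigned long pfn)
{
        if (!pfn_allowed(pfn))
                return -EACCES;
        /* ... install the pte for this pfn ... */
        return 0;
}

/* Outer levels (cf. remap_pmd/pud/p4d_range) forward the error unchanged. */
static int map_range(unsigned long pfn, unsigned long nr_pages)
{
        unsigned long i;
        int err;

        for (i = 0; i < nr_pages; i++) {
                err = map_leaf(pfn + i);
                if (err)
                        return err;     /* -EACCES reaches the caller intact */
        }
        return 0;
}

A caller can then tell a policy refusal (-EACCES) apart from an allocation failure (-ENOMEM).
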
@@@ -2524,7 -2503,7 +2524,7 @@@ static int wp_page_copy(struct vm_faul
                cow_user_page(new_page, old_page, vmf->address, vma);
        }
  
-       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
+       if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
                goto oom_free_new;
  
        __SetPageUptodate(new_page);
@@@ -3024,8 -3003,8 +3024,8 @@@ int do_swap_page(struct vm_fault *vmf
                goto out_page;
        }
  
-       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
-                               &memcg, false)) {
+       if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
+                                       &memcg, false)) {
                ret = VM_FAULT_OOM;
                goto out_page;
        }
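
Note on the charging changes in this file and in mm/shmem.c below: every mem_cgroup_try_charge() call site becomes mem_cgroup_try_charge_delay(). The _delay variant belongs to the io-latency work in this merge; mm/memcontrol.c is not part of this excerpt, but the wrapper is roughly the old charge plus a call into the swap-throttling hook added further down in mm/swapfile.c, along these lines (a sketch, not the exact memcontrol.c source):

/* Sketch: charge as before, then let the block layer throttle if needed. */
int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask, struct mem_cgroup **memcgp,
                                bool compound)
{
        int ret;

        ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
        /* New step: throttle the task if its blkcg is currently congested. */
        mem_cgroup_throttle_swaprate(*memcgp, page_to_nid(page), gfp_mask);
        return ret;
}

The signature and return value are unchanged, which is why the call sites convert mechanically.
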
@@@ -3186,7 -3165,8 +3186,8 @@@ static int do_anonymous_page(struct vm_
        if (!page)
                goto oom;
  
-       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+       if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
+                                       false))
                goto oom_free_page;
  
        /*
@@@ -3682,7 -3662,7 +3683,7 @@@ static int do_cow_fault(struct vm_faul
        if (!vmf->cow_page)
                return VM_FAULT_OOM;
  
-       if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+       if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
                                &vmf->memcg, false)) {
                put_page(vmf->cow_page);
                return VM_FAULT_OOM;
@@@ -4418,9 -4398,6 +4419,9 @@@ int generic_access_phys(struct vm_area_
                return -EINVAL;
  
        maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
 +      if (!maddr)
 +              return -ENOMEM;
 +
        if (write)
                memcpy_toio(maddr + offset, buf, len);
        else
diff --combined mm/shmem.c
@@@ -1239,8 -1239,8 +1239,8 @@@ int shmem_unuse(swp_entry_t swap, struc
         * the shmem_swaplist_mutex which might hold up shmem_writepage().
         * Charged back to the user (not to caller) when swap account is used.
         */
-       error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
-                       false);
+       error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
+                                           &memcg, false);
        if (error)
                goto out;
        /* No radix_tree_preload: swap entry keeps a place for page in tree */
@@@ -1421,7 -1421,6 +1421,7 @@@ static void shmem_pseudo_vma_init(struc
  {
        /* Create a pseudo vma that just contains the policy */
        memset(vma, 0, sizeof(*vma));
 +      vma_init(vma, NULL);
        /* Bias interleave by inode number to distribute better across nodes */
        vma->vm_pgoff = index + info->vfs_inode.i_ino;
        vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
@@@ -1713,7 -1712,7 +1713,7 @@@ repeat
                                goto failed;
                }
  
-               error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+               error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
                                false);
                if (!error) {
                        error = shmem_add_to_page_cache(page, mapping, index,
@@@ -1819,7 -1818,7 +1819,7 @@@ alloc_nohuge:           page = shmem_alloc_and_a
                if (sgp == SGP_WRITE)
                        __SetPageReferenced(page);
  
-               error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+               error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
                                PageTransHuge(page));
                if (error)
                        goto unacct;
@@@ -2292,7 -2291,7 +2292,7 @@@ static int shmem_mfill_atomic_pte(struc
        __SetPageSwapBacked(page);
        __SetPageUptodate(page);
  
-       ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+       ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
        if (ret)
                goto out_release;
  
@@@ -3897,11 -3896,18 +3897,11 @@@ EXPORT_SYMBOL_GPL(shmem_truncate_range)
  
  /* common code */
  
 -static const struct dentry_operations anon_ops = {
 -      .d_dname = simple_dname
 -};
 -
  static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
                                       unsigned long flags, unsigned int i_flags)
  {
 -      struct file *res;
        struct inode *inode;
 -      struct path path;
 -      struct super_block *sb;
 -      struct qstr this;
 +      struct file *res;
  
        if (IS_ERR(mnt))
                return ERR_CAST(mnt);
        if (shmem_acct_size(flags, size))
                return ERR_PTR(-ENOMEM);
  
 -      res = ERR_PTR(-ENOMEM);
 -      this.name = name;
 -      this.len = strlen(name);
 -      this.hash = 0; /* will go */
 -      sb = mnt->mnt_sb;
 -      path.mnt = mntget(mnt);
 -      path.dentry = d_alloc_pseudo(sb, &this);
 -      if (!path.dentry)
 -              goto put_memory;
 -      d_set_d_op(path.dentry, &anon_ops);
 -
 -      res = ERR_PTR(-ENOSPC);
 -      inode = shmem_get_inode(sb, NULL, S_IFREG | 0777, 0, flags);
 -      if (!inode)
 -              goto put_memory;
 -
 +      inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
 +                              flags);
 +      if (unlikely(!inode)) {
 +              shmem_unacct_size(flags, size);
 +              return ERR_PTR(-ENOSPC);
 +      }
        inode->i_flags |= i_flags;
 -      d_instantiate(path.dentry, inode);
        inode->i_size = size;
        clear_nlink(inode);     /* It is unlinked */
        res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
 +      if (!IS_ERR(res))
 +              res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
 +                              &shmem_file_operations);
        if (IS_ERR(res))
 -              goto put_path;
 -
 -      res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
 -                &shmem_file_operations);
 -      if (IS_ERR(res))
 -              goto put_path;
 -
 -      return res;
 -
 -put_memory:
 -      shmem_unacct_size(flags, size);
 -put_path:
 -      path_put(&path);
 +              iput(inode);
        return res;
  }
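
The rewrite of __shmem_file_setup() above replaces the open-coded d_alloc_pseudo()/d_set_d_op()/d_instantiate()/alloc_file() sequence with the new alloc_file_pseudo() helper, so the error unwinding shrinks to shmem_unacct_size() plus iput(). For context, a hypothetical caller of the public shmem_file_setup() wrapper (which funnels into the function above) looks like this:

/* Hypothetical consumer: create an unlinked tmpfs-backed file of a given size. */
static struct file *make_anon_shmem_file(loff_t size)
{
        struct file *filp;

        filp = shmem_file_setup("my-anon-buffer", size, VM_NORESERVE);
        if (IS_ERR(filp))
                return filp;    /* ERR_PTR(-ENOMEM), ERR_PTR(-ENOSPC), ... */

        /* filp->f_mapping now backs an unlinked shmem inode of 'size' bytes. */
        return filp;
}
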
  
diff --combined mm/swapfile.c
@@@ -2909,35 -2909,6 +2909,35 @@@ static int claim_swapfile(struct swap_i
        return 0;
  }
  
 +
 +/*
 + * Find out how many pages are allowed for a single swap device. There
 + * are two limiting factors:
 + * 1) the number of bits for the swap offset in the swp_entry_t type, and
 + * 2) the number of bits in the swap pte, as defined by the different
 + * architectures.
 + *
 + * In order to find the largest possible bit mask, a swap entry with
 + * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
 + * decoded to a swp_entry_t again, and finally the swap offset is
 + * extracted.
 + *
 + * This will mask all the bits from the initial ~0UL mask that can't
 + * be encoded in either the swp_entry_t or the architecture definition
 + * of a swap pte.
 + */
 +unsigned long generic_max_swapfile_size(void)
 +{
 +      return swp_offset(pte_to_swp_entry(
 +                      swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
 +}
 +
 +/* Can be overridden by an architecture for additional checks. */
 +__weak unsigned long max_swapfile_size(void)
 +{
 +      return generic_max_swapfile_size();
 +}
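
Splitting the old inline comment out into generic_max_swapfile_size(), with max_swapfile_size() left __weak, lets an architecture clamp the usable swap size below what the pte encoding alone would allow; the in-tree user at this point is x86's L1TF mitigation. A sketch of such an override, with arch_has_swap_pfn_limit() and arch_swap_pfn_limit() as hypothetical placeholders:

/* A strong definition in arch code silently replaces the __weak one above. */
unsigned long max_swapfile_size(void)
{
        unsigned long pages = generic_max_swapfile_size();

        /* Hypothetical arch policy: cap swap offsets to a safe pfn range. */
        if (arch_has_swap_pfn_limit())
                pages = min(pages, arch_swap_pfn_limit());

        return pages;
}
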
 +
  static unsigned long read_swap_header(struct swap_info_struct *p,
                                        union swap_header *swap_header,
                                        struct inode *inode)
        p->cluster_next = 1;
        p->cluster_nr = 0;
  
 -      /*
 -       * Find out how many pages are allowed for a single swap
 -       * device. There are two limiting factors: 1) the number
 -       * of bits for the swap offset in the swp_entry_t type, and
 -       * 2) the number of bits in the swap pte as defined by the
 -       * different architectures. In order to find the
 -       * largest possible bit mask, a swap entry with swap type 0
 -       * and swap offset ~0UL is created, encoded to a swap pte,
 -       * decoded to a swp_entry_t again, and finally the swap
 -       * offset is extracted. This will mask all the bits from
 -       * the initial ~0UL mask that can't be encoded in either
 -       * the swp_entry_t or the architecture definition of a
 -       * swap pte.
 -       */
 -      maxpages = swp_offset(pte_to_swp_entry(
 -                      swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
 +      maxpages = max_swapfile_size();
        last_page = swap_header->info.last_page;
        if (!last_page) {
                pr_warn("Empty swap-file\n");
@@@ -3745,6 -3731,37 +3745,37 @@@ static void free_swap_count_continuatio
        }
  }
  
+ #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+                                 gfp_t gfp_mask)
+ {
+       struct swap_info_struct *si, *next;
+       if (!(gfp_mask & __GFP_IO) || !memcg)
+               return;
+       if (!blk_cgroup_congested())
+               return;
+       /*
+        * We've already scheduled a throttle; avoid taking the global swap
+        * lock.
+        */
+       if (current->throttle_queue)
+               return;
+       spin_lock(&swap_avail_lock);
+       plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
+                                 avail_lists[node]) {
+               if (si->bdev) {
+                       blkcg_schedule_throttle(bdev_get_queue(si->bdev),
+                                               true);
+                       break;
+               }
+       }
+       spin_unlock(&swap_avail_lock);
+ }
+ #endif
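
mem_cgroup_throttle_swaprate() only picks the first bdev-backed entry on the node's swap_avail list and calls blkcg_schedule_throttle(); no sleeping happens here, under swap_avail_lock. A simplified model of what scheduling the throttle amounts to for the task (the real implementation is in block/blk-cgroup.c as part of the io-latency series and additionally handles queue refcounting and kernel threads):

/* Sketch only: record the queue and defer the delay to a safe point. */
static void schedule_throttle_sketch(struct request_queue *q)
{
        /* Remember which queue the delay should be charged against ... */
        current->throttle_queue = q;
        /* ... and apply the actual sleep later, on the task's way back to
         * userspace, rather than in this swap allocation path. */
        set_notify_resume(current);
}

This is also why the function bails out early when current->throttle_queue is already set: one pending throttle per task is enough.
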
  static int __init swapfile_init(void)
  {
        int nid;