Merge tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 14 Aug 2018 17:23:25 +0000 (10:23 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 14 Aug 2018 17:23:25 +0000 (10:23 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 14 Aug 2018 17:23:25 +0000 (10:23 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 14 Aug 2018 17:23:25 +0000 (10:23 -0700)
diff --combined block/bio.c

index 047c5dc,04969b3..b12966e
--- 1/block/bio.c
--- 2/block/bio.c
+++ b/block/bio.c
@@@ -28,9 -28,11 +28,11 @@@
   #include <linux/mempool.h>
   #include <linux/workqueue.h>
   #include <linux/cgroup.h>
+ #include <linux/blk-cgroup.h>
   
   #include <trace/events/block.h>
   #include "blk.h"
+ #include "blk-rq-qos.h"
   
   /*
    * Test patch to inline a certain number of bi_io_vec's inside the bio
@@@ -156,7 -158,7 +158,7 @@@ out
   
   unsigned int bvec_nr_vecs(unsigned short idx)
   {
-       return bvec_slabs[idx].nr_vecs;
+       return bvec_slabs[--idx].nr_vecs;
   }
   
   void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
@@@ -644,83 -646,6 +646,6 @@@ struct bio *bio_clone_fast(struct bio *
   }
   EXPORT_SYMBOL(bio_clone_fast);
   
- /**
-  *    bio_clone_bioset - clone a bio
-  *    @bio_src: bio to clone
-  *    @gfp_mask: allocation priority
-  *    @bs: bio_set to allocate from
-  *
-  *    Clone bio. Caller will own the returned bio, but not the actual data it
-  *    points to. Reference count of returned bio will be one.
-  */
- struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-                            struct bio_set *bs)
- {
-       struct bvec_iter iter;
-       struct bio_vec bv;
-       struct bio *bio;
- 
-       /*
-        * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
-        * bio_src->bi_io_vec to bio->bi_io_vec.
-        *
-        * We can't do that anymore, because:
-        *
-        *  - The point of cloning the biovec is to produce a bio with a biovec
-        *    the caller can modify: bi_idx and bi_bvec_done should be 0.
-        *
-        *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if
-        *    we tried to clone the whole thing bio_alloc_bioset() would fail.
-        *    But the clone should succeed as long as the number of biovecs we
-        *    actually need to allocate is fewer than BIO_MAX_PAGES.
-        *
-        *  - Lastly, bi_vcnt should not be looked at or relied upon by code
-        *    that does not own the bio - reason being drivers don't use it for
-        *    iterating over the biovec anymore, so expecting it to be kept up
-        *    to date (i.e. for clones that share the parent biovec) is just
-        *    asking for trouble and would force extra work on
-        *    __bio_clone_fast() anyways.
-        */
- 
-       bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
-       if (!bio)
-               return NULL;
-       bio->bi_disk            = bio_src->bi_disk;
-       bio->bi_opf             = bio_src->bi_opf;
-       bio->bi_write_hint      = bio_src->bi_write_hint;
-       bio->bi_iter.bi_sector  = bio_src->bi_iter.bi_sector;
-       bio->bi_iter.bi_size    = bio_src->bi_iter.bi_size;
- 
-       switch (bio_op(bio)) {
-       case REQ_OP_DISCARD:
-       case REQ_OP_SECURE_ERASE:
-       case REQ_OP_WRITE_ZEROES:
-               break;
-       case REQ_OP_WRITE_SAME:
-               bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
-               break;
-       default:
-               bio_for_each_segment(bv, bio_src, iter)
-                       bio->bi_io_vec[bio->bi_vcnt++] = bv;
-               break;
-       }
- 
-       if (bio_integrity(bio_src)) {
-               int ret;
- 
-               ret = bio_integrity_clone(bio, bio_src, gfp_mask);
-               if (ret < 0) {
-                       bio_put(bio);
-                       return NULL;
-               }
-       }
- 
-       bio_clone_blkcg_association(bio, bio_src);
- 
-       return bio;
- }
- EXPORT_SYMBOL(bio_clone_bioset);
- 
   /**
    *    bio_add_pc_page -       attempt to add page to bio
    *    @q: the target queue
@@@ -903,27 -828,25 +828,27 @@@ int bio_add_page(struct bio *bio, struc
   EXPORT_SYMBOL(bio_add_page);
   
   /**
- - * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
+ + * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
    * @bio: bio to add pages to
    * @iter: iov iterator describing the region to be mapped
    *
- - * Pins as many pages from *iter and appends them to @bio's bvec array. The
+ + * Pins pages from *iter and appends them to @bio's bvec array. The
    * pages will have to be released using put_page() when done.
+ + * For multi-segment *iter, this function only adds pages from the
+ + * the next non-empty segment of the iov iterator.
    */
- -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+ +static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
   {
- -      unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
+ +      unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx;
         struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
         struct page **pages = (struct page **)bv;
- -      size_t offset, diff;
+ +      size_t offset;
         ssize_t size;
   
         size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
         if (unlikely(size <= 0))
                 return size ? size : -EFAULT;
- -      nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
+ +      idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
   
         /*
          * Deep magic below:  We need to walk the pinned pages backwards
@@@ -936,46 -859,21 +861,46 @@@
         bio->bi_iter.bi_size += size;
         bio->bi_vcnt += nr_pages;
   
- -      diff = (nr_pages * PAGE_SIZE - offset) - size;
- -      while (nr_pages--) {
- -              bv[nr_pages].bv_page = pages[nr_pages];
- -              bv[nr_pages].bv_len = PAGE_SIZE;
- -              bv[nr_pages].bv_offset = 0;
+ +      while (idx--) {
+ +              bv[idx].bv_page = pages[idx];
+ +              bv[idx].bv_len = PAGE_SIZE;
+ +              bv[idx].bv_offset = 0;
         }
   
         bv[0].bv_offset += offset;
         bv[0].bv_len -= offset;
- -      if (diff)
- -              bv[bio->bi_vcnt - 1].bv_len -= diff;
+ +      bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size;
   
         iov_iter_advance(iter, size);
         return 0;
   }
+ +
+ +/**
+ + * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
+ + * @bio: bio to add pages to
+ + * @iter: iov iterator describing the region to be mapped
+ + *
+ + * Pins pages from *iter and appends them to @bio's bvec array. The
+ + * pages will have to be released using put_page() when done.
+ + * The function tries, but does not guarantee, to pin as many pages as
+ + * fit into the bio, or are requested in *iter, whatever is smaller.
+ + * If MM encounters an error pinning the requested pages, it stops.
+ + * Error is returned only if 0 pages could be pinned.
+ + */
+ +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+ +{
+ +      unsigned short orig_vcnt = bio->bi_vcnt;
+ +
+ +      do {
+ +              int ret = __bio_iov_iter_get_pages(bio, iter);
+ +
+ +              if (unlikely(ret))
+ +                      return bio->bi_vcnt > orig_vcnt ? 0 : ret;
+ +
+ +      } while (iov_iter_count(iter) && !bio_full(bio));
+ +
+ +      return 0;
+ +}
   EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
   
   static void submit_bio_wait_endio(struct bio *bio)
@@@ -1661,10 -1559,8 +1586,8 @@@ void bio_set_pages_dirty(struct bio *bi
         int i;
   
         bio_for_each_segment_all(bvec, bio, i) {
-               struct page *page = bvec->bv_page;
- 
-               if (page && !PageCompound(page))
-                       set_page_dirty_lock(page);
+               if (!PageCompound(bvec->bv_page))
+                       set_page_dirty_lock(bvec->bv_page);
         }
   }
   EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
@@@ -1674,19 -1570,15 +1597,15 @@@ static void bio_release_pages(struct bi
         struct bio_vec *bvec;
         int i;
   
-       bio_for_each_segment_all(bvec, bio, i) {
-               struct page *page = bvec->bv_page;
- 
-               if (page)
-                       put_page(page);
-       }
+       bio_for_each_segment_all(bvec, bio, i)
+               put_page(bvec->bv_page);
   }
   
   /*
    * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
    * If they are, then fine.  If, however, some pages are clean then they must
    * have been written out during the direct-IO read.  So we take another ref on
-  * the BIO and the offending pages and re-dirty the pages in process context.
+  * the BIO and re-dirty the pages in process context.
    *
    * It is expected that bio_check_pages_dirty() will wholly own the BIO from
    * here on.  It will run one put_page() against each page and will run one
@@@ -1704,78 -1596,70 +1623,70 @@@ static struct bio *bio_dirty_list
    */
   static void bio_dirty_fn(struct work_struct *work)
   {
-       unsigned long flags;
-       struct bio *bio;
+       struct bio *bio, *next;
   
-       spin_lock_irqsave(&bio_dirty_lock, flags);
-       bio = bio_dirty_list;
+       spin_lock_irq(&bio_dirty_lock);
+       next = bio_dirty_list;
         bio_dirty_list = NULL;
-       spin_unlock_irqrestore(&bio_dirty_lock, flags);
+       spin_unlock_irq(&bio_dirty_lock);
   
-       while (bio) {
-               struct bio *next = bio->bi_private;
+       while ((bio = next) != NULL) {
+               next = bio->bi_private;
   
                 bio_set_pages_dirty(bio);
                 bio_release_pages(bio);
                 bio_put(bio);
-               bio = next;
         }
   }
   
   void bio_check_pages_dirty(struct bio *bio)
   {
         struct bio_vec *bvec;
-       int nr_clean_pages = 0;
+       unsigned long flags;
         int i;
   
         bio_for_each_segment_all(bvec, bio, i) {
-               struct page *page = bvec->bv_page;
- 
-               if (PageDirty(page) || PageCompound(page)) {
-                       put_page(page);
-                       bvec->bv_page = NULL;
-               } else {
-                       nr_clean_pages++;
-               }
+               if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
+                       goto defer;
         }
   
-       if (nr_clean_pages) {
-               unsigned long flags;
- 
-               spin_lock_irqsave(&bio_dirty_lock, flags);
-               bio->bi_private = bio_dirty_list;
-               bio_dirty_list = bio;
-               spin_unlock_irqrestore(&bio_dirty_lock, flags);
-               schedule_work(&bio_dirty_work);
-       } else {
-               bio_put(bio);
-       }
+       bio_release_pages(bio);
+       bio_put(bio);
+       return;
+ defer:
+       spin_lock_irqsave(&bio_dirty_lock, flags);
+       bio->bi_private = bio_dirty_list;
+       bio_dirty_list = bio;
+       spin_unlock_irqrestore(&bio_dirty_lock, flags);
+       schedule_work(&bio_dirty_work);
   }
   EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
   
- void generic_start_io_acct(struct request_queue *q, int rw,
+ void generic_start_io_acct(struct request_queue *q, int op,
                            unsigned long sectors, struct hd_struct *part)
   {
+       const int sgrp = op_stat_group(op);
         int cpu = part_stat_lock();
   
         part_round_stats(q, cpu, part);
-       part_stat_inc(cpu, part, ios[rw]);
-       part_stat_add(cpu, part, sectors[rw], sectors);
-       part_inc_in_flight(q, part, rw);
+       part_stat_inc(cpu, part, ios[sgrp]);
+       part_stat_add(cpu, part, sectors[sgrp], sectors);
+       part_inc_in_flight(q, part, op_is_write(op));
   
         part_stat_unlock();
   }
   EXPORT_SYMBOL(generic_start_io_acct);
   
- void generic_end_io_acct(struct request_queue *q, int rw,
+ void generic_end_io_acct(struct request_queue *q, int req_op,
                          struct hd_struct *part, unsigned long start_time)
   {
         unsigned long duration = jiffies - start_time;
+       const int sgrp = op_stat_group(req_op);
         int cpu = part_stat_lock();
   
-       part_stat_add(cpu, part, ticks[rw], duration);
+       part_stat_add(cpu, part, ticks[sgrp], duration);
         part_round_stats(q, cpu, part);
-       part_dec_in_flight(q, part, rw);
+       part_dec_in_flight(q, part, op_is_write(req_op));
   
         part_stat_unlock();
   }
@@@ -1834,6 -1718,9 +1745,9 @@@ again
         if (!bio_integrity_endio(bio))
                 return;
   
+       if (bio->bi_disk)
+               rq_qos_done_bio(bio->bi_disk->queue, bio);
+ 
         /*
          * Need to have a real endio function for chained bios, otherwise
          * various corner cases will break (like stacking block devices that
@@@ -1893,7 -1780,6 +1807,7 @@@ struct bio *bio_split(struct bio *bio, 
                 bio_integrity_trim(split);
   
         bio_advance(bio, split->bi_iter.bi_size);
+ +      bio->bi_iter.bi_done = 0;
   
         if (bio_flagged(bio, BIO_TRACE_COMPLETION))
                 bio_set_flag(split, BIO_TRACE_COMPLETION);
@@@ -2042,6 -1928,30 +1956,30 @@@ EXPORT_SYMBOL(bioset_init_from_src)
   
   #ifdef CONFIG_BLK_CGROUP
   
+ #ifdef CONFIG_MEMCG
+ /**
+  * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
+  * @bio: target bio
+  * @page: the page to lookup the blkcg from
+  *
+  * Associate @bio with the blkcg from @page's owning memcg.  This works like
+  * every other associate function wrt references.
+  */
+ int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
+ {
+       struct cgroup_subsys_state *blkcg_css;
+ 
+       if (unlikely(bio->bi_css))
+               return -EBUSY;
+       if (!page->mem_cgroup)
+               return 0;
+       blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
+                                    &io_cgrp_subsys);
+       bio->bi_css = blkcg_css;
+       return 0;
+ }
+ #endif /* CONFIG_MEMCG */
+ 
   /**
    * bio_associate_blkcg - associate a bio with the specified blkcg
    * @bio: target bio
@@@ -2064,6 -1974,24 +2002,24 @@@ int bio_associate_blkcg(struct bio *bio
   }
   EXPORT_SYMBOL_GPL(bio_associate_blkcg);
   
+ /**
+  * bio_associate_blkg - associate a bio with the specified blkg
+  * @bio: target bio
+  * @blkg: the blkg to associate
+  *
+  * Associate @bio with the blkg specified by @blkg.  This is the queue specific
+  * blkcg information associated with the @bio, a reference will be taken on the
+  * @blkg and will be freed when the bio is freed.
+  */
+ int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+ {
+       if (unlikely(bio->bi_blkg))
+               return -EBUSY;
+       blkg_get(blkg);
+       bio->bi_blkg = blkg;
+       return 0;
+ }
+ 
   /**
    * bio_disassociate_task - undo bio_associate_current()
    * @bio: target bio
@@@ -2078,6 -2006,10 +2034,10 @@@ void bio_disassociate_task(struct bio *
                 css_put(bio->bi_css);
                 bio->bi_css = NULL;
         }
+       if (bio->bi_blkg) {
+               blkg_put(bio->bi_blkg);
+               bio->bi_blkg = NULL;
+       }
   }
   
   /**
diff --combined block/blk-core.c

index ee33590,49af34b..1255034
--- 1/block/blk-core.c
--- 2/block/blk-core.c
+++ b/block/blk-core.c
@@@ -42,7 -42,7 +42,7 @@@
   #include "blk.h"
   #include "blk-mq.h"
   #include "blk-mq-sched.h"
- #include "blk-wbt.h"
+ #include "blk-rq-qos.h"
   
   #ifdef CONFIG_DEBUG_FS
   struct dentry *blk_debugfs_root;
@@@ -715,6 -715,35 +715,35 @@@ void blk_set_queue_dying(struct request
   }
   EXPORT_SYMBOL_GPL(blk_set_queue_dying);
   
+ /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
+ void blk_exit_queue(struct request_queue *q)
+ {
+       /*
+        * Since the I/O scheduler exit code may access cgroup information,
+        * perform I/O scheduler exit before disassociating from the block
+        * cgroup controller.
+        */
+       if (q->elevator) {
+               ioc_clear_queue(q);
+               elevator_exit(q, q->elevator);
+               q->elevator = NULL;
+       }
+ 
+       /*
+        * Remove all references to @q from the block cgroup controller before
+        * restoring @q->queue_lock to avoid that restoring this pointer causes
+        * e.g. blkcg_print_blkgs() to crash.
+        */
+       blkcg_exit_queue(q);
+ 
+       /*
+        * Since the cgroup code may dereference the @q->backing_dev_info
+        * pointer, only decrease its reference count after having removed the
+        * association with the block cgroup controller.
+        */
+       bdi_put(q->backing_dev_info);
+ }
+ 
   /**
    * blk_cleanup_queue - shutdown a request queue
    * @q: request queue to shutdown
@@@ -762,9 -791,13 +791,13 @@@ void blk_cleanup_queue(struct request_q
          * make sure all in-progress dispatch are completed because
          * blk_freeze_queue() can only complete all requests, and
          * dispatch may still be in-progress since we dispatch requests
-        * from more than one contexts
+        * from more than one contexts.
+        *
+        * No need to quiesce queue if it isn't initialized yet since
+        * blk_freeze_queue() should be enough for cases of passthrough
+        * request.
          */
-       if (q->mq_ops)
+       if (q->mq_ops && blk_queue_init_done(q))
                 blk_mq_quiesce_queue(q);
   
         /* for synchronous bio-based driver finish in-flight integrity i/o */
@@@ -780,30 -813,7 +813,7 @@@
          */
         WARN_ON_ONCE(q->kobj.state_in_sysfs);
   
-       /*
-        * Since the I/O scheduler exit code may access cgroup information,
-        * perform I/O scheduler exit before disassociating from the block
-        * cgroup controller.
-        */
-       if (q->elevator) {
-               ioc_clear_queue(q);
-               elevator_exit(q, q->elevator);
-               q->elevator = NULL;
-       }
- 
-       /*
-        * Remove all references to @q from the block cgroup controller before
-        * restoring @q->queue_lock to avoid that restoring this pointer causes
-        * e.g. blkcg_print_blkgs() to crash.
-        */
-       blkcg_exit_queue(q);
- 
-       /*
-        * Since the cgroup code may dereference the @q->backing_dev_info
-        * pointer, only decrease its reference count after having removed the
-        * association with the block cgroup controller.
-        */
-       bdi_put(q->backing_dev_info);
+       blk_exit_queue(q);
   
         if (q->mq_ops)
                 blk_mq_free_queue(q);
@@@ -1180,6 -1190,7 +1190,7 @@@ out_exit_flush_rq
                 q->exit_rq_fn(q, q->fq->flush_rq);
   out_free_flush_queue:
         blk_free_flush_queue(q->fq);
+       q->fq = NULL;
         return -ENOMEM;
   }
   EXPORT_SYMBOL(blk_init_allocated_queue);
@@@ -1641,7 -1652,7 +1652,7 @@@ void blk_requeue_request(struct request
         blk_delete_timer(rq);
         blk_clear_rq_complete(rq);
         trace_block_rq_requeue(q, rq);
-       wbt_requeue(q->rq_wb, rq);
+       rq_qos_requeue(q, rq);
   
         if (rq->rq_flags & RQF_QUEUED)
                 blk_queue_end_tag(q, rq);
@@@ -1748,7 -1759,7 +1759,7 @@@ void __blk_put_request(struct request_q
         /* this is a bio leak */
         WARN_ON(req->bio != NULL);
   
-       wbt_done(q->rq_wb, req);
+       rq_qos_done(q, req);
   
         /*
          * Request may not have originated from ll_rw_blk. if not,
@@@ -1982,7 -1993,6 +1993,6 @@@ static blk_qc_t blk_queue_bio(struct re
         int where = ELEVATOR_INSERT_SORT;
         struct request *req, *free;
         unsigned int request_count = 0;
-       unsigned int wb_acct;
   
         /*
          * low level driver can indicate that it wants pages above a
@@@ -2040,7 -2050,7 +2050,7 @@@
         }
   
   get_rq:
-       wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
+       rq_qos_throttle(q, bio, q->queue_lock);
   
         /*
          * Grab a free request. This is might sleep but can not fail.
@@@ -2050,7 -2060,7 +2060,7 @@@
         req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
         if (IS_ERR(req)) {
                 blk_queue_exit(q);
-               __wbt_done(q->rq_wb, wb_acct);
+               rq_qos_cleanup(q, bio);
                 if (PTR_ERR(req) == -ENOMEM)
                         bio->bi_status = BLK_STS_RESOURCE;
                 else
@@@ -2059,7 -2069,7 +2069,7 @@@
                 goto out_unlock;
         }
   
-       wbt_track(req, wb_acct);
+       rq_qos_track(q, req, bio);
   
         /*
          * After dropping the lock and possibly sleeping here, our request
@@@ -2155,12 -2165,11 +2165,12 @@@ static inline bool bio_check_ro(struct 
         if (part->policy && op_is_write(bio_op(bio))) {
                 char b[BDEVNAME_SIZE];
   
- -              printk(KERN_ERR
+ +              WARN_ONCE(1,
                        "generic_make_request: Trying to write "
                         "to read-only block-device %s (partno %d)\n",
                         bio_devname(bio, b), part->partno);
- -              return true;
+ +              /* Older lvm-tools actually trigger this */
+ +              return false;
         }
   
         return false;
@@@ -2700,13 -2709,13 +2710,13 @@@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes)
   void blk_account_io_completion(struct request *req, unsigned int bytes)
   {
         if (blk_do_io_stat(req)) {
-               const int rw = rq_data_dir(req);
+               const int sgrp = op_stat_group(req_op(req));
                 struct hd_struct *part;
                 int cpu;
   
                 cpu = part_stat_lock();
                 part = req->part;
-               part_stat_add(cpu, part, sectors[rw], bytes >> 9);
+               part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
                 part_stat_unlock();
         }
   }
@@@ -2720,7 -2729,7 +2730,7 @@@ void blk_account_io_done(struct reques
          */
         if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
                 unsigned long duration;
-               const int rw = rq_data_dir(req);
+               const int sgrp = op_stat_group(req_op(req));
                 struct hd_struct *part;
                 int cpu;
   
@@@ -2728,10 -2737,10 +2738,10 @@@
                 cpu = part_stat_lock();
                 part = req->part;
   
-               part_stat_inc(cpu, part, ios[rw]);
-               part_stat_add(cpu, part, ticks[rw], duration);
+               part_stat_inc(cpu, part, ios[sgrp]);
+               part_stat_add(cpu, part, ticks[sgrp], duration);
                 part_round_stats(req->q, cpu, part);
-               part_dec_in_flight(req->q, part, rw);
+               part_dec_in_flight(req->q, part, rq_data_dir(req));
   
                 hd_struct_put(part);
                 part_stat_unlock();
@@@ -2751,9 -2760,9 +2761,9 @@@ static bool blk_pm_allow_request(struc
                 return rq->rq_flags & RQF_PM;
         case RPM_SUSPENDED:
                 return false;
+       default:
+               return true;
         }
- 
-       return true;
   }
   #else
   static bool blk_pm_allow_request(struct request *rq)
@@@ -2980,7 -2989,7 +2990,7 @@@ void blk_start_request(struct request *
                 req->throtl_size = blk_rq_sectors(req);
   #endif
                 req->rq_flags |= RQF_STATS;
-               wbt_issue(req->q->rq_wb, req);
+               rq_qos_issue(req->q, req);
         }
   
         BUG_ON(blk_rq_is_complete(req));
@@@ -3053,6 -3062,10 +3063,10 @@@ EXPORT_SYMBOL_GPL(blk_steal_bios)
    *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
    *     %false return from this function.
    *
+  * Note:
+  *    The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
+  *    blk_rq_bytes() and in blk_update_request().
+  *
    * Return:
    *     %false - this request doesn't have any more data
    *     %true  - this request has more data
@@@ -3200,7 -3213,7 +3214,7 @@@ void blk_finish_request(struct request 
         blk_account_io_done(req, now);
   
         if (req->end_io) {
-               wbt_done(req->q->rq_wb, req);
+               rq_qos_done(q, req);
                 req->end_io(req, error);
         } else {
                 if (blk_bidi_rq(req))
@@@ -3763,9 -3776,11 +3777,11 @@@ EXPORT_SYMBOL(blk_finish_plug)
    */
   void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
   {
-       /* not support for RQF_PM and ->rpm_status in blk-mq yet */
-       if (q->mq_ops)
+       /* Don't enable runtime PM for blk-mq until it is ready */
+       if (q->mq_ops) {
+               pm_runtime_disable(dev);
                 return;
+       }
   
         q->dev = dev;
         q->rpm_status = RPM_ACTIVE;
diff --combined block/blk-mq-tag.c

index 3de0836,c0c4e63..816923b
--- 1/block/blk-mq-tag.c
--- 2/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@@ -23,6 -23,9 +23,9 @@@ bool blk_mq_has_free_tags(struct blk_mq
   
   /*
    * If a previously inactive queue goes active, bump the active user count.
+  * We need to do this before try to allocate driver tag, then even if fail
+  * to get tag when first time, the other shared-tag users could reserve
+  * budget for it.
    */
   bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
   {
@@@ -271,7 -274,7 +274,7 @@@ static bool bt_tags_iter(struct sbitma
          * test and set the bit before assining ->rqs[].
          */
         rq = tags->rqs[bitnr];
- -      if (rq && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
+ +      if (rq && blk_mq_request_started(rq))
                 iter_data->fn(rq, iter_data->data, reserved);
   
         return true;
@@@ -399,8 -402,6 +402,6 @@@ int blk_mq_tag_update_depth(struct blk_
         if (tdepth <= tags->nr_reserved_tags)
                 return -EINVAL;
   
-       tdepth -= tags->nr_reserved_tags;
- 
         /*
          * If we are allowed to grow beyond the original size, allocate
          * a new set of tags before freeing the old one.
@@@ -420,7 -421,8 +421,8 @@@
                 if (tdepth > 16 * BLKDEV_MAX_RQ)
                         return -EINVAL;
   
-               new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0);
+               new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
+                               tags->nr_reserved_tags);
                 if (!new)
                         return -ENOMEM;
                 ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
@@@ -437,7 -439,8 +439,8 @@@
                  * Don't need (or can't) update reserved tags here, they
                  * remain static and should never need resizing.
                  */
-               sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+               sbitmap_queue_resize(&tags->bitmap_tags,
+                               tdepth - tags->nr_reserved_tags);
         }
   
         return 0;
diff --combined block/blk-mq.c

index 654b0dc,5efd789..72a0033
--- 1/block/blk-mq.c
--- 2/block/blk-mq.c
+++ b/block/blk-mq.c
@@@ -34,8 -34,8 +34,8 @@@
   #include "blk-mq-debugfs.h"
   #include "blk-mq-tag.h"
   #include "blk-stat.h"
- #include "blk-wbt.h"
   #include "blk-mq-sched.h"
+ #include "blk-rq-qos.h"
   
   static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
   static void blk_mq_poll_stats_start(struct request_queue *q);
@@@ -285,7 -285,7 +285,7 @@@ static struct request *blk_mq_rq_ctx_in
                 rq->tag = -1;
                 rq->internal_tag = tag;
         } else {
-               if (blk_mq_tag_busy(data->hctx)) {
+               if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
                         rq_flags = RQF_MQ_INFLIGHT;
                         atomic_inc(&data->hctx->nr_active);
                 }
@@@ -367,6 -367,8 +367,8 @@@ static struct request *blk_mq_get_reque
                 if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
                     !(data->flags & BLK_MQ_REQ_RESERVED))
                         e->type->ops.mq.limit_depth(op, data);
+       } else {
+               blk_mq_tag_busy(data->hctx);
         }
   
         tag = blk_mq_get_tag(data);
@@@ -504,7 -506,7 +506,7 @@@ void blk_mq_free_request(struct reques
         if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
                 laptop_io_completion(q->backing_dev_info);
   
-       wbt_done(q->rq_wb, rq);
+       rq_qos_done(q, rq);
   
         if (blk_rq_rl(rq))
                 blk_put_rl(blk_rq_rl(rq));
@@@ -527,7 -529,7 +529,7 @@@ inline void __blk_mq_end_request(struc
         blk_account_io_done(rq, now);
   
         if (rq->end_io) {
-               wbt_done(rq->q->rq_wb, rq);
+               rq_qos_done(rq->q, rq);
                 rq->end_io(rq, error);
         } else {
                 if (unlikely(blk_bidi_rq(rq)))
@@@ -558,8 -560,10 +560,8 @@@ static void __blk_mq_complete_request(s
         bool shared = false;
         int cpu;
   
- -      if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) !=
- -                      MQ_RQ_IN_FLIGHT)
+ +      if (!blk_mq_mark_complete(rq))
                 return;
- -
         if (rq->internal_tag != -1)
                 blk_mq_sched_completed_request(rq);
   
@@@ -639,7 -643,7 +641,7 @@@ void blk_mq_start_request(struct reques
                 rq->throtl_size = blk_rq_sectors(rq);
   #endif
                 rq->rq_flags |= RQF_STATS;
-               wbt_issue(q->rq_wb, rq);
+               rq_qos_issue(q, rq);
         }
   
         WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
@@@ -665,7 -669,7 +667,7 @@@ static void __blk_mq_requeue_request(st
         blk_mq_put_driver_tag(rq);
   
         trace_block_rq_requeue(q, rq);
-       wbt_requeue(q->rq_wb, rq);
+       rq_qos_requeue(q, rq);
   
         if (blk_mq_request_started(rq)) {
                 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
@@@ -962,16 -966,14 +964,14 @@@ static inline unsigned int queued_to_in
         return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
   }
   
- bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
-                          bool wait)
+ bool blk_mq_get_driver_tag(struct request *rq)
   {
         struct blk_mq_alloc_data data = {
                 .q = rq->q,
                 .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
-               .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+               .flags = BLK_MQ_REQ_NOWAIT,
         };
- 
-       might_sleep_if(wait);
+       bool shared;
   
         if (rq->tag != -1)
                 goto done;
@@@ -979,9 -981,10 +979,10 @@@
         if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
                 data.flags |= BLK_MQ_REQ_RESERVED;
   
+       shared = blk_mq_tag_busy(data.hctx);
         rq->tag = blk_mq_get_tag(&data);
         if (rq->tag >= 0) {
-               if (blk_mq_tag_busy(data.hctx)) {
+               if (shared) {
                         rq->rq_flags |= RQF_MQ_INFLIGHT;
                         atomic_inc(&data.hctx->nr_active);
                 }
@@@ -989,8 -992,6 +990,6 @@@
         }
   
   done:
-       if (hctx)
-               *hctx = data.hctx;
         return rq->tag != -1;
   }
   
@@@ -1001,7 -1002,10 +1000,10 @@@ static int blk_mq_dispatch_wake(wait_qu
   
         hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
   
+       spin_lock(&hctx->dispatch_wait_lock);
         list_del_init(&wait->entry);
+       spin_unlock(&hctx->dispatch_wait_lock);
+ 
         blk_mq_run_hw_queue(hctx, true);
         return 1;
   }
@@@ -1012,17 -1016,16 +1014,16 @@@
    * restart. For both cases, take care to check the condition again after
    * marking us as waiting.
    */
- static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
+ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
                                  struct request *rq)
   {
-       struct blk_mq_hw_ctx *this_hctx = *hctx;
-       struct sbq_wait_state *ws;
+       struct wait_queue_head *wq;
         wait_queue_entry_t *wait;
         bool ret;
   
-       if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
-               if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
-                       set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
+       if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
+               if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+                       set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
   
                 /*
                  * It's possible that a tag was freed in the window between the
@@@ -1032,30 -1035,35 +1033,35 @@@
                  * Don't clear RESTART here, someone else could have set it.
                  * At most this will cost an extra queue run.
                  */
-               return blk_mq_get_driver_tag(rq, hctx, false);
+               return blk_mq_get_driver_tag(rq);
         }
   
-       wait = &this_hctx->dispatch_wait;
+       wait = &hctx->dispatch_wait;
         if (!list_empty_careful(&wait->entry))
                 return false;
   
-       spin_lock(&this_hctx->lock);
+       wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
+ 
+       spin_lock_irq(&wq->lock);
+       spin_lock(&hctx->dispatch_wait_lock);
         if (!list_empty(&wait->entry)) {
-               spin_unlock(&this_hctx->lock);
+               spin_unlock(&hctx->dispatch_wait_lock);
+               spin_unlock_irq(&wq->lock);
                 return false;
         }
   
-       ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
-       add_wait_queue(&ws->wait, wait);
+       wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+       __add_wait_queue(wq, wait);
   
         /*
          * It's possible that a tag was freed in the window between the
          * allocation failure and adding the hardware queue to the wait
          * queue.
          */
-       ret = blk_mq_get_driver_tag(rq, hctx, false);
+       ret = blk_mq_get_driver_tag(rq);
         if (!ret) {
-               spin_unlock(&this_hctx->lock);
+               spin_unlock(&hctx->dispatch_wait_lock);
+               spin_unlock_irq(&wq->lock);
                 return false;
         }
   
@@@ -1063,14 -1071,42 +1069,42 @@@
          * We got a tag, remove ourselves from the wait queue to ensure
          * someone else gets the wakeup.
          */
-       spin_lock_irq(&ws->wait.lock);
         list_del_init(&wait->entry);
-       spin_unlock_irq(&ws->wait.lock);
-       spin_unlock(&this_hctx->lock);
+       spin_unlock(&hctx->dispatch_wait_lock);
+       spin_unlock_irq(&wq->lock);
   
         return true;
   }
   
+ #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
+ #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
+ /*
+  * Update dispatch busy with an Exponentially Weighted Moving Average (EWMA):
+  * - EWMA is a simple way to compute a running average
+  * - weights of 7/8 and 1/8 are applied so that it decays exponentially
+  * - a factor of 4 (shift) keeps the result from becoming too small (0);
+  *   the exact factor doesn't matter because EWMA decays exponentially
+  */
+ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
+ {
+       unsigned int ewma;
+ 
+       if (hctx->queue->elevator)
+               return;
+ 
+       ewma = hctx->dispatch_busy;
+ 
+       if (!ewma && !busy)
+               return;
+ 
+       ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
+       if (busy)
+               ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
+       ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
+ 
+       hctx->dispatch_busy = ewma;
+ }
+ 
   #define BLK_MQ_RESOURCE_DELAY 3               /* ms units */
   
   /*
@@@ -1103,7 -1139,7 +1137,7 @@@ bool blk_mq_dispatch_rq_list(struct req
                 if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
                         break;
   
-               if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+               if (!blk_mq_get_driver_tag(rq)) {
                         /*
                          * The initial allocation attempt failed, so we need to
                          * rerun the hardware queue when a tag is freed. The
@@@ -1111,7 -1147,7 +1145,7 @@@
                          * before we add this entry back on the dispatch list,
                          * we'll re-run it below.
                          */
-                       if (!blk_mq_mark_tag_wait(&hctx, rq)) {
+                       if (!blk_mq_mark_tag_wait(hctx, rq)) {
                                 blk_mq_put_dispatch_budget(hctx);
                                 /*
                                  * For non-shared tags, the RESTART check
@@@ -1135,7 -1171,7 +1169,7 @@@
                         bd.last = true;
                 else {
                         nxt = list_first_entry(list, struct request, queuelist);
-                       bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
+                       bd.last = !blk_mq_get_driver_tag(nxt);
                 }
   
                 ret = q->mq_ops->queue_rq(hctx, &bd);
@@@ -1207,8 -1243,10 +1241,10 @@@
                 else if (needs_restart && (ret == BLK_STS_RESOURCE))
                         blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
   
+               blk_mq_update_dispatch_busy(hctx, true);
                 return false;
-       }
+       } else
+               blk_mq_update_dispatch_busy(hctx, false);
   
         /*
          * If the host/device is unable to accept more work, inform the
@@@ -1542,19 -1580,19 +1578,19 @@@ void blk_mq_insert_requests(struct blk_
                             struct list_head *list)
   
   {
+       struct request *rq;
+ 
         /*
          * preemption doesn't flush plug list, so it's possible ctx->cpu is
          * offline now
          */
-       spin_lock(&ctx->lock);
-       while (!list_empty(list)) {
-               struct request *rq;
- 
-               rq = list_first_entry(list, struct request, queuelist);
+       list_for_each_entry(rq, list, queuelist) {
                 BUG_ON(rq->mq_ctx != ctx);
-               list_del_init(&rq->queuelist);
-               __blk_mq_insert_req_list(hctx, rq, false);
+               trace_block_rq_insert(hctx->queue, rq);
         }
+ 
+       spin_lock(&ctx->lock);
+       list_splice_tail_init(list, &ctx->rq_list);
         blk_mq_hctx_mark_pending(hctx, ctx);
         spin_unlock(&ctx->lock);
   }
@@@ -1657,13 -1695,16 +1693,16 @@@ static blk_status_t __blk_mq_issue_dire
         ret = q->mq_ops->queue_rq(hctx, &bd);
         switch (ret) {
         case BLK_STS_OK:
+               blk_mq_update_dispatch_busy(hctx, false);
                 *cookie = new_cookie;
                 break;
         case BLK_STS_RESOURCE:
         case BLK_STS_DEV_RESOURCE:
+               blk_mq_update_dispatch_busy(hctx, true);
                 __blk_mq_requeue_request(rq);
                 break;
         default:
+               blk_mq_update_dispatch_busy(hctx, false);
                 *cookie = BLK_QC_T_NONE;
                 break;
         }
@@@ -1698,7 -1739,7 +1737,7 @@@ static blk_status_t __blk_mq_try_issue_
         if (!blk_mq_get_dispatch_budget(hctx))
                 goto insert;
   
-       if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+       if (!blk_mq_get_driver_tag(rq)) {
                 blk_mq_put_dispatch_budget(hctx);
                 goto insert;
         }
@@@ -1746,6 -1787,27 +1785,27 @@@ blk_status_t blk_mq_request_issue_direc
         return ret;
   }
   
+ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+               struct list_head *list)
+ {
+       while (!list_empty(list)) {
+               blk_status_t ret;
+               struct request *rq = list_first_entry(list, struct request,
+                               queuelist);
+ 
+               list_del_init(&rq->queuelist);
+               ret = blk_mq_request_issue_directly(rq);
+               if (ret != BLK_STS_OK) {
+                       if (ret == BLK_STS_RESOURCE ||
+                                       ret == BLK_STS_DEV_RESOURCE) {
+                               list_add(&rq->queuelist, list);
+                               break;
+                       }
+                       blk_mq_end_request(rq, ret);
+               }
+       }
+ }
+ 
   static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
   {
         const int is_sync = op_is_sync(bio->bi_opf);
@@@ -1756,7 -1818,6 +1816,6 @@@
         struct blk_plug *plug;
         struct request *same_queue_rq = NULL;
         blk_qc_t cookie;
-       unsigned int wb_acct;
   
         blk_queue_bounce(q, &bio);
   
@@@ -1772,19 -1833,19 +1831,19 @@@
         if (blk_mq_sched_bio_merge(q, bio))
                 return BLK_QC_T_NONE;
   
-       wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+       rq_qos_throttle(q, bio, NULL);
   
         trace_block_getrq(q, bio, bio->bi_opf);
   
         rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
         if (unlikely(!rq)) {
-               __wbt_done(q->rq_wb, wb_acct);
+               rq_qos_cleanup(q, bio);
                 if (bio->bi_opf & REQ_NOWAIT)
                         bio_wouldblock_error(bio);
                 return BLK_QC_T_NONE;
         }
   
-       wbt_track(rq, wb_acct);
+       rq_qos_track(q, rq, bio);
   
         cookie = request_to_qc_t(data.hctx, rq);
   
@@@ -1847,7 -1908,8 +1906,8 @@@
                         blk_mq_try_issue_directly(data.hctx, same_queue_rq,
                                         &cookie);
                 }
-       } else if (q->nr_hw_queues > 1 && is_sync) {
+       } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
+                       !data.hctx->dispatch_busy)) {
                 blk_mq_put_ctx(data.ctx);
                 blk_mq_bio_to_request(rq, bio);
                 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
@@@ -2146,6 -2208,7 +2206,7 @@@ static int blk_mq_init_hctx(struct requ
   
         hctx->nr_ctx = 0;
   
+       spin_lock_init(&hctx->dispatch_wait_lock);
         init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
         INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
   
@@@ -2331,15 -2394,10 +2392,10 @@@ static void queue_set_hctx_shared(struc
         int i;
   
         queue_for_each_hw_ctx(q, hctx, i) {
-               if (shared) {
-                       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-                               atomic_inc(&q->shared_hctx_restart);
+               if (shared)
                         hctx->flags |= BLK_MQ_F_TAG_SHARED;
-               } else {
-                       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-                               atomic_dec(&q->shared_hctx_restart);
+               else
                         hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
-               }
         }
   }
   
@@@ -2370,7 -2428,6 +2426,6 @@@ static void blk_mq_del_queue_tag_set(st
                 blk_mq_update_tag_set_depth(set, false);
         }
         mutex_unlock(&set->tag_list_lock);
-       synchronize_rcu();
         INIT_LIST_HEAD(&q->tag_set_list);
   }
   
@@@ -2685,7 -2742,6 +2740,6 @@@ static int blk_mq_alloc_rq_maps(struct 
   static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
   {
         if (set->ops->map_queues) {
-               int cpu;
                 /*
                  * transport .map_queues is usually done in the following
                  * way:
@@@ -2700,8 -2756,7 +2754,7 @@@
                  * killing stale mapping since one CPU may not be mapped
                  * to any hw queue.
                  */
-               for_each_possible_cpu(cpu)
-                       set->mq_map[cpu] = 0;
+               blk_mq_clear_mq_map(set);
   
                 return set->ops->map_queues(set);
         } else
@@@ -2711,7 -2766,7 +2764,7 @@@
   /*
    * Alloc a tag set to be associated with one or more request queues.
    * May fail with EINVAL for various error conditions. May adjust the
-  * requested depth down, if if it too large. In that case, the set
+  * requested depth down, if it's too large. In that case, the set
    * value will be stored in set->queue_depth.
    */
   int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
diff --combined drivers/block/zram/zram_drv.c

index a390c6d,2907a81..c7acf74
--- 1/drivers/block/zram/zram_drv.c
--- 2/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@@ -298,8 -298,7 +298,8 @@@ static void reset_bdev(struct zram *zra
         zram->backing_dev = NULL;
         zram->old_block_size = 0;
         zram->bdev = NULL;
- -
+ +      zram->disk->queue->backing_dev_info->capabilities |=
+ +                              BDI_CAP_SYNCHRONOUS_IO;
         kvfree(zram->bitmap);
         zram->bitmap = NULL;
   }
@@@ -401,18 -400,6 +401,18 @@@ static ssize_t backing_dev_store(struc
         zram->backing_dev = backing_dev;
         zram->bitmap = bitmap;
         zram->nr_pages = nr_pages;
+ +      /*
+ +       * With the writeback feature, zram does asynchronous IO, so it is
+ +       * no longer a synchronous device; remove the synchronous IO flag.
+ +       * Otherwise, the upper layer (e.g., swap) could wait for IO
+ +       * completion rather than submit and return, which would make the
+ +       * system sluggish. Furthermore, when the IO function returns
+ +       * (e.g., swap_readpage), the upper layer expects the IO to be done
+ +       * and may free the page, while in fact the IO is still in flight,
+ +       * which could finally cause a use-after-free when it completes.
+ +       */
+ +      zram->disk->queue->backing_dev_info->capabilities &=
+ +                      ~BDI_CAP_SYNCHRONOUS_IO;
         up_write(&zram->init_lock);
   
         pr_info("setup backing device %s\n", file_name);
@@@ -1287,17 -1274,16 +1287,16 @@@ static void zram_bio_discard(struct zra
    * Returns 1 if IO request was successfully submitted.
    */
   static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
-                       int offset, bool is_write, struct bio *bio)
+                       int offset, unsigned int op, struct bio *bio)
   {
         unsigned long start_time = jiffies;
-       int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
         struct request_queue *q = zram->disk->queue;
         int ret;
   
-       generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
+       generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT,
                         &zram->disk->part0);
   
-       if (!is_write) {
+       if (!op_is_write(op)) {
                 atomic64_inc(&zram->stats.num_reads);
                 ret = zram_bvec_read(zram, bvec, index, offset, bio);
                 flush_dcache_page(bvec->bv_page);
@@@ -1306,14 -1292,14 +1305,14 @@@
                 ret = zram_bvec_write(zram, bvec, index, offset, bio);
         }
   
-       generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
+       generic_end_io_acct(q, op, &zram->disk->part0, start_time);
   
         zram_slot_lock(zram, index);
         zram_accessed(zram, index);
         zram_slot_unlock(zram, index);
   
         if (unlikely(ret < 0)) {
-               if (!is_write)
+               if (!op_is_write(op))
                         atomic64_inc(&zram->stats.failed_reads);
                 else
                         atomic64_inc(&zram->stats.failed_writes);
@@@ -1351,7 -1337,7 +1350,7 @@@ static void __zram_make_request(struct 
                         bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
                                                         unwritten);
                         if (zram_bvec_rw(zram, &bv, index, offset,
-                                       op_is_write(bio_op(bio)), bio) < 0)
+                                        bio_op(bio), bio) < 0)
                                 goto out;
   
                         bv.bv_offset += bv.bv_len;
@@@ -1403,7 -1389,7 +1402,7 @@@ static void zram_slot_free_notify(struc
   }
   
   static int zram_rw_page(struct block_device *bdev, sector_t sector,
-                      struct page *page, bool is_write)
+                      struct page *page, unsigned int op)
   {
         int offset, ret;
         u32 index;
@@@ -1427,7 -1413,7 +1426,7 @@@
         bv.bv_len = PAGE_SIZE;
         bv.bv_offset = 0;
   
-       ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
+       ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
   out:
         /*
          * If I/O fails, just return error(ie, non-zero) without
@@@ -1442,7 -1428,7 +1441,7 @@@
   
         switch (ret) {
         case 0:
-               page_endio(page, is_write, 0);
+               page_endio(page, op_is_write(op), 0);
                 break;
         case 1:
                 ret = 0;
diff --combined drivers/nvme/host/fabrics.c

index f7efe5a,3c6cd0f..206d63c
--- 1/drivers/nvme/host/fabrics.c
--- 2/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@@ -474,7 -474,7 +474,7 @@@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue
   
   bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
   {
-       if (ctrl->opts->max_reconnects != -1 &&
+       if (ctrl->opts->max_reconnects == -1 ||
             ctrl->nr_reconnects < ctrl->opts->max_reconnects)
                 return true;
   
@@@ -539,18 -539,14 +539,18 @@@ static struct nvmf_transport_ops *nvmf_
   /*
    * For something we're not in a state to send to the device the default action
    * is to busy it and retry it after the controller state is recovered.  However,
- - * anything marked for failfast or nvme multipath is immediately failed.
+ + * if the controller is deleting or if anything is marked for failfast or
+ + * nvme multipath it is immediately failed.
    *
    * Note: commands used to initialize the controller will be marked for failfast.
    * Note: nvme cli/ioctl commands are marked for failfast.
    */
- -blk_status_t nvmf_fail_nonready_command(struct request *rq)
+ +blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
+ +              struct request *rq)
   {
- -      if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
+ +      if (ctrl->state != NVME_CTRL_DELETING &&
+ +          ctrl->state != NVME_CTRL_DEAD &&
+ +          !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
                 return BLK_STS_RESOURCE;
         nvme_req(rq)->status = NVME_SC_ABORT_REQ;
         return BLK_STS_IOERR;
diff --combined drivers/nvme/host/fc.c

index 9bac912,9cc3375..611e70c
--- 1/drivers/nvme/host/fc.c
--- 2/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@@ -1737,6 -1737,7 +1737,7 @@@ nvme_fc_init_request(struct blk_mq_tag_
         int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
         struct nvme_fc_queue *queue = &ctrl->queues[queue_idx];
   
+       nvme_req(rq)->ctrl = &ctrl->ctrl;
         return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
   }
   
@@@ -2272,7 -2273,7 +2273,7 @@@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *
   
         if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE ||
             !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
- -              return nvmf_fail_nonready_command(rq);
+ +              return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
   
         ret = nvme_setup_cmd(ns, rq, sqe);
         if (ret)
diff --combined drivers/nvme/host/rdma.c

index 66ec598,13a6064..0805fa6
--- 1/drivers/nvme/host/rdma.c
--- 2/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@@ -40,13 -40,14 +40,14 @@@
   
   #define NVME_RDMA_MAX_SEGMENTS                256
   
- #define NVME_RDMA_MAX_INLINE_SEGMENTS 1
+ #define NVME_RDMA_MAX_INLINE_SEGMENTS 4
   
   struct nvme_rdma_device {
         struct ib_device        *dev;
         struct ib_pd            *pd;
         struct kref             ref;
         struct list_head        entry;
+       unsigned int            num_inline_segments;
   };
   
   struct nvme_rdma_qe {
@@@ -117,6 -118,7 +118,7 @@@ struct nvme_rdma_ctrl 
         struct sockaddr_storage src_addr;
   
         struct nvme_ctrl        ctrl;
+       bool                    use_inline_data;
   };
   
   static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
@@@ -249,7 -251,7 +251,7 @@@ static int nvme_rdma_create_qp(struct n
         /* +1 for drain */
         init_attr.cap.max_recv_wr = queue->queue_size + 1;
         init_attr.cap.max_recv_sge = 1;
-       init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
+       init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
         init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
         init_attr.qp_type = IB_QPT_RC;
         init_attr.send_cq = queue->ib_cq;
@@@ -286,6 -288,7 +288,7 @@@ static int nvme_rdma_init_request(struc
         struct ib_device *ibdev = dev->dev;
         int ret;
   
+       nvme_req(rq)->ctrl = &ctrl->ctrl;
         ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
                         DMA_TO_DEVICE);
         if (ret)
@@@ -374,6 -377,8 +377,8 @@@ nvme_rdma_find_get_device(struct rdma_c
                 goto out_free_pd;
         }
   
+       ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
+                                       ndev->dev->attrs.max_sge - 1);
         list_add(&ndev->entry, &device_list);
   out_unlock:
         mutex_unlock(&device_list_mutex);
@@@ -868,6 -873,31 +873,31 @@@ out_free_io_queues
         return ret;
   }
   
+ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
+               bool remove)
+ {
+       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_rdma_stop_queue(&ctrl->queues[0]);
+       blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
+                       &ctrl->ctrl);
+       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_rdma_destroy_admin_queue(ctrl, remove);
+ }
+ 
+ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
+               bool remove)
+ {
+       if (ctrl->ctrl.queue_count > 1) {
+               nvme_stop_queues(&ctrl->ctrl);
+               nvme_rdma_stop_io_queues(ctrl);
+               blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
+                               &ctrl->ctrl);
+               if (remove)
+                       nvme_start_queues(&ctrl->ctrl);
+               nvme_rdma_destroy_io_queues(ctrl, remove);
+       }
+ }
+ 
   static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
   {
         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@@ -912,21 -942,44 +942,44 @@@ static void nvme_rdma_reconnect_or_remo
         }
   }
   
- static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
   {
-       struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
-                       struct nvme_rdma_ctrl, reconnect_work);
+       int ret = -EINVAL;
         bool changed;
-       int ret;
   
-       ++ctrl->ctrl.nr_reconnects;
- 
-       ret = nvme_rdma_configure_admin_queue(ctrl, false);
+       ret = nvme_rdma_configure_admin_queue(ctrl, new);
         if (ret)
-               goto requeue;
+               return ret;
+ 
+       if (ctrl->ctrl.icdoff) {
+               dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
+               goto destroy_admin;
+       }
+ 
+       if (!(ctrl->ctrl.sgls & (1 << 2))) {
+               dev_err(ctrl->ctrl.device,
+                       "Mandatory keyed sgls are not supported!\n");
+               goto destroy_admin;
+       }
+ 
+       if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
+               dev_warn(ctrl->ctrl.device,
+                       "queue_size %zu > ctrl sqsize %u, clamping down\n",
+                       ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
+       }
+ 
+       if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
+               dev_warn(ctrl->ctrl.device,
+                       "sqsize %u > ctrl maxcmd %u, clamping down\n",
+                       ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
+               ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
+       }
+ 
+       if (ctrl->ctrl.sgls & (1 << 20))
+               ctrl->use_inline_data = true;
   
         if (ctrl->ctrl.queue_count > 1) {
-               ret = nvme_rdma_configure_io_queues(ctrl, false);
+               ret = nvme_rdma_configure_io_queues(ctrl, new);
                 if (ret)
                         goto destroy_admin;
         }
@@@ -935,10 -988,31 +988,31 @@@
         if (!changed) {
                 /* state change failure is ok if we're in DELETING state */
                 WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
-               return;
+               ret = -EINVAL;
+               goto destroy_io;
         }
   
         nvme_start_ctrl(&ctrl->ctrl);
+       return 0;
+ 
+ destroy_io:
+       if (ctrl->ctrl.queue_count > 1)
+               nvme_rdma_destroy_io_queues(ctrl, new);
+ destroy_admin:
+       nvme_rdma_stop_queue(&ctrl->queues[0]);
+       nvme_rdma_destroy_admin_queue(ctrl, new);
+       return ret;
+ }
+ 
+ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+ {
+       struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
+                       struct nvme_rdma_ctrl, reconnect_work);
+ 
+       ++ctrl->ctrl.nr_reconnects;
+ 
+       if (nvme_rdma_setup_ctrl(ctrl, false))
+               goto requeue;
   
         dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
                         ctrl->ctrl.nr_reconnects);
@@@ -947,9 -1021,6 +1021,6 @@@
   
         return;
   
- destroy_admin:
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       nvme_rdma_destroy_admin_queue(ctrl, false);
   requeue:
         dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
                         ctrl->ctrl.nr_reconnects);
@@@ -962,27 -1033,9 +1033,9 @@@ static void nvme_rdma_error_recovery_wo
                         struct nvme_rdma_ctrl, err_work);
   
         nvme_stop_keep_alive(&ctrl->ctrl);
- 
-       if (ctrl->ctrl.queue_count > 1) {
-               nvme_stop_queues(&ctrl->ctrl);
-               nvme_rdma_stop_io_queues(ctrl);
-               blk_mq_tagset_busy_iter(&ctrl->tag_set,
-                                       nvme_cancel_request, &ctrl->ctrl);
-               nvme_rdma_destroy_io_queues(ctrl, false);
-       }
- 
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
-                               nvme_cancel_request, &ctrl->ctrl);
-       nvme_rdma_destroy_admin_queue(ctrl, false);
- 
-       /*
-        * queues are not a live anymore, so restart the queues to fail fast
-        * new IO
-        */
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_rdma_teardown_io_queues(ctrl, false);
         nvme_start_queues(&ctrl->ctrl);
+       nvme_rdma_teardown_admin_queue(ctrl, false);
   
         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
                 /* state change failure is ok if we're in DELETING state */
@@@ -1090,19 -1143,27 +1143,27 @@@ static int nvme_rdma_set_sg_null(struc
   }
   
   static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
-               struct nvme_rdma_request *req, struct nvme_command *c)
+               struct nvme_rdma_request *req, struct nvme_command *c,
+               int count)
   {
         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+       struct scatterlist *sgl = req->sg_table.sgl;
+       struct ib_sge *sge = &req->sge[1];
+       u32 len = 0;
+       int i;
   
-       req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
-       req->sge[1].length = sg_dma_len(req->sg_table.sgl);
-       req->sge[1].lkey = queue->device->pd->local_dma_lkey;
+       for (i = 0; i < count; i++, sgl++, sge++) {
+               sge->addr = sg_dma_address(sgl);
+               sge->length = sg_dma_len(sgl);
+               sge->lkey = queue->device->pd->local_dma_lkey;
+               len += sge->length;
+       }
   
         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
-       sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
+       sg->length = cpu_to_le32(len);
         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
   
-       req->num_sge++;
+       req->num_sge += count;
         return 0;
   }
   
@@@ -1195,15 -1256,16 +1256,16 @@@ static int nvme_rdma_map_data(struct nv
                 goto out_free_table;
         }
   
-       if (count == 1) {
+       if (count <= dev->num_inline_segments) {
                 if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
+                   queue->ctrl->use_inline_data &&
                     blk_rq_payload_bytes(rq) <=
                                 nvme_rdma_inline_data_size(queue)) {
-                       ret = nvme_rdma_map_sg_inline(queue, req, c);
+                       ret = nvme_rdma_map_sg_inline(queue, req, c, count);
                         goto out;
                 }
   
-               if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
+               if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
                         ret = nvme_rdma_map_sg_single(queue, req, c);
                         goto out;
                 }
@@@ -1574,6 -1636,7 +1636,7 @@@ static int nvme_rdma_cm_handler(struct 
         case RDMA_CM_EVENT_CONNECT_ERROR:
         case RDMA_CM_EVENT_UNREACHABLE:
                 nvme_rdma_destroy_queue_ib(queue);
+               /* fall through */
         case RDMA_CM_EVENT_ADDR_ERROR:
                 dev_dbg(queue->ctrl->ctrl.device,
                         "CM error event %d\n", ev->event);
@@@ -1639,7 -1702,7 +1702,7 @@@ static blk_status_t nvme_rdma_queue_rq(
         WARN_ON_ONCE(rq->tag < 0);
   
         if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
- -              return nvmf_fail_nonready_command(rq);
+ +              return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
   
         dev = queue->device->dev;
         ib_dma_sync_single_for_cpu(dev, sqe->dma,
@@@ -1736,25 -1799,12 +1799,12 @@@ static const struct blk_mq_ops nvme_rdm
   
   static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
   {
-       if (ctrl->ctrl.queue_count > 1) {
-               nvme_stop_queues(&ctrl->ctrl);
-               nvme_rdma_stop_io_queues(ctrl);
-               blk_mq_tagset_busy_iter(&ctrl->tag_set,
-                                       nvme_cancel_request, &ctrl->ctrl);
-               nvme_rdma_destroy_io_queues(ctrl, shutdown);
-       }
- 
+       nvme_rdma_teardown_io_queues(ctrl, shutdown);
         if (shutdown)
                 nvme_shutdown_ctrl(&ctrl->ctrl);
         else
                 nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
- 
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
-                               nvme_cancel_request, &ctrl->ctrl);
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
-       nvme_rdma_destroy_admin_queue(ctrl, shutdown);
+       nvme_rdma_teardown_admin_queue(ctrl, shutdown);
   }
   
   static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
@@@ -1766,8 -1816,6 +1816,6 @@@ static void nvme_rdma_reset_ctrl_work(s
   {
         struct nvme_rdma_ctrl *ctrl =
                 container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
-       int ret;
-       bool changed;
   
         nvme_stop_ctrl(&ctrl->ctrl);
         nvme_rdma_shutdown_ctrl(ctrl, false);
@@@ -1778,25 -1826,9 +1826,9 @@@
                 return;
         }
   
-       ret = nvme_rdma_configure_admin_queue(ctrl, false);
-       if (ret)
+       if (nvme_rdma_setup_ctrl(ctrl, false))
                 goto out_fail;
   
-       if (ctrl->ctrl.queue_count > 1) {
-               ret = nvme_rdma_configure_io_queues(ctrl, false);
-               if (ret)
-                       goto out_fail;
-       }
- 
-       changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-       if (!changed) {
-               /* state change failure is ok if we're in DELETING state */
-               WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
-               return;
-       }
- 
-       nvme_start_ctrl(&ctrl->ctrl);
- 
         return;
   
   out_fail:
@@@ -1959,49 -1991,10 +1991,10 @@@ static struct nvme_ctrl *nvme_rdma_crea
         changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
         WARN_ON_ONCE(!changed);
   
-       ret = nvme_rdma_configure_admin_queue(ctrl, true);
+       ret = nvme_rdma_setup_ctrl(ctrl, true);
         if (ret)
                 goto out_uninit_ctrl;
   
-       /* sanity check icdoff */
-       if (ctrl->ctrl.icdoff) {
-               dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
-               ret = -EINVAL;
-               goto out_remove_admin_queue;
-       }
- 
-       /* sanity check keyed sgls */
-       if (!(ctrl->ctrl.sgls & (1 << 2))) {
-               dev_err(ctrl->ctrl.device,
-                       "Mandatory keyed sgls are not supported!\n");
-               ret = -EINVAL;
-               goto out_remove_admin_queue;
-       }
- 
-       /* only warn if argument is too large here, will clamp later */
-       if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
-               dev_warn(ctrl->ctrl.device,
-                       "queue_size %zu > ctrl sqsize %u, clamping down\n",
-                       opts->queue_size, ctrl->ctrl.sqsize + 1);
-       }
- 
-       /* warn if maxcmd is lower than sqsize+1 */
-       if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
-               dev_warn(ctrl->ctrl.device,
-                       "sqsize %u > ctrl maxcmd %u, clamping down\n",
-                       ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
-               ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
-       }
- 
-       if (opts->nr_io_queues) {
-               ret = nvme_rdma_configure_io_queues(ctrl, true);
-               if (ret)
-                       goto out_remove_admin_queue;
-       }
- 
-       changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-       WARN_ON_ONCE(!changed);
- 
         dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
                 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
   
@@@ -2011,13 -2004,8 +2004,8 @@@
         list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
         mutex_unlock(&nvme_rdma_ctrl_mutex);
   
-       nvme_start_ctrl(&ctrl->ctrl);
- 
         return &ctrl->ctrl;
   
- out_remove_admin_queue:
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       nvme_rdma_destroy_admin_queue(ctrl, true);
   out_uninit_ctrl:
         nvme_uninit_ctrl(&ctrl->ctrl);
         nvme_put_ctrl(&ctrl->ctrl);
diff --combined drivers/nvme/target/configfs.c

index ebea137,51f5a8c..b37a8e3
--- 1/drivers/nvme/target/configfs.c
--- 2/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@@ -218,6 -218,35 +218,35 @@@ static ssize_t nvmet_addr_trsvcid_store
   
   CONFIGFS_ATTR(nvmet_, addr_trsvcid);
   
+ /*
+  * configfs show handler for the port's param_inline_data_size attribute.
+  *
+  * Formats port->inline_data_size as a signed decimal followed by a newline
+  * into @page, bounded by PAGE_SIZE, and returns snprintf()'s result.
+  */
+ static ssize_t nvmet_param_inline_data_size_show(struct config_item *item,
+               char *page)
+ {
+       struct nvmet_port *port = to_nvmet_port(item);
+ 
+       return snprintf(page, PAGE_SIZE, "%d\n", port->inline_data_size);
+ }
+ 
+ /*
+  * configfs store handler for the port's param_inline_data_size attribute.
+  *
+  * Returns -EACCES if the port is currently enabled, -EINVAL if kstrtoint()
+  * rejects @page, and @count once the value has been parsed into
+  * port->inline_data_size.
+  */
+ static ssize_t nvmet_param_inline_data_size_store(struct config_item *item,
+               const char *page, size_t count)
+ {
+       struct nvmet_port *port = to_nvmet_port(item);
+       int ret;
+ 
+       /* The value may only be changed while the port is disabled. */
+       if (port->enabled) {
+               pr_err("Cannot modify inline_data_size while port enabled\n");
+               pr_err("Disable the port before modifying\n");
+               return -EACCES;
+       }
+       /* kstrtoint() parses directly into the port field. */
+       ret = kstrtoint(page, 0, &port->inline_data_size);
+       if (ret) {
+               /* Whatever kstrtoint() returned is reported as -EINVAL. */
+               pr_err("Invalid value '%s' for inline_data_size\n", page);
+               return -EINVAL;
+       }
+       return count;
+ }
+ 
+ CONFIGFS_ATTR(nvmet_, param_inline_data_size);
+ 
   static ssize_t nvmet_addr_trtype_show(struct config_item *item,
                 char *page)
   {
@@@ -282,7 -311,6 +311,7 @@@ static ssize_t nvmet_ns_device_path_sto
   {
         struct nvmet_ns *ns = to_nvmet_ns(item);
         struct nvmet_subsys *subsys = ns->subsys;
+ +      size_t len;
         int ret;
   
         mutex_lock(&subsys->lock);
@@@ -290,14 -318,10 +319,14 @@@
         if (ns->enabled)
                 goto out_unlock;
   
- -      kfree(ns->device_path);
+ +      ret = -EINVAL;
+ +      len = strcspn(page, "\n");
+ +      if (!len)
+ +              goto out_unlock;
   
+ +      kfree(ns->device_path);
         ret = -ENOMEM;
- -      ns->device_path = kstrndup(page, strcspn(page, "\n"), GFP_KERNEL);
+ +      ns->device_path = kstrndup(page, len, GFP_KERNEL);
         if (!ns->device_path)
                 goto out_unlock;
   
@@@ -387,6 -411,39 +416,39 @@@ out_unlock
   
   CONFIGFS_ATTR(nvmet_ns_, device_nguid);
   
+ /*
+  * configfs show handler for the namespace ana_grpid attribute: prints the
+  * namespace's anagrpid as an unsigned decimal followed by a newline.
+  */
+ static ssize_t nvmet_ns_ana_grpid_show(struct config_item *item, char *page)
+ {
+       return sprintf(page, "%u\n", to_nvmet_ns(item)->anagrpid);
+ }
+ 
+ /*
+  * configfs store handler for the namespace ana_grpid attribute.
+  *
+  * Parses @page as a u32 group id, moves the namespace to that ANA group and
+  * notifies the subsystem's controllers.  Returns the kstrtou32() error if
+  * parsing fails, -EINVAL if the id is outside 1..NVMET_MAX_ANAGRPS, and
+  * @count on success.
+  */
+ static ssize_t nvmet_ns_ana_grpid_store(struct config_item *item,
+               const char *page, size_t count)
+ {
+       struct nvmet_ns *ns = to_nvmet_ns(item);
+       u32 oldgrpid, newgrpid;
+       int ret;
+ 
+       ret = kstrtou32(page, 0, &newgrpid);
+       if (ret)
+               return ret;
+ 
+       if (newgrpid < 1 || newgrpid > NVMET_MAX_ANAGRPS)
+               return -EINVAL;
+ 
+       /*
+        * Under the write side of nvmet_ana_sem: take a reference on the new
+        * group in nvmet_ana_group_enabled[], switch the namespace over, drop
+        * the reference on the old group and bump the ANA change counter.
+        */
+       down_write(&nvmet_ana_sem);
+       oldgrpid = ns->anagrpid;
+       nvmet_ana_group_enabled[newgrpid]++;
+       ns->anagrpid = newgrpid;
+       nvmet_ana_group_enabled[oldgrpid]--;
+       nvmet_ana_chgcnt++;
+       up_write(&nvmet_ana_sem);
+ 
+       /* A NULL port means controllers are not filtered by port. */
+       nvmet_send_ana_event(ns->subsys, NULL);
+       return count;
+ }
+ 
+ CONFIGFS_ATTR(nvmet_ns_, ana_grpid);
+ 
   static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page)
   {
         return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled);
@@@ -412,11 -469,41 +474,41 @@@ static ssize_t nvmet_ns_enable_store(st
   
   CONFIGFS_ATTR(nvmet_ns_, enable);
   
+ /*
+  * configfs show handler for the namespace buffered_io attribute: prints the
+  * namespace's buffered_io flag with "%d" followed by a newline.
+  */
+ static ssize_t nvmet_ns_buffered_io_show(struct config_item *item, char *page)
+ {
+       return sprintf(page, "%d\n", to_nvmet_ns(item)->buffered_io);
+ }
+ 
+ /*
+  * configfs store handler for the namespace buffered_io attribute.
+  *
+  * Returns -EINVAL if strtobool() cannot parse @page or if the namespace is
+  * currently enabled; otherwise stores the parsed value and returns @count.
+  */
+ static ssize_t nvmet_ns_buffered_io_store(struct config_item *item,
+               const char *page, size_t count)
+ {
+       struct nvmet_ns *ns = to_nvmet_ns(item);
+       bool val;
+ 
+       if (strtobool(page, &val))
+               return -EINVAL;
+ 
+       /* The enabled check and the update both happen under subsys->lock. */
+       mutex_lock(&ns->subsys->lock);
+       if (ns->enabled) {
+               pr_err("disable ns before setting buffered_io value.\n");
+               mutex_unlock(&ns->subsys->lock);
+               return -EINVAL;
+       }
+ 
+       ns->buffered_io = val;
+       mutex_unlock(&ns->subsys->lock);
+       return count;
+ }
+ 
+ CONFIGFS_ATTR(nvmet_ns_, buffered_io);
+ 
   static struct configfs_attribute *nvmet_ns_attrs[] = {
         &nvmet_ns_attr_device_path,
         &nvmet_ns_attr_device_nguid,
         &nvmet_ns_attr_device_uuid,
+       &nvmet_ns_attr_ana_grpid,
         &nvmet_ns_attr_enable,
+       &nvmet_ns_attr_buffered_io,
         NULL,
   };
   
@@@ -863,6 -950,134 +955,134 @@@ static const struct config_item_type nv
         .ct_group_ops   = &nvmet_referral_group_ops,
   };
   
+ /*
+  * Table pairing each enum nvme_ana_state value with the string used for it
+  * by the ana_state attribute's show and store handlers.
+  */
+ static struct {
+       enum nvme_ana_state     state;
+       const char              *name;
+ } nvmet_ana_state_names[] = {
+       { NVME_ANA_OPTIMIZED,           "optimized" },
+       { NVME_ANA_NONOPTIMIZED,        "non-optimized" },
+       { NVME_ANA_INACCESSIBLE,        "inaccessible" },
+       { NVME_ANA_PERSISTENT_LOSS,     "persistent-loss" },
+       { NVME_ANA_CHANGE,              "change" },
+ };
+ 
+ /*
+  * configfs show handler for an ANA group's ana_state attribute.
+  *
+  * Reads the group's entry in its port's ana_state[] array and prints the
+  * matching name from nvmet_ana_state_names followed by a newline.  If the
+  * state has no entry in the table, only a newline is printed.
+  */
+ static ssize_t nvmet_ana_group_ana_state_show(struct config_item *item,
+               char *page)
+ {
+       struct nvmet_ana_group *grp = to_ana_group(item);
+       enum nvme_ana_state state = grp->port->ana_state[grp->grpid];
+       int i;
+ 
+       for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) {
+               if (state != nvmet_ana_state_names[i].state)
+                       continue;
+               return sprintf(page, "%s\n", nvmet_ana_state_names[i].name);
+       }
+ 
+       return sprintf(page, "\n");
+ }
+ 
+ /*
+  * configfs store handler for an ANA group's ana_state attribute.
+  *
+  * Matches @page against the names in nvmet_ana_state_names using
+  * sysfs_streq().  On a match the group's entry in the port's ana_state[]
+  * array is updated and an ANA event is sent for the port; @count is
+  * returned.  Without a match the value is logged and -EINVAL is returned.
+  */
+ static ssize_t nvmet_ana_group_ana_state_store(struct config_item *item,
+               const char *page, size_t count)
+ {
+       struct nvmet_ana_group *grp = to_ana_group(item);
+       int i;
+ 
+       for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) {
+               if (sysfs_streq(page, nvmet_ana_state_names[i].name))
+                       goto found;
+       }
+ 
+       pr_err("Invalid value '%s' for ana_state\n", page);
+       return -EINVAL;
+ 
+ found:
+       /* i still indexes the matching table entry here. */
+       down_write(&nvmet_ana_sem);
+       grp->port->ana_state[grp->grpid] = nvmet_ana_state_names[i].state;
+       nvmet_ana_chgcnt++;
+       up_write(&nvmet_ana_sem);
+ 
+       nvmet_port_send_ana_event(grp->port);
+       return count;
+ }
+ 
+ CONFIGFS_ATTR(nvmet_ana_group_, ana_state);
+ 
+ /* NULL-terminated attribute list for an ANA group item: only ana_state. */
+ static struct configfs_attribute *nvmet_ana_group_attrs[] = {
+       &nvmet_ana_group_attr_ana_state,
+       NULL,
+ };
+ 
+ /*
+  * configfs release callback for an ANA group item.
+  *
+  * The port's own ana_default_group is left untouched.  For any other group
+  * the port's state for that group id is set to NVME_ANA_INACCESSIBLE, the
+  * group's count in nvmet_ana_group_enabled[] is decremented, an ANA event
+  * is sent for the port, and the group is freed.
+  */
+ static void nvmet_ana_group_release(struct config_item *item)
+ {
+       struct nvmet_ana_group *grp = to_ana_group(item);
+ 
+       /* The default group is a member of the port, not a kzalloc()ed one. */
+       if (grp == &grp->port->ana_default_group)
+               return;
+ 
+       down_write(&nvmet_ana_sem);
+       grp->port->ana_state[grp->grpid] = NVME_ANA_INACCESSIBLE;
+       nvmet_ana_group_enabled[grp->grpid]--;
+       up_write(&nvmet_ana_sem);
+ 
+       /* grp->port is read before grp itself is freed. */
+       nvmet_port_send_ana_event(grp->port);
+       kfree(grp);
+ }
+ 
+ /* Item operations for an ANA group: only a release callback is provided. */
+ static struct configfs_item_operations nvmet_ana_group_item_ops = {
+       .release                = nvmet_ana_group_release,
+ };
+ 
+ /*
+  * Item type for a single ANA group: ties together the release callback and
+  * the ana_state attribute list.
+  */
+ static const struct config_item_type nvmet_ana_group_type = {
+       .ct_item_ops            = &nvmet_ana_group_item_ops,
+       .ct_attrs               = nvmet_ana_group_attrs,
+       .ct_owner               = THIS_MODULE,
+ };
+ 
+ /*
+  * configfs make_group callback for a port's "ana_groups" directory.
+  *
+  * @name is parsed as a u32 ANA group id.  Ids 0 and 1 and ids above
+  * NVMET_MAX_ANAGRPS are rejected with -EINVAL.  On success a new
+  * nvmet_ana_group is allocated for the port, its count in
+  * nvmet_ana_group_enabled[] is incremented, an ANA event is sent for the
+  * port, and the initialised config_group is returned.  Failures are
+  * returned as ERR_PTR(): the kstrtou32() error, -EINVAL or -ENOMEM.
+  */
+ static struct config_group *nvmet_ana_groups_make_group(
+               struct config_group *group, const char *name)
+ {
+       struct nvmet_port *port = ana_groups_to_port(&group->cg_item);
+       struct nvmet_ana_group *grp;
+       u32 grpid;
+       int ret;
+ 
+       ret = kstrtou32(name, 0, &grpid);
+       if (ret)
+               goto out;
+ 
+       ret = -EINVAL;
+       if (grpid <= 1 || grpid > NVMET_MAX_ANAGRPS)
+               goto out;
+ 
+       ret = -ENOMEM;
+       grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+       if (!grp)
+               goto out;
+       grp->port = port;
+       grp->grpid = grpid;
+ 
+       /* The enabled counter is updated under the write side of the sem. */
+       down_write(&nvmet_ana_sem);
+       nvmet_ana_group_enabled[grpid]++;
+       up_write(&nvmet_ana_sem);
+ 
+       nvmet_port_send_ana_event(grp->port);
+ 
+       config_group_init_type_name(&grp->group, name, &nvmet_ana_group_type);
+       return &grp->group;
+ out:
+       return ERR_PTR(ret);
+ }
+ 
+ /* Group operations for "ana_groups": new groups go through make_group. */
+ static struct configfs_group_operations nvmet_ana_groups_group_ops = {
+       .make_group             = nvmet_ana_groups_make_group,
+ };
+ 
+ /* Item type for the per-port "ana_groups" directory. */
+ static const struct config_item_type nvmet_ana_groups_type = {
+       .ct_group_ops           = &nvmet_ana_groups_group_ops,
+       .ct_owner               = THIS_MODULE,
+ };
+ 
   /*
    * Ports definitions.
    */
@@@ -870,6 -1085,7 +1090,7 @@@ static void nvmet_port_release(struct c
   {
         struct nvmet_port *port = to_nvmet_port(item);
   
+       kfree(port->ana_state);
         kfree(port);
   }
   
@@@ -879,6 -1095,7 +1100,7 @@@ static struct configfs_attribute *nvmet
         &nvmet_attr_addr_traddr,
         &nvmet_attr_addr_trsvcid,
         &nvmet_attr_addr_trtype,
+       &nvmet_attr_param_inline_data_size,
         NULL,
   };
   
@@@ -897,6 -1114,7 +1119,7 @@@ static struct config_group *nvmet_ports
   {
         struct nvmet_port *port;
         u16 portid;
+       u32 i;
   
         if (kstrtou16(name, 0, &portid))
                 return ERR_PTR(-EINVAL);
@@@ -905,9 -1123,24 +1128,24 @@@
         if (!port)
                 return ERR_PTR(-ENOMEM);
   
+       port->ana_state = kcalloc(NVMET_MAX_ANAGRPS + 1,
+                       sizeof(*port->ana_state), GFP_KERNEL);
+       if (!port->ana_state) {
+               kfree(port);
+               return ERR_PTR(-ENOMEM);
+       }
+ 
+       for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) {
+               if (i == NVMET_DEFAULT_ANA_GRPID)
+                       port->ana_state[1] = NVME_ANA_OPTIMIZED;
+               else
+                       port->ana_state[i] = NVME_ANA_INACCESSIBLE;
+       }
+ 
         INIT_LIST_HEAD(&port->entry);
         INIT_LIST_HEAD(&port->subsystems);
         INIT_LIST_HEAD(&port->referrals);
+       port->inline_data_size = -1;    /* < 0 == let the transport choose */
   
         port->disc_addr.portid = cpu_to_le16(portid);
         config_group_init_type_name(&port->group, name, &nvmet_port_type);
@@@ -920,6 -1153,18 +1158,18 @@@
                         "referrals", &nvmet_referrals_type);
         configfs_add_default_group(&port->referrals_group, &port->group);
   
+       config_group_init_type_name(&port->ana_groups_group,
+                       "ana_groups", &nvmet_ana_groups_type);
+       configfs_add_default_group(&port->ana_groups_group, &port->group);
+ 
+       port->ana_default_group.port = port;
+       port->ana_default_group.grpid = NVMET_DEFAULT_ANA_GRPID;
+       config_group_init_type_name(&port->ana_default_group.group,
+                       __stringify(NVMET_DEFAULT_ANA_GRPID),
+                       &nvmet_ana_group_type);
+       configfs_add_default_group(&port->ana_default_group.group,
+                       &port->ana_groups_group);
+ 
         return &port->group;
   }
   
diff --combined drivers/nvme/target/core.c

index 9838103,14b4c49..ebf3e7a
--- 1/drivers/nvme/target/core.c
--- 2/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@@ -18,6 -18,7 +18,7 @@@
   
   #include "nvmet.h"
   
+ struct workqueue_struct *buffered_io_wq;
   static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
   static DEFINE_IDA(cntlid_ida);
   
@@@ -39,6 -40,10 +40,10 @@@
    */
   DECLARE_RWSEM(nvmet_config_sem);
   
+ u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
+ u64 nvmet_ana_chgcnt;
+ DECLARE_RWSEM(nvmet_ana_sem);
+ 
   static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
                 const char *subsysnqn);
   
@@@ -175,7 -180,7 +180,7 @@@ out_unlock
         mutex_unlock(&ctrl->lock);
   }
   
- static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
+ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
   {
         struct nvmet_ctrl *ctrl;
   
@@@ -189,6 -194,33 +194,33 @@@
         }
   }
   
+ /*
+  * Queue an ANA-change async event on the controllers of @subsys.
+  *
+  * @subsys: subsystem whose controller list is walked, under subsys->lock
+  * @port:   if non-NULL, only controllers whose ->port equals @port are
+  *          considered; if NULL, no filtering by port is done
+  *
+  * Controllers for which nvmet_aen_disabled() reports
+  * NVME_AEN_CFG_ANA_CHANGE as disabled are skipped.
+  */
+ void nvmet_send_ana_event(struct nvmet_subsys *subsys,
+               struct nvmet_port *port)
+ {
+       struct nvmet_ctrl *ctrl;
+ 
+       mutex_lock(&subsys->lock);
+       list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+               if (port && ctrl->port != port)
+                       continue;
+               if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE))
+                       continue;
+               nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+                               NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
+       }
+       mutex_unlock(&subsys->lock);
+ }
+ 
+ /*
+  * Send an ANA event for @port to every subsystem linked on its
+  * ->subsystems list.  The list is walked with nvmet_config_sem held for
+  * reading, and each subsystem is notified via nvmet_send_ana_event() with
+  * @port as the filter.
+  */
+ void nvmet_port_send_ana_event(struct nvmet_port *port)
+ {
+       struct nvmet_subsys_link *p;
+ 
+       down_read(&nvmet_config_sem);
+       list_for_each_entry(p, &port->subsystems, entry)
+               nvmet_send_ana_event(p->subsys, port);
+       up_read(&nvmet_config_sem);
+ }
+ 
   int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
   {
         int ret = 0;
@@@ -241,6 -273,10 +273,10 @@@ int nvmet_enable_port(struct nvmet_por
                 return ret;
         }
   
+       /* If the transport didn't set inline_data_size, then disable it. */
+       if (port->inline_data_size < 0)
+               port->inline_data_size = 0;
+ 
         port->enabled = true;
         return 0;
   }
@@@ -332,14 -368,18 +368,18 @@@ static void nvmet_ns_dev_disable(struc
   int nvmet_ns_enable(struct nvmet_ns *ns)
   {
         struct nvmet_subsys *subsys = ns->subsys;
-       int ret = 0;
+       int ret;
   
         mutex_lock(&subsys->lock);
+       ret = -EMFILE;
+       if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
+               goto out_unlock;
+       ret = 0;
         if (ns->enabled)
                 goto out_unlock;
   
         ret = nvmet_bdev_ns_enable(ns);
- -      if (ret)
+ +      if (ret == -ENOTBLK)
                 ret = nvmet_file_ns_enable(ns);
         if (ret)
                 goto out_unlock;
@@@ -369,6 -409,7 +409,7 @@@
   
                 list_add_tail_rcu(&ns->dev_link, &old->dev_link);
         }
+       subsys->nr_namespaces++;
   
         nvmet_ns_changed(subsys, ns->nsid);
         ns->enabled = true;
@@@ -409,6 -450,7 +450,7 @@@ void nvmet_ns_disable(struct nvmet_ns *
         percpu_ref_exit(&ns->ref);
   
         mutex_lock(&subsys->lock);
+       subsys->nr_namespaces--;
         nvmet_ns_changed(subsys, ns->nsid);
         nvmet_ns_dev_disable(ns);
   out_unlock:
@@@ -419,6 -461,10 +461,10 @@@ void nvmet_ns_free(struct nvmet_ns *ns
   {
         nvmet_ns_disable(ns);
   
+       down_write(&nvmet_ana_sem);
+       nvmet_ana_group_enabled[ns->anagrpid]--;
+       up_write(&nvmet_ana_sem);
+ 
         kfree(ns->device_path);
         kfree(ns);
   }
@@@ -436,7 -482,14 +482,14 @@@ struct nvmet_ns *nvmet_ns_alloc(struct 
   
         ns->nsid = nsid;
         ns->subsys = subsys;
+ 
+       down_write(&nvmet_ana_sem);
+       ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
+       nvmet_ana_group_enabled[ns->anagrpid]++;
+       up_write(&nvmet_ana_sem);
+ 
         uuid_gen(&ns->uuid);
+       ns->buffered_io = false;
   
         return ns;
   }
@@@ -542,6 -595,35 +595,35 @@@ int nvmet_sq_init(struct nvmet_sq *sq
   }
   EXPORT_SYMBOL_GPL(nvmet_sq_init);
   
+ /*
+  * Map the ANA state of @ns's group on @port to a status code.
+  *
+  * The state is looked up in port->ana_state[] by ns->anagrpid.  The
+  * inaccessible, persistent-loss and change states return the corresponding
+  * NVME_SC_ANA_* code; every other state returns 0.
+  */
+ static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
+               struct nvmet_ns *ns)
+ {
+       enum nvme_ana_state state = port->ana_state[ns->anagrpid];
+ 
+       if (unlikely(state == NVME_ANA_INACCESSIBLE))
+               return NVME_SC_ANA_INACCESSIBLE;
+       if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
+               return NVME_SC_ANA_PERSISTENT_LOSS;
+       if (unlikely(state == NVME_ANA_CHANGE))
+               return NVME_SC_ANA_TRANSITION;
+       return 0;
+ }
+ 
+ /*
+  * Check whether the I/O command in @req is allowed on its namespace.
+  *
+  * When req->ns->readonly is set, only the read and flush opcodes pass; any
+  * other opcode returns NVME_SC_NS_WRITE_PROTECTED.  Returns 0 when the
+  * command is allowed or the namespace is not read-only.
+  */
+ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
+ {
+       if (unlikely(req->ns->readonly)) {
+               switch (req->cmd->common.opcode) {
+               case nvme_cmd_read:
+               case nvme_cmd_flush:
+                       break;
+               default:
+                       return NVME_SC_NS_WRITE_PROTECTED;
+               }
+       }
+ 
+       return 0;
+ }
+ 
   static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
   {
         struct nvme_command *cmd = req->cmd;
@@@ -554,6 -636,12 +636,12 @@@
         req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
         if (unlikely(!req->ns))
                 return NVME_SC_INVALID_NS | NVME_SC_DNR;
+       ret = nvmet_check_ana_state(req->port, req->ns);
+       if (unlikely(ret))
+               return ret;
+       ret = nvmet_io_cmd_check_access(req);
+       if (unlikely(ret))
+               return ret;
   
         if (req->ns->file)
                 return nvmet_file_parse_io_cmd(req);
@@@ -870,6 -958,8 +958,8 @@@ u16 nvmet_alloc_ctrl(const char *subsys
   
         nvmet_init_cap(ctrl);
   
+       ctrl->port = req->port;
+ 
         INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
         INIT_LIST_HEAD(&ctrl->async_events);
   
@@@ -1109,6 -1199,15 +1199,15 @@@ static int __init nvmet_init(void
   {
         int error;
   
+       nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
+ 
+       buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
+                       WQ_MEM_RECLAIM, 0);
+       if (!buffered_io_wq) {
+               error = -ENOMEM;
+               goto out;
+       }
+ 
         error = nvmet_init_discovery();
         if (error)
                 goto out;
@@@ -1129,6 -1228,7 +1228,7 @@@ static void __exit nvmet_exit(void
         nvmet_exit_configfs();
         nvmet_exit_discovery();
         ida_destroy(&cntlid_ida);
+       destroy_workqueue(buffered_io_wq);
   
         BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
         BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
diff --combined drivers/nvme/target/loop.c

index ae7586b,af7fbf4..9908082
--- 1/drivers/nvme/target/loop.c
--- 2/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@@ -162,7 -162,7 +162,7 @@@ static blk_status_t nvme_loop_queue_rq(
         blk_status_t ret;
   
         if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready))
- -              return nvmf_fail_nonready_command(req);
+ +              return nvmf_fail_nonready_command(&queue->ctrl->ctrl, req);
   
         ret = nvme_setup_cmd(ns, req, &iod->cmd);
         if (ret)
@@@ -227,6 -227,7 +227,7 @@@ static int nvme_loop_init_request(struc
   {
         struct nvme_loop_ctrl *ctrl = set->driver_data;
   
+       nvme_req(req)->ctrl = &ctrl->ctrl;
         return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req),
                         (set == &ctrl->tag_set) ? hctx_idx + 1 : 0);
   }
diff --combined fs/block_dev.c

index aba2541,496fb51..38b8ce0
--- 1/fs/block_dev.c
--- 2/fs/block_dev.c
+++ b/fs/block_dev.c
@@@ -221,7 -221,7 +221,7 @@@ __blkdev_direct_IO_simple(struct kiocb 
   
         ret = bio_iov_iter_get_pages(&bio, iter);
         if (unlikely(ret))
- -              return ret;
+ +              goto out;
         ret = bio.bi_iter.bi_size;
   
         if (iov_iter_rw(iter) == READ) {
@@@ -250,13 -250,12 +250,13 @@@
                 put_page(bvec->bv_page);
         }
   
- -      if (vecs != inline_vecs)
- -              kfree(vecs);
- -
         if (unlikely(bio.bi_status))
                 ret = blk_status_to_errno(bio.bi_status);
   
+ +out:
+ +      if (vecs != inline_vecs)
+ +              kfree(vecs);
+ +
         bio_uninit(&bio);
   
         return ret;
@@@ -666,7 -665,8 +666,8 @@@ int bdev_read_page(struct block_device 
         result = blk_queue_enter(bdev->bd_queue, 0);
         if (result)
                 return result;
-       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
+       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+                             REQ_OP_READ);
         blk_queue_exit(bdev->bd_queue);
         return result;
   }
@@@ -704,7 -704,8 +705,8 @@@ int bdev_write_page(struct block_devic
                 return result;
   
         set_page_writeback(page);
-       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
+       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+                             REQ_OP_WRITE);
         if (result) {
                 end_page_writeback(page);
         } else {
diff --combined fs/ext4/super.c

index f7750bc,4b8aef9..5863fd2
--- 1/fs/ext4/super.c
--- 2/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@@ -312,24 -312,6 +312,24 @@@ void ext4_itable_unused_set(struct supe
                 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
   }
   
+ +/*
+ + * Store the current real time, in seconds, as a split timestamp: the low
+ + * 32 bits go to *lo as little-endian and the bits above them go to the
+ + * 8-bit *hi.  The value is first clamped to the range 0..2^40-1 so that it
+ + * fits in those 40 bits.
+ + */
+ +static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
+ +{
+ +      time64_t now = ktime_get_real_seconds();
+ +
+ +      now = clamp_val(now, 0, (1ull << 40) - 1);
+ +
+ +      *lo = cpu_to_le32(lower_32_bits(now));
+ +      *hi = upper_32_bits(now);
+ +}
+ +
+ +/*
+ + * Rebuild a time64_t from a split timestamp: *hi supplies the bits above
+ + * bit 31 and the little-endian *lo supplies the low 32 bits.
+ + */
+ +static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
+ +{
+ +      return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
+ +}
+ +/*
+ + * Convenience wrappers: given a superblock pointer and a timestamp field
+ + * name, pass the field and its "<name>_hi" companion (formed by token
+ + * pasting) to the helpers above.
+ + */
+ +#define ext4_update_tstamp(es, tstamp) \
+ +      __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
+ +#define ext4_get_tstamp(es, tstamp) \
+ +      __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
   
   static void __save_error_info(struct super_block *sb, const char *func,
                             unsigned int line)
@@@ -340,12 -322,11 +340,12 @@@
         if (bdev_read_only(sb->s_bdev))
                 return;
         es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- -      es->s_last_error_time = cpu_to_le32(get_seconds());
+ +      ext4_update_tstamp(es, s_last_error_time);
         strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
         es->s_last_error_line = cpu_to_le32(line);
         if (!es->s_first_error_time) {
                 es->s_first_error_time = es->s_last_error_time;
+ +              es->s_first_error_time_hi = es->s_last_error_time_hi;
                 strncpy(es->s_first_error_func, func,
                         sizeof(es->s_first_error_func));
                 es->s_first_error_line = cpu_to_le32(line);
@@@ -795,26 -776,26 +795,26 @@@ void ext4_mark_group_bitmap_corrupted(s
         struct ext4_sb_info *sbi = EXT4_SB(sb);
         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
         struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+ +      int ret;
   
- -      if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) &&
- -          !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) {
- -              percpu_counter_sub(&sbi->s_freeclusters_counter,
- -                                      grp->bb_free);
- -              set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
- -                      &grp->bb_state);
+ +      if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
+ +              ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+ +                                          &grp->bb_state);
+ +              if (!ret)
+ +                      percpu_counter_sub(&sbi->s_freeclusters_counter,
+ +                                         grp->bb_free);
         }
   
- -      if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) &&
- -          !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- -              if (gdp) {
+ +      if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
+ +              ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
+ +                                          &grp->bb_state);
+ +              if (!ret && gdp) {
                         int count;
   
                         count = ext4_free_inodes_count(sb, gdp);
                         percpu_counter_sub(&sbi->s_freeinodes_counter,
                                            count);
                 }
- -              set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
- -                      &grp->bb_state);
         }
   }
   
@@@ -2193,8 -2174,8 +2193,8 @@@ static int ext4_setup_super(struct supe
                          "warning: maximal mount count reached, "
                          "running e2fsck is recommended");
         else if (le32_to_cpu(es->s_checkinterval) &&
- -              (le32_to_cpu(es->s_lastcheck) +
- -                      le32_to_cpu(es->s_checkinterval) <= get_seconds()))
+ +               (ext4_get_tstamp(es, s_lastcheck) +
+ +                le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
                 ext4_msg(sb, KERN_WARNING,
                          "warning: checktime reached, "
                          "running e2fsck is recommended");
@@@ -2203,7 -2184,7 +2203,7 @@@
         if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
                 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
         le16_add_cpu(&es->s_mnt_count, 1);
- -      es->s_mtime = cpu_to_le32(get_seconds());
+ +      ext4_update_tstamp(es, s_mtime);
         ext4_update_dynamic_rev(sb);
         if (sbi->s_journal)
                 ext4_set_feature_journal_needs_recovery(sb);
@@@ -2361,7 -2342,7 +2361,7 @@@ static int ext4_check_descriptors(struc
         struct ext4_sb_info *sbi = EXT4_SB(sb);
         ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
         ext4_fsblk_t last_block;
- -      ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0) + 1;
+ +      ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
         ext4_fsblk_t block_bitmap;
         ext4_fsblk_t inode_bitmap;
         ext4_fsblk_t inode_table;
@@@ -2894,9 -2875,8 +2894,9 @@@ static void print_daily_error_info(stru
                 ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
                          le32_to_cpu(es->s_error_count));
         if (es->s_first_error_time) {
- -              printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
- -                     sb->s_id, le32_to_cpu(es->s_first_error_time),
+ +              printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
+ +                     sb->s_id,
+ +                     ext4_get_tstamp(es, s_first_error_time),
                        (int) sizeof(es->s_first_error_func),
                        es->s_first_error_func,
                        le32_to_cpu(es->s_first_error_line));
@@@ -2909,9 -2889,8 +2909,9 @@@
                 printk(KERN_CONT "\n");
         }
         if (es->s_last_error_time) {
- -              printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
- -                     sb->s_id, le32_to_cpu(es->s_last_error_time),
+ +              printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
+ +                     sb->s_id,
+ +                     ext4_get_tstamp(es, s_last_error_time),
                        (int) sizeof(es->s_last_error_func),
                        es->s_last_error_func,
                        le32_to_cpu(es->s_last_error_line));
@@@ -3162,8 -3141,14 +3162,8 @@@ static ext4_group_t ext4_has_uninit_ita
                 if (!gdp)
                         continue;
   
- -              if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
- -                      continue;
- -              if (group != 0)
+ +              if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
                         break;
- -              ext4_error(sb, "Inode table for bg 0 marked as "
- -                         "needing zeroing");
- -              if (sb_rdonly(sb))
- -                      return ngroups;
         }
   
         return group;
@@@ -3529,7 -3514,7 +3529,7 @@@ static int ext4_fill_super(struct super
         sbi->s_sb_block = sb_block;
         if (sb->s_bdev->bd_part)
                 sbi->s_sectors_written_start =
-                       part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+                       part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
   
         /* Cleanup superblock name */
         strreplace(sb->s_id, '/', '!');
@@@ -4100,13 -4085,14 +4100,13 @@@
                         goto failed_mount2;
                 }
         }
+ +      sbi->s_gdb_count = db_count;
         if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
                 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                 ret = -EFSCORRUPTED;
                 goto failed_mount2;
         }
   
- -      sbi->s_gdb_count = db_count;
- -
         timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
   
         /* Register extent status tree shrinker */
@@@ -4834,11 -4820,12 +4834,12 @@@ static int ext4_commit_super(struct sup
          * to complain and force a full file system check.
          */
         if (!(sb->s_flags & SB_RDONLY))
- -              es->s_wtime = cpu_to_le32(get_seconds());
+ +              ext4_update_tstamp(es, s_wtime);
         if (sb->s_bdev->bd_part)
                 es->s_kbytes_written =
                         cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
-                           ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                           ((part_stat_read(sb->s_bdev->bd_part,
+                                            sectors[STAT_WRITE]) -
                               EXT4_SB(sb)->s_sectors_written_start) >> 1));
         else
                 es->s_kbytes_written =
@@@ -5101,9 -5088,6 +5102,9 @@@ static int ext4_remount(struct super_bl
   #endif
         char *orig_data = kstrdup(data, GFP_KERNEL);
   
+ +      if (data && !orig_data)
+ +              return -ENOMEM;
+ +
         /* Store the original options */
         old_sb_flags = sb->s_flags;
         old_opts.s_mount_opt = sbi->s_mount_opt;
@@@ -5230,8 -5214,6 +5231,8 @@@
   
                         if (sbi->s_journal)
                                 ext4_mark_recovery_complete(sb, es);
+ +                      if (sbi->s_mmp_tsk)
+ +                              kthread_stop(sbi->s_mmp_tsk);
                 } else {
                         /* Make sure we can mount this feature set readwrite */
                         if (ext4_has_feature_readonly(sb) ||
@@@ -5689,13 -5671,13 +5690,13 @@@ static int ext4_enable_quotas(struct su
                                 DQUOT_USAGE_ENABLED |
                                 (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
                         if (err) {
- -                              for (type--; type >= 0; type--)
- -                                      dquot_quota_off(sb, type);
- -
                                 ext4_warning(sb,
                                         "Failed to enable quota tracking "
                                         "(type=%d, err=%d). Please run "
                                         "e2fsck to fix.", type, err);
+ +                              for (type--; type >= 0; type--)
+ +                                      dquot_quota_off(sb, type);
+ +
                                 return err;
                         }
                 }
diff --combined fs/ext4/sysfs.c

index e60cc5e,2be9ad7..9212a02
--- 1/fs/ext4/sysfs.c
--- 2/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@@ -25,8 -25,6 +25,8 @@@ typedef enum 
         attr_reserved_clusters,
         attr_inode_readahead,
         attr_trigger_test_error,
+ +      attr_first_error_time,
+ +      attr_last_error_time,
         attr_feature,
         attr_pointer_ui,
         attr_pointer_atomic,
@@@ -58,7 -56,8 +58,8 @@@ static ssize_t session_write_kbytes_sho
         if (!sb->s_bdev->bd_part)
                 return snprintf(buf, PAGE_SIZE, "0\n");
         return snprintf(buf, PAGE_SIZE, "%lu\n",
-                       (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                       (part_stat_read(sb->s_bdev->bd_part,
+                                       sectors[STAT_WRITE]) -
                          sbi->s_sectors_written_start) >> 1);
   }
   
@@@ -70,7 -69,8 +71,8 @@@ static ssize_t lifetime_write_kbytes_sh
                 return snprintf(buf, PAGE_SIZE, "0\n");
         return snprintf(buf, PAGE_SIZE, "%llu\n",
                         (unsigned long long)(sbi->s_kbytes_written +
-                       ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                       ((part_stat_read(sb->s_bdev->bd_part,
+                                        sectors[STAT_WRITE]) -
                           EXT4_SB(sb)->s_sectors_written_start) >> 1)));
   }
   
@@@ -184,8 -184,8 +186,8 @@@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_b
   EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
   EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
   EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
- -EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
- -EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
+ +EXT4_ATTR(first_error_time, 0444, first_error_time);
+ +EXT4_ATTR(last_error_time, 0444, last_error_time);
   
   static unsigned int old_bump_val = 128;
   EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@@ -251,15 -251,6 +253,15 @@@ static void *calc_ptr(struct ext4_attr 
         return NULL;
   }
   
+ +static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
+ +{
+ +      return snprintf(buf, PAGE_SIZE, "%lld",
+ +                      ((time64_t)hi << 32) + le32_to_cpu(lo));
+ +}
+ +
+ +#define print_tstamp(buf, es, tstamp) \
+ +      __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi)
+ +
   static ssize_t ext4_attr_show(struct kobject *kobj,
                               struct attribute *attr, char *buf)
   {
@@@ -285,12 -276,8 +287,12 @@@
         case attr_pointer_ui:
                 if (!ptr)
                         return 0;
- -              return snprintf(buf, PAGE_SIZE, "%u\n",
- -                              *((unsigned int *) ptr));
+ +              if (a->attr_ptr == ptr_ext4_super_block_offset)
+ +                      return snprintf(buf, PAGE_SIZE, "%u\n",
+ +                                      le32_to_cpup(ptr));
+ +              else
+ +                      return snprintf(buf, PAGE_SIZE, "%u\n",
+ +                                      *((unsigned int *) ptr));
         case attr_pointer_atomic:
                 if (!ptr)
                         return 0;
@@@ -298,10 -285,6 +300,10 @@@
                                 atomic_read((atomic_t *) ptr));
         case attr_feature:
                 return snprintf(buf, PAGE_SIZE, "supported\n");
+ +      case attr_first_error_time:
+ +              return print_tstamp(buf, sbi->s_es, s_first_error_time);
+ +      case attr_last_error_time:
+ +              return print_tstamp(buf, sbi->s_es, s_last_error_time);
         }
   
         return 0;
@@@ -327,10 -310,7 +329,10 @@@ static ssize_t ext4_attr_store(struct k
                 ret = kstrtoul(skip_spaces(buf), 0, &t);
                 if (ret)
                         return ret;
- -              *((unsigned int *) ptr) = t;
+ +              if (a->attr_ptr == ptr_ext4_super_block_offset)
+ +                      *((__le32 *) ptr) = cpu_to_le32(t);
+ +              else
+ +                      *((unsigned int *) ptr) = t;
                 return len;
         case attr_inode_readahead:
                 return inode_readahead_blks_store(sbi, buf, len);
diff --combined include/linux/blk-mq.h

index ca3f2c2,d710e92..1da59c1
--- 1/include/linux/blk-mq.h
--- 2/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@@ -35,10 -35,12 +35,12 @@@ struct blk_mq_hw_ctx 
         struct sbitmap          ctx_map;
   
         struct blk_mq_ctx       *dispatch_from;
+       unsigned int            dispatch_busy;
   
-       struct blk_mq_ctx       **ctxs;
         unsigned int            nr_ctx;
+       struct blk_mq_ctx       **ctxs;
   
+       spinlock_t              dispatch_wait_lock;
         wait_queue_entry_t      dispatch_wait;
         atomic_t                wait_index;
   
@@@ -287,20 -289,6 +289,20 @@@ void blk_mq_update_nr_hw_queues(struct 
   
   void blk_mq_quiesce_queue_nowait(struct request_queue *q);
   
+ +/**
+ + * blk_mq_mark_complete() - Set request state to complete
+ + * @rq: request to set to complete state
+ + *
+ + * Returns true if request state was successfully set to complete. If
+ + * successful, the caller is responsible for seeing this request is ended, as
+ + * blk_mq_complete_request will not work again.
+ + */
+ +static inline bool blk_mq_mark_complete(struct request *rq)
+ +{
+ +      return cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) ==
+ +                      MQ_RQ_IN_FLIGHT;
+ +}
+ +
   /*
    * Driver command data is immediately after the request. So subtract request
    * size to get back to the original request, add request size to get the PDU.
diff --combined include/linux/sched.h

index dac5086,c2e993d..95a5018
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -167,8 -167,8 +167,8 @@@ struct task_group
    *   need_sleep = false;
    *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
    *
- - * Where wake_up_state() (and all other wakeup primitives) imply enough
- - * barriers to order the store of the variable against wakeup.
+ + * where wake_up_state() executes a full memory barrier before accessing the
+ + * task state.
    *
    * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
    * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
@@@ -734,6 -734,10 +734,10 @@@ struct task_struct 
         /* disallow userland-initiated cgroup migration */
         unsigned                        no_cgroup_migration:1;
   #endif
+ #ifdef CONFIG_BLK_CGROUP
+       /* to be used once the psi infrastructure lands upstream. */
+       unsigned                        use_memdelay:1;
+ #endif
   
         unsigned long                   atomic_flags; /* Flags requiring atomic access. */
   
@@@ -1017,6 -1021,7 +1021,6 @@@
         u64                             last_sum_exec_runtime;
         struct callback_head            numa_work;
   
- -      struct list_head                numa_entry;
         struct numa_group               *numa_group;
   
         /*
@@@ -1150,6 -1155,10 +1154,10 @@@
         unsigned int                    memcg_nr_pages_over_high;
   #endif
   
+ #ifdef CONFIG_BLK_CGROUP
+       struct request_queue            *throttle_queue;
+ #endif
+ 
   #ifdef CONFIG_UPROBES
         struct uprobe_task              *utask;
   #endif
diff --combined kernel/fork.c

index 9d8d0e0,f40c82b..3311231
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -312,8 -312,10 +312,8 @@@ struct vm_area_struct *vm_area_alloc(st
   {
         struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
   
- -      if (vma) {
- -              vma->vm_mm = mm;
- -              INIT_LIST_HEAD(&vma->anon_vma_chain);
- -      }
+ +      if (vma)
+ +              vma_init(vma, mm);
         return vma;
   }
   
@@@ -866,6 -868,11 +866,11 @@@ static struct task_struct *dup_task_str
         tsk->fail_nth = 0;
   #endif
   
+ #ifdef CONFIG_BLK_CGROUP
+       tsk->throttle_queue = NULL;
+       tsk->use_memdelay = 0;
+ #endif
+ 
         return tsk;
   
   free_stack:
@@@ -2276,8 -2283,6 +2281,8 @@@ static void sighand_ctor(void *data
   
   void __init proc_caches_init(void)
   {
+ +      unsigned int mm_size;
+ +
         sighand_cachep = kmem_cache_create("sighand_cache",
                         sizeof(struct sighand_struct), 0,
                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@@ -2294,16 -2299,15 +2299,16 @@@
                         sizeof(struct fs_struct), 0,
                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                         NULL);
+ +
         /*
- -       * FIXME! The "sizeof(struct mm_struct)" currently includes the
- -       * whole struct cpumask for the OFFSTACK case. We could change
- -       * this to *only* allocate as much of it as required by the
- -       * maximum number of CPU's we can ever have.  The cpumask_allocation
- -       * is at the end of the structure, exactly for that reason.
+ +       * The mm_cpumask is located at the end of mm_struct, and is
+ +       * dynamically sized based on the maximum CPU number this system
+ +       * can have, taking hotplug into account (nr_cpu_ids).
          */
+ +      mm_size = sizeof(struct mm_struct) + cpumask_size();
+ +
         mm_cachep = kmem_cache_create_usercopy("mm_struct",
- -                      sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+ +                      mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                         offsetof(struct mm_struct, saved_auxv),
                         sizeof_field(struct mm_struct, saved_auxv),
diff --combined mm/memcontrol.c

index b2173f7,473278b..b836e7f
--- 1/mm/memcontrol.c
--- 2/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@@ -4037,14 -4037,6 +4037,14 @@@ static struct cftype mem_cgroup_legacy_
   
   static DEFINE_IDR(mem_cgroup_idr);
   
+ +static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
+ +{
+ +      if (memcg->id.id > 0) {
+ +              idr_remove(&mem_cgroup_idr, memcg->id.id);
+ +              memcg->id.id = 0;
+ +      }
+ +}
+ +
   static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
   {
         VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
@@@ -4055,7 -4047,8 +4055,7 @@@ static void mem_cgroup_id_put_many(stru
   {
         VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
         if (atomic_sub_and_test(n, &memcg->id.ref)) {
- -              idr_remove(&mem_cgroup_idr, memcg->id.id);
- -              memcg->id.id = 0;
+ +              mem_cgroup_id_remove(memcg);
   
                 /* Memcg ID pins CSS */
                 css_put(&memcg->css);
@@@ -4192,7 -4185,8 +4192,7 @@@ static struct mem_cgroup *mem_cgroup_al
         idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
         return memcg;
   fail:
- -      if (memcg->id.id > 0)
- -              idr_remove(&mem_cgroup_idr, memcg->id.id);
+ +      mem_cgroup_id_remove(memcg);
         __mem_cgroup_free(memcg);
         return NULL;
   }
@@@ -4251,7 -4245,6 +4251,7 @@@ mem_cgroup_css_alloc(struct cgroup_subs
   
         return &memcg->css;
   fail:
+ +      mem_cgroup_id_remove(memcg);
         mem_cgroup_free(memcg);
         return ERR_PTR(-ENOMEM);
   }
@@@ -5600,6 -5593,19 +5600,19 @@@ out
         return ret;
   }
   
+ int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound)
+ {
+       struct mem_cgroup *memcg;
+       int ret;
+ 
+       ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
+       memcg = *memcgp;
+       mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
+       return ret;
+ }
+ 
   /**
    * mem_cgroup_commit_charge - commit a page charge
    * @page: page to charge
diff --combined mm/memory.c

index 6d17505,dfe80c5..348279f
--- 1/mm/memory.c
--- 2/mm/memory.c
+++ b/mm/memory.c
@@@ -326,20 -326,16 +326,20 @@@ bool __tlb_remove_page_size(struct mmu_
   
   #ifdef CONFIG_HAVE_RCU_TABLE_FREE
   
- -/*
- - * See the comment near struct mmu_table_batch.
- - */
- -
   static void tlb_remove_table_smp_sync(void *arg)
   {
- -      /* Simply deliver the interrupt */
+ +      struct mm_struct __maybe_unused *mm = arg;
+ +      /*
+ +       * On most architectures this does nothing. Simply delivering the
+ +       * interrupt is enough to prevent races with software page table
+ +       * walking like that done in get_user_pages_fast.
+ +       *
+ +       * See the comment near struct mmu_table_batch.
+ +       */
+ +      tlb_flush_remove_tables_local(mm);
   }
   
- -static void tlb_remove_table_one(void *table)
+ +static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
   {
         /*
          * This isn't an RCU grace period and hence the page-tables cannot be
@@@ -348,7 -344,7 +348,7 @@@
          * It is however sufficient for software page-table walkers that rely on
          * IRQ disabling. See the comment near struct mmu_table_batch.
          */
- -      smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+ +      smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
         __tlb_remove_table(table);
   }
   
@@@ -369,8 -365,6 +369,8 @@@ void tlb_table_flush(struct mmu_gather 
   {
         struct mmu_table_batch **batch = &tlb->batch;
   
+ +      tlb_flush_remove_tables(tlb->mm);
+ +
         if (*batch) {
                 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                 *batch = NULL;
@@@ -393,7 -387,7 +393,7 @@@ void tlb_remove_table(struct mmu_gathe
         if (*batch == NULL) {
                 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                 if (*batch == NULL) {
- -                      tlb_remove_table_one(table);
+ +                      tlb_remove_table_one(table, tlb);
                         return;
                 }
                 (*batch)->nr = 0;
@@@ -1423,9 -1417,11 +1423,9 @@@ static inline unsigned long zap_pmd_ran
         do {
                 next = pmd_addr_end(addr, end);
                 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
- -                      if (next - addr != HPAGE_PMD_SIZE) {
- -                              VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
- -                                  !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+ +                      if (next - addr != HPAGE_PMD_SIZE)
                                 __split_huge_pmd(vma, pmd, addr, false, NULL);
- -                      } else if (zap_huge_pmd(tlb, vma, pmd, addr))
+ +                      else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                 goto next;
                         /* fall through */
                 }
@@@ -1890,9 -1886,6 +1890,9 @@@ int vm_insert_pfn_prot(struct vm_area_s
         if (addr < vma->vm_start || addr >= vma->vm_end)
                 return -EFAULT;
   
+ +      if (!pfn_modify_allowed(pfn, pgprot))
+ +              return -EACCES;
+ +
         track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
   
         ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
@@@ -1928,9 -1921,6 +1928,9 @@@ static int __vm_insert_mixed(struct vm_
   
         track_pfn_insert(vma, &pgprot, pfn);
   
+ +      if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+ +              return -EACCES;
+ +
         /*
          * If we don't have pte special, then we have to use the pfn_valid()
          * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
@@@ -1992,7 -1982,6 +1992,7 @@@ static int remap_pte_range(struct mm_st
   {
         pte_t *pte;
         spinlock_t *ptl;
+ +      int err = 0;
   
         pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
         if (!pte)
@@@ -2000,16 -1989,12 +2000,16 @@@
         arch_enter_lazy_mmu_mode();
         do {
                 BUG_ON(!pte_none(*pte));
+ +              if (!pfn_modify_allowed(pfn, prot)) {
+ +                      err = -EACCES;
+ +                      break;
+ +              }
                 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
                 pfn++;
         } while (pte++, addr += PAGE_SIZE, addr != end);
         arch_leave_lazy_mmu_mode();
         pte_unmap_unlock(pte - 1, ptl);
- -      return 0;
+ +      return err;
   }
   
   static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@@ -2018,7 -2003,6 +2018,7 @@@
   {
         pmd_t *pmd;
         unsigned long next;
+ +      int err;
   
         pfn -= addr >> PAGE_SHIFT;
         pmd = pmd_alloc(mm, pud, addr);
@@@ -2027,10 -2011,9 +2027,10 @@@
         VM_BUG_ON(pmd_trans_huge(*pmd));
         do {
                 next = pmd_addr_end(addr, end);
- -              if (remap_pte_range(mm, pmd, addr, next,
- -                              pfn + (addr >> PAGE_SHIFT), prot))
- -                      return -ENOMEM;
+ +              err = remap_pte_range(mm, pmd, addr, next,
+ +                              pfn + (addr >> PAGE_SHIFT), prot);
+ +              if (err)
+ +                      return err;
         } while (pmd++, addr = next, addr != end);
         return 0;
   }
@@@ -2041,7 -2024,6 +2041,7 @@@ static inline int remap_pud_range(struc
   {
         pud_t *pud;
         unsigned long next;
+ +      int err;
   
         pfn -= addr >> PAGE_SHIFT;
         pud = pud_alloc(mm, p4d, addr);
@@@ -2049,10 -2031,9 +2049,10 @@@
                 return -ENOMEM;
         do {
                 next = pud_addr_end(addr, end);
- -              if (remap_pmd_range(mm, pud, addr, next,
- -                              pfn + (addr >> PAGE_SHIFT), prot))
- -                      return -ENOMEM;
+ +              err = remap_pmd_range(mm, pud, addr, next,
+ +                              pfn + (addr >> PAGE_SHIFT), prot);
+ +              if (err)
+ +                      return err;
         } while (pud++, addr = next, addr != end);
         return 0;
   }
@@@ -2063,7 -2044,6 +2063,7 @@@ static inline int remap_p4d_range(struc
   {
         p4d_t *p4d;
         unsigned long next;
+ +      int err;
   
         pfn -= addr >> PAGE_SHIFT;
         p4d = p4d_alloc(mm, pgd, addr);
@@@ -2071,10 -2051,9 +2071,10 @@@
                 return -ENOMEM;
         do {
                 next = p4d_addr_end(addr, end);
- -              if (remap_pud_range(mm, p4d, addr, next,
- -                              pfn + (addr >> PAGE_SHIFT), prot))
- -                      return -ENOMEM;
+ +              err = remap_pud_range(mm, p4d, addr, next,
+ +                              pfn + (addr >> PAGE_SHIFT), prot);
+ +              if (err)
+ +                      return err;
         } while (p4d++, addr = next, addr != end);
         return 0;
   }
@@@ -2524,7 -2503,7 +2524,7 @@@ static int wp_page_copy(struct vm_faul
                 cow_user_page(new_page, old_page, vmf->address, vma);
         }
   
-       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
+       if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
                 goto oom_free_new;
   
         __SetPageUptodate(new_page);
@@@ -3024,8 -3003,8 +3024,8 @@@ int do_swap_page(struct vm_fault *vmf
                 goto out_page;
         }
   
-       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
-                               &memcg, false)) {
+       if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
+                                       &memcg, false)) {
                 ret = VM_FAULT_OOM;
                 goto out_page;
         }
@@@ -3186,7 -3165,8 +3186,8 @@@ static int do_anonymous_page(struct vm_
         if (!page)
                 goto oom;
   
-       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+       if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
+                                       false))
                 goto oom_free_page;
   
         /*
@@@ -3682,7 -3662,7 +3683,7 @@@ static int do_cow_fault(struct vm_faul
         if (!vmf->cow_page)
                 return VM_FAULT_OOM;
   
-       if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+       if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
                                 &vmf->memcg, false)) {
                 put_page(vmf->cow_page);
                 return VM_FAULT_OOM;
@@@ -4418,9 -4398,6 +4419,9 @@@ int generic_access_phys(struct vm_area_
                 return -EINVAL;
   
         maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
+ +      if (!maddr)
+ +              return -ENOMEM;
+ +
         if (write)
                 memcpy_toio(maddr + offset, buf, len);
         else
diff --combined mm/shmem.c

index 96bcc51,6206ca3..06ebe17
--- 1/mm/shmem.c
--- 2/mm/shmem.c
+++ b/mm/shmem.c
@@@ -1239,8 -1239,8 +1239,8 @@@ int shmem_unuse(swp_entry_t swap, struc
          * the shmem_swaplist_mutex which might hold up shmem_writepage().
          * Charged back to the user (not to caller) when swap account is used.
          */
-       error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
-                       false);
+       error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
+                                           &memcg, false);
         if (error)
                 goto out;
         /* No radix_tree_preload: swap entry keeps a place for page in tree */
@@@ -1421,7 -1421,6 +1421,7 @@@ static void shmem_pseudo_vma_init(struc
   {
         /* Create a pseudo vma that just contains the policy */
         memset(vma, 0, sizeof(*vma));
+ +      vma_init(vma, NULL);
         /* Bias interleave by inode number to distribute better across nodes */
         vma->vm_pgoff = index + info->vfs_inode.i_ino;
         vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
@@@ -1713,7 -1712,7 +1713,7 @@@ repeat
                                 goto failed;
                 }
   
-               error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+               error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
                                 false);
                 if (!error) {
                         error = shmem_add_to_page_cache(page, mapping, index,
@@@ -1819,7 -1818,7 +1819,7 @@@ alloc_nohuge:           page = shmem_alloc_and_a
                 if (sgp == SGP_WRITE)
                         __SetPageReferenced(page);
   
-               error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+               error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
                                 PageTransHuge(page));
                 if (error)
                         goto unacct;
@@@ -2292,7 -2291,7 +2292,7 @@@ static int shmem_mfill_atomic_pte(struc
         __SetPageSwapBacked(page);
         __SetPageUptodate(page);
   
-       ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+       ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
         if (ret)
                 goto out_release;
   
@@@ -3897,11 -3896,18 +3897,11 @@@ EXPORT_SYMBOL_GPL(shmem_truncate_range)
   
   /* common code */
   
- -static const struct dentry_operations anon_ops = {
- -      .d_dname = simple_dname
- -};
- -
   static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
                                        unsigned long flags, unsigned int i_flags)
   {
- -      struct file *res;
         struct inode *inode;
- -      struct path path;
- -      struct super_block *sb;
- -      struct qstr this;
+ +      struct file *res;
   
         if (IS_ERR(mnt))
                 return ERR_CAST(mnt);
@@@ -3912,21 -3918,41 +3912,21 @@@
         if (shmem_acct_size(flags, size))
                 return ERR_PTR(-ENOMEM);
   
- -      res = ERR_PTR(-ENOMEM);
- -      this.name = name;
- -      this.len = strlen(name);
- -      this.hash = 0; /* will go */
- -      sb = mnt->mnt_sb;
- -      path.mnt = mntget(mnt);
- -      path.dentry = d_alloc_pseudo(sb, &this);
- -      if (!path.dentry)
- -              goto put_memory;
- -      d_set_d_op(path.dentry, &anon_ops);
- -
- -      res = ERR_PTR(-ENOSPC);
- -      inode = shmem_get_inode(sb, NULL, S_IFREG | 0777, 0, flags);
- -      if (!inode)
- -              goto put_memory;
- -
+ +      inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
+ +                              flags);
+ +      if (unlikely(!inode)) {
+ +              shmem_unacct_size(flags, size);
+ +              return ERR_PTR(-ENOSPC);
+ +      }
         inode->i_flags |= i_flags;
- -      d_instantiate(path.dentry, inode);
         inode->i_size = size;
         clear_nlink(inode);     /* It is unlinked */
         res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
+ +      if (!IS_ERR(res))
+ +              res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
+ +                              &shmem_file_operations);
         if (IS_ERR(res))
- -              goto put_path;
- -
- -      res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
- -                &shmem_file_operations);
- -      if (IS_ERR(res))
- -              goto put_path;
- -
- -      return res;
- -
- -put_memory:
- -      shmem_unacct_size(flags, size);
- -put_path:
- -      path_put(&path);
+ +              iput(inode);
         return res;
   }
   
diff --combined mm/swapfile.c

index 18185ae,db4ec8a..8837b22
--- 1/mm/swapfile.c
--- 2/mm/swapfile.c
+++ b/mm/swapfile.c
@@@ -2909,35 -2909,6 +2909,35 @@@ static int claim_swapfile(struct swap_i
         return 0;
   }
   
+ +
+ +/*
+ + * Find out how many pages are allowed for a single swap device. There
+ + * are two limiting factors:
+ + * 1) the number of bits for the swap offset in the swp_entry_t type, and
+ + * 2) the number of bits in the swap pte, as defined by the different
+ + * architectures.
+ + *
+ + * In order to find the largest possible bit mask, a swap entry with
+ + * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
+ + * decoded to a swp_entry_t again, and finally the swap offset is
+ + * extracted.
+ + *
+ + * This will mask all the bits from the initial ~0UL mask that can't
+ + * be encoded in either the swp_entry_t or the architecture definition
+ + * of a swap pte.
+ + */
+ +unsigned long generic_max_swapfile_size(void)
+ +{
+ +      return swp_offset(pte_to_swp_entry(
+ +                      swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ +}
+ +
+ +/* Can be overridden by an architecture for additional checks. */
+ +__weak unsigned long max_swapfile_size(void)
+ +{
+ +      return generic_max_swapfile_size();
+ +}
+ +
   static unsigned long read_swap_header(struct swap_info_struct *p,
                                         union swap_header *swap_header,
                                         struct inode *inode)
@@@ -2973,7 -2944,22 +2973,7 @@@
         p->cluster_next = 1;
         p->cluster_nr = 0;
   
- -      /*
- -       * Find out how many pages are allowed for a single swap
- -       * device. There are two limiting factors: 1) the number
- -       * of bits for the swap offset in the swp_entry_t type, and
- -       * 2) the number of bits in the swap pte as defined by the
- -       * different architectures. In order to find the
- -       * largest possible bit mask, a swap entry with swap type 0
- -       * and swap offset ~0UL is created, encoded to a swap pte,
- -       * decoded to a swp_entry_t again, and finally the swap
- -       * offset is extracted. This will mask all the bits from
- -       * the initial ~0UL mask that can't be encoded in either
- -       * the swp_entry_t or the architecture definition of a
- -       * swap pte.
- -       */
- -      maxpages = swp_offset(pte_to_swp_entry(
- -                      swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ +      maxpages = max_swapfile_size();
         last_page = swap_header->info.last_page;
         if (!last_page) {
                 pr_warn("Empty swap-file\n");
@@@ -3745,6 -3731,37 +3745,37 @@@ static void free_swap_count_continuatio
         }
   }
   
+ #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+                                 gfp_t gfp_mask)
+ {
+       struct swap_info_struct *si, *next;
+       if (!(gfp_mask & __GFP_IO) || !memcg) /* nothing to do without __GFP_IO or without a memcg */
+               return;
+ 
+       if (!blk_cgroup_congested()) /* only throttle when the blk cgroup reports congestion */
+               return;
+ 
+       /*
+        * We've already scheduled a throttle, avoid taking the global swap
+        * lock.
+        */
+       if (current->throttle_queue)
+               return;
+ 
+       spin_lock(&swap_avail_lock);
+       plist_for_each_entry_safe(si, next, &swap_avail_heads[node], /* walk this node's available-swap list */
+                                 avail_lists[node]) {
+               if (si->bdev) { /* skip entries that have no block device */
+                       blkcg_schedule_throttle(bdev_get_queue(si->bdev),
+                                               true);
+                       break; /* only the first entry with a bdev is used */
+               }
+       }
+       spin_unlock(&swap_avail_lock);
+ }
+ #endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
+ 
   static int __init swapfile_init(void)
   {
         int nid;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 14 Aug 2018 17:23:25 +0000 (10:23 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 14 Aug 2018 17:23:25 +0000 (10:23 -0700)
		1	2
block/bio.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-core.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-mq-tag.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-mq.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/zram/zram_drv.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/nvme/host/fabrics.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/nvme/host/fc.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/nvme/host/rdma.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/nvme/target/configfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/nvme/target/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/nvme/target/loop.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/block_dev.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/super.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/sysfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blk-mq.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/memcontrol.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/memory.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/shmem.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/swapfile.c	patch \|	diff1 \|	diff2 \|	blob \| history