#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
+ #include <linux/blk-cgroup.h>
#include <trace/events/block.h>
#include "blk.h"
+ #include "blk-rq-qos.h"
/*
* Test patch to inline a certain number of bi_io_vec's inside the bio
unsigned int bvec_nr_vecs(unsigned short idx)
{
- return bvec_slabs[idx].nr_vecs;
+ return bvec_slabs[--idx].nr_vecs; /* NOTE(review): idx is decremented before the lookup, so callers presumably pass a 1-based slab index; idx == 0 would wrap - confirm against callers */
}
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
}
EXPORT_SYMBOL(bio_clone_fast);
- /**
- * bio_clone_bioset - clone a bio
- * @bio_src: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- *
- * Clone bio. Caller will own the returned bio, but not the actual data it
- * points to. Reference count of returned bio will be one.
- */
- struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs)
- {
- struct bvec_iter iter;
- struct bio_vec bv;
- struct bio *bio;
-
- /*
- * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
- * bio_src->bi_io_vec to bio->bi_io_vec.
- *
- * We can't do that anymore, because:
- *
- * - The point of cloning the biovec is to produce a bio with a biovec
- * the caller can modify: bi_idx and bi_bvec_done should be 0.
- *
- * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
- * we tried to clone the whole thing bio_alloc_bioset() would fail.
- * But the clone should succeed as long as the number of biovecs we
- * actually need to allocate is fewer than BIO_MAX_PAGES.
- *
- * - Lastly, bi_vcnt should not be looked at or relied upon by code
- * that does not own the bio - reason being drivers don't use it for
- * iterating over the biovec anymore, so expecting it to be kept up
- * to date (i.e. for clones that share the parent biovec) is just
- * asking for trouble and would force extra work on
- * __bio_clone_fast() anyways.
- */
-
- bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
- if (!bio)
- return NULL;
- bio->bi_disk = bio_src->bi_disk;
- bio->bi_opf = bio_src->bi_opf;
- bio->bi_write_hint = bio_src->bi_write_hint;
- bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
- bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
-
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- case REQ_OP_WRITE_ZEROES:
- break;
- case REQ_OP_WRITE_SAME:
- bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
- break;
- default:
- bio_for_each_segment(bv, bio_src, iter)
- bio->bi_io_vec[bio->bi_vcnt++] = bv;
- break;
- }
-
- if (bio_integrity(bio_src)) {
- int ret;
-
- ret = bio_integrity_clone(bio, bio_src, gfp_mask);
- if (ret < 0) {
- bio_put(bio);
- return NULL;
- }
- }
-
- bio_clone_blkcg_association(bio, bio_src);
-
- return bio;
- }
- EXPORT_SYMBOL(bio_clone_bioset);
-
/**
* bio_add_pc_page - attempt to add page to bio
* @q: the target queue
EXPORT_SYMBOL(bio_add_page);
/**
- * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
+ * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be mapped
*
- * Pins as many pages from *iter and appends them to @bio's bvec array. The
+ * Pins pages from *iter and appends them to @bio's bvec array. The
* pages will have to be released using put_page() when done.
+ * For multi-segment *iter, this function only adds pages from the
+ * next non-empty segment of the iov iterator.
*/
-int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
- unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
+ unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
- size_t offset, diff;
+ size_t offset;
ssize_t size;
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
if (unlikely(size <= 0))
return size ? size : -EFAULT;
- nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
+ idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
/*
* Deep magic below: We need to walk the pinned pages backwards
bio->bi_iter.bi_size += size;
bio->bi_vcnt += nr_pages;
- diff = (nr_pages * PAGE_SIZE - offset) - size;
- while (nr_pages--) {
- bv[nr_pages].bv_page = pages[nr_pages];
- bv[nr_pages].bv_len = PAGE_SIZE;
- bv[nr_pages].bv_offset = 0;
+ while (idx--) {
+ bv[idx].bv_page = pages[idx];
+ bv[idx].bv_len = PAGE_SIZE;
+ bv[idx].bv_offset = 0;
}
bv[0].bv_offset += offset;
bv[0].bv_len -= offset;
- if (diff)
- bv[bio->bi_vcnt - 1].bv_len -= diff;
+ bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size;
iov_iter_advance(iter, size);
return 0;
}
+
+/**
+ * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
+ * @bio: bio to add pages to
+ * @iter: iov iterator describing the region to be mapped
+ *
+ * Pins pages from *iter and appends them to @bio's bvec array. The
+ * pages will have to be released using put_page() when done.
+ * The function tries, but does not guarantee, to pin as many pages as
+ * fit into the bio, or are requested in *iter, whichever is smaller.
+ * If MM encounters an error pinning the requested pages, it stops.
+ * Error is returned only if 0 pages could be pinned.
+ *
+ * Internally this calls __bio_iov_iter_get_pages() repeatedly until *iter
+ * is exhausted (iov_iter_count() reaches 0) or bio_full() reports that
+ * @bio has no room left.
+ *
+ * Return: 0 if every call succeeded, or if a call failed after
+ * @bio->bi_vcnt had already grown past its value on entry; otherwise the
+ * non-zero value returned by the failing __bio_iov_iter_get_pages() call.
+ */
+int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+{
+ unsigned short orig_vcnt = bio->bi_vcnt; /* bvec count on entry, used to detect progress */
+
+ do {
+ int ret = __bio_iov_iter_get_pages(bio, iter);
+
+ if (unlikely(ret))
+ return bio->bi_vcnt > orig_vcnt ? 0 : ret; /* report the error only if nothing was added */
+
+ } while (iov_iter_count(iter) && !bio_full(bio));
+
+ return 0;
+}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
static void submit_bio_wait_endio(struct bio *bio)
int i;
bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (page && !PageCompound(page))
- set_page_dirty_lock(page);
+ if (!PageCompound(bvec->bv_page))
+ set_page_dirty_lock(bvec->bv_page);
}
}
EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
struct bio_vec *bvec;
int i;
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (page)
- put_page(page);
- }
+ bio_for_each_segment_all(bvec, bio, i)
+ put_page(bvec->bv_page);
}
/*
* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
* If they are, then fine. If, however, some pages are clean then they must
* have been written out during the direct-IO read. So we take another ref on
- * the BIO and the offending pages and re-dirty the pages in process context.
+ * the BIO and re-dirty the pages in process context.
*
* It is expected that bio_check_pages_dirty() will wholly own the BIO from
* here on. It will run one put_page() against each page and will run one
*/
static void bio_dirty_fn(struct work_struct *work)
{
- unsigned long flags;
- struct bio *bio;
+ struct bio *bio, *next; /* work handler: re-dirties and releases every bio queued on bio_dirty_list */
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio = bio_dirty_list;
+ spin_lock_irq(&bio_dirty_lock);
+ next = bio_dirty_list; /* take over the whole deferred list while holding the lock */
bio_dirty_list = NULL;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
+ spin_unlock_irq(&bio_dirty_lock);
- while (bio) {
- struct bio *next = bio->bi_private;
+ while ((bio = next) != NULL) {
+ next = bio->bi_private; /* deferred bios are chained through bi_private; fetch the link before bio_put() below */
bio_set_pages_dirty(bio);
bio_release_pages(bio);
bio_put(bio);
- bio = next;
}
}
void bio_check_pages_dirty(struct bio *bio)
{
struct bio_vec *bvec;
- int nr_clean_pages = 0;
+ unsigned long flags;
int i;
bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (PageDirty(page) || PageCompound(page)) {
- put_page(page);
- bvec->bv_page = NULL;
- } else {
- nr_clean_pages++;
- }
+ if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
+ goto defer; /* found a clean, non-compound page: defer the whole bio instead of handling pages one by one */
}
- if (nr_clean_pages) {
- unsigned long flags;
-
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio->bi_private = bio_dirty_list;
- bio_dirty_list = bio;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
- schedule_work(&bio_dirty_work);
- } else {
- bio_put(bio);
- }
+ bio_release_pages(bio); /* every page was dirty or compound: drop the page references right here */
+ bio_put(bio);
+ return;
+ defer:
+ spin_lock_irqsave(&bio_dirty_lock, flags);
+ bio->bi_private = bio_dirty_list; /* push onto the singly linked deferred list, linked through bi_private */
+ bio_dirty_list = bio;
+ spin_unlock_irqrestore(&bio_dirty_lock, flags);
+ schedule_work(&bio_dirty_work); /* presumably bound to bio_dirty_fn(), which re-dirties, releases and puts the bio - confirm */
}
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
- void generic_start_io_acct(struct request_queue *q, int rw,
+ void generic_start_io_acct(struct request_queue *q, int op,
unsigned long sectors, struct hd_struct *part)
{
+ const int sgrp = op_stat_group(op); /* index into the per-partition ios[]/sectors[] counters, derived from op */
int cpu = part_stat_lock();
part_round_stats(q, cpu, part);
- part_stat_inc(cpu, part, ios[rw]);
- part_stat_add(cpu, part, sectors[rw], sectors);
- part_inc_in_flight(q, part, rw);
+ part_stat_inc(cpu, part, ios[sgrp]);
+ part_stat_add(cpu, part, sectors[sgrp], sectors);
+ part_inc_in_flight(q, part, op_is_write(op)); /* in-flight accounting is keyed by op_is_write(op), not by the stat group */
part_stat_unlock();
}
EXPORT_SYMBOL(generic_start_io_acct);
- void generic_end_io_acct(struct request_queue *q, int rw,
+ void generic_end_io_acct(struct request_queue *q, int req_op,
struct hd_struct *part, unsigned long start_time)
{
unsigned long duration = jiffies - start_time;
+ const int sgrp = op_stat_group(req_op); /* index into the per-partition ticks[] counters, derived from req_op */
int cpu = part_stat_lock();
- part_stat_add(cpu, part, ticks[rw], duration);
+ part_stat_add(cpu, part, ticks[sgrp], duration);
part_round_stats(q, cpu, part);
- part_dec_in_flight(q, part, rw);
+ part_dec_in_flight(q, part, op_is_write(req_op)); /* mirrors generic_start_io_acct(): in-flight is keyed by op_is_write(), not by the stat group */
part_stat_unlock();
}
if (!bio_integrity_endio(bio))
return;
+ if (bio->bi_disk)
+ rq_qos_done_bio(bio->bi_disk->queue, bio);
+
/*
* Need to have a real endio function for chained bios, otherwise
* various corner cases will break (like stacking block devices that
bio_integrity_trim(split);
bio_advance(bio, split->bi_iter.bi_size);
+ bio->bi_iter.bi_done = 0;
if (bio_flagged(bio, BIO_TRACE_COMPLETION))
bio_set_flag(split, BIO_TRACE_COMPLETION);
#ifdef CONFIG_BLK_CGROUP
+ #ifdef CONFIG_MEMCG
+ /**
+ * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
+ * @bio: target bio
+ * @page: the page to lookup the blkcg from
+ *
+ * Associate @bio with the blkcg from @page's owning memcg. This works like
+ * every other associate function wrt references.
+ *
+ * If @page->mem_cgroup is NULL, @bio is left unassociated and 0 is
+ * returned.
+ *
+ * Return: -EBUSY if @bio->bi_css is already set, 0 otherwise.
+ */
+ int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
+ {
+ struct cgroup_subsys_state *blkcg_css;
+
+ if (unlikely(bio->bi_css))
+ return -EBUSY;
+ if (!page->mem_cgroup)
+ return 0;
+ blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
+ &io_cgrp_subsys); /* NOTE(review): presumably returns the io css for the memcg's cgroup with a reference held - confirm */
+ bio->bi_css = blkcg_css;
+ return 0;
+ }
+ #endif /* CONFIG_MEMCG */
+
/**
* bio_associate_blkcg - associate a bio with the specified blkcg
* @bio: target bio
}
EXPORT_SYMBOL_GPL(bio_associate_blkcg);
+ /**
+ * bio_associate_blkg - associate a bio with the specified blkg
+ * @bio: target bio
+ * @blkg: the blkg to associate
+ *
+ * Associate @bio with the blkg specified by @blkg. This is the queue specific
+ * blkcg information associated with the @bio, a reference will be taken on the
+ * @blkg and will be freed when the bio is freed.
+ *
+ * Return: -EBUSY if @bio->bi_blkg is already set (no reference is taken in
+ * that case), 0 once @blkg has been stored in @bio->bi_blkg.
+ */
+ int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+ {
+ if (unlikely(bio->bi_blkg))
+ return -EBUSY;
+ blkg_get(blkg); /* the reference described above; taken before publishing the pointer */
+ bio->bi_blkg = blkg;
+ return 0;
+ }
+
/**
* bio_disassociate_task - undo bio_associate_current()
* @bio: target bio
css_put(bio->bi_css);
bio->bi_css = NULL;
}
+ if (bio->bi_blkg) {
+ blkg_put(bio->bi_blkg);
+ bio->bi_blkg = NULL;
+ }
}
/**
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
- #include "blk-wbt.h"
+ #include "blk-rq-qos.h"
#ifdef CONFIG_DEBUG_FS
struct dentry *blk_debugfs_root;
}
EXPORT_SYMBOL_GPL(blk_set_queue_dying);
+ /*
+ * Unconfigure the I/O scheduler and dissociate from the cgroup controller.
+ * Finally drops a reference on @q->backing_dev_info with bdi_put(). The
+ * order of the three steps matters; see the comments in the body.
+ */
+ void blk_exit_queue(struct request_queue *q)
+ {
+ /*
+ * Since the I/O scheduler exit code may access cgroup information,
+ * perform I/O scheduler exit before disassociating from the block
+ * cgroup controller.
+ */
+ if (q->elevator) {
+ ioc_clear_queue(q);
+ elevator_exit(q, q->elevator);
+ q->elevator = NULL;
+ }
+
+ /*
+ * Remove all references to @q from the block cgroup controller before
+ * restoring @q->queue_lock to avoid that restoring this pointer causes
+ * e.g. blkcg_print_blkgs() to crash.
+ */
+ blkcg_exit_queue(q);
+
+ /*
+ * Since the cgroup code may dereference the @q->backing_dev_info
+ * pointer, only decrease its reference count after having removed the
+ * association with the block cgroup controller.
+ */
+ bdi_put(q->backing_dev_info);
+ }
+
/**
* blk_cleanup_queue - shutdown a request queue
* @q: request queue to shutdown
* make sure all in-progress dispatch are completed because
* blk_freeze_queue() can only complete all requests, and
* dispatch may still be in-progress since we dispatch requests
- * from more than one contexts
+ * from more than one context.
+ *
+ * No need to quiesce queue if it isn't initialized yet since
+ * blk_freeze_queue() should be enough for cases of passthrough
+ * request.
*/
- if (q->mq_ops)
+ if (q->mq_ops && blk_queue_init_done(q))
blk_mq_quiesce_queue(q);
/* for synchronous bio-based driver finish in-flight integrity i/o */
*/
WARN_ON_ONCE(q->kobj.state_in_sysfs);
- /*
- * Since the I/O scheduler exit code may access cgroup information,
- * perform I/O scheduler exit before disassociating from the block
- * cgroup controller.
- */
- if (q->elevator) {
- ioc_clear_queue(q);
- elevator_exit(q, q->elevator);
- q->elevator = NULL;
- }
-
- /*
- * Remove all references to @q from the block cgroup controller before
- * restoring @q->queue_lock to avoid that restoring this pointer causes
- * e.g. blkcg_print_blkgs() to crash.
- */
- blkcg_exit_queue(q);
-
- /*
- * Since the cgroup code may dereference the @q->backing_dev_info
- * pointer, only decrease its reference count after having removed the
- * association with the block cgroup controller.
- */
- bdi_put(q->backing_dev_info);
+ blk_exit_queue(q);
if (q->mq_ops)
blk_mq_free_queue(q);
q->exit_rq_fn(q, q->fq->flush_rq);
out_free_flush_queue:
blk_free_flush_queue(q->fq);
+ q->fq = NULL;
return -ENOMEM;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
- wbt_requeue(q->rq_wb, rq);
+ rq_qos_requeue(q, rq);
if (rq->rq_flags & RQF_QUEUED)
blk_queue_end_tag(q, rq);
/* this is a bio leak */
WARN_ON(req->bio != NULL);
- wbt_done(q->rq_wb, req);
+ rq_qos_done(q, req);
/*
* Request may not have originated from ll_rw_blk. if not,
int where = ELEVATOR_INSERT_SORT;
struct request *req, *free;
unsigned int request_count = 0;
- unsigned int wb_acct;
/*
* low level driver can indicate that it wants pages above a
}
get_rq:
- wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
+ rq_qos_throttle(q, bio, q->queue_lock);
/*
* Grab a free request. This is might sleep but can not fail.
req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
if (IS_ERR(req)) {
blk_queue_exit(q);
- __wbt_done(q->rq_wb, wb_acct);
+ rq_qos_cleanup(q, bio);
if (PTR_ERR(req) == -ENOMEM)
bio->bi_status = BLK_STS_RESOURCE;
else
goto out_unlock;
}
- wbt_track(req, wb_acct);
+ rq_qos_track(q, req, bio);
/*
* After dropping the lock and possibly sleeping here, our request
if (part->policy && op_is_write(bio_op(bio))) {
char b[BDEVNAME_SIZE];
- printk(KERN_ERR
+ WARN_ONCE(1,
"generic_make_request: Trying to write "
"to read-only block-device %s (partno %d)\n",
bio_devname(bio, b), part->partno);
- return true;
+ /* Older lvm-tools actually trigger this */
+ return false;
}
return false;
void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (blk_do_io_stat(req)) {
- const int rw = rq_data_dir(req);
+ const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
int cpu;
cpu = part_stat_lock();
part = req->part;
- part_stat_add(cpu, part, sectors[rw], bytes >> 9);
+ part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
part_stat_unlock();
}
}
*/
if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
unsigned long duration;
- const int rw = rq_data_dir(req);
+ const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
int cpu;
cpu = part_stat_lock();
part = req->part;
- part_stat_inc(cpu, part, ios[rw]);
- part_stat_add(cpu, part, ticks[rw], duration);
+ part_stat_inc(cpu, part, ios[sgrp]);
+ part_stat_add(cpu, part, ticks[sgrp], duration);
part_round_stats(req->q, cpu, part);
- part_dec_in_flight(req->q, part, rw);
+ part_dec_in_flight(req->q, part, rq_data_dir(req));
hd_struct_put(part);
part_stat_unlock();
return rq->rq_flags & RQF_PM;
case RPM_SUSPENDED:
return false;
+ default:
+ return true;
}
-
- return true;
}
#else
static bool blk_pm_allow_request(struct request *rq)
req->throtl_size = blk_rq_sectors(req);
#endif
req->rq_flags |= RQF_STATS;
- wbt_issue(req->q->rq_wb, req);
+ rq_qos_issue(req->q, req);
}
BUG_ON(blk_rq_is_complete(req));
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
* %false return from this function.
*
+ * Note:
+ * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
+ * blk_rq_bytes() and in blk_update_request().
+ *
* Return:
* %false - this request doesn't have any more data
* %true - this request has more data
blk_account_io_done(req, now);
if (req->end_io) {
- wbt_done(req->q->rq_wb, req);
+ rq_qos_done(q, req);
req->end_io(req, error);
} else {
if (blk_bidi_rq(req))
*/
void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
{
- /* not support for RQF_PM and ->rpm_status in blk-mq yet */
- if (q->mq_ops)
+ /* Don't enable runtime PM for blk-mq until it is ready */
+ if (q->mq_ops) {
+ pm_runtime_disable(dev);
return;
+ }
q->dev = dev;
q->rpm_status = RPM_ACTIVE;
/*
* If a previously inactive queue goes active, bump the active user count.
+ * We need to do this before trying to allocate a driver tag, so that even
+ * if the first attempt to get a tag fails, the other shared-tag users can
+ * reserve budget for it.
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
* test and set the bit before assining ->rqs[].
*/
rq = tags->rqs[bitnr];
- if (rq && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
+ if (rq && blk_mq_request_started(rq))
iter_data->fn(rq, iter_data->data, reserved);
return true;
if (tdepth <= tags->nr_reserved_tags)
return -EINVAL;
- tdepth -= tags->nr_reserved_tags;
-
/*
* If we are allowed to grow beyond the original size, allocate
* a new set of tags before freeing the old one.
if (tdepth > 16 * BLKDEV_MAX_RQ)
return -EINVAL;
- new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0);
+ new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
+ tags->nr_reserved_tags);
if (!new)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
* Don't need (or can't) update reserved tags here, they
* remain static and should never need resizing.
*/
- sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+ sbitmap_queue_resize(&tags->bitmap_tags,
+ tdepth - tags->nr_reserved_tags);
}
return 0;
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"
- #include "blk-wbt.h"
#include "blk-mq-sched.h"
+ #include "blk-rq-qos.h"
static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
static void blk_mq_poll_stats_start(struct request_queue *q);
rq->tag = -1;
rq->internal_tag = tag;
} else {
- if (blk_mq_tag_busy(data->hctx)) {
+ if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
rq_flags = RQF_MQ_INFLIGHT;
atomic_inc(&data->hctx->nr_active);
}
if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
!(data->flags & BLK_MQ_REQ_RESERVED))
e->type->ops.mq.limit_depth(op, data);
+ } else {
+ blk_mq_tag_busy(data->hctx);
}
tag = blk_mq_get_tag(data);
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->backing_dev_info);
- wbt_done(q->rq_wb, rq);
+ rq_qos_done(q, rq);
if (blk_rq_rl(rq))
blk_put_rl(blk_rq_rl(rq));
blk_account_io_done(rq, now);
if (rq->end_io) {
- wbt_done(rq->q->rq_wb, rq);
+ rq_qos_done(rq->q, rq);
rq->end_io(rq, error);
} else {
if (unlikely(blk_bidi_rq(rq)))
bool shared = false;
int cpu;
- if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) !=
- MQ_RQ_IN_FLIGHT)
+ if (!blk_mq_mark_complete(rq))
return;
-
if (rq->internal_tag != -1)
blk_mq_sched_completed_request(rq);
rq->throtl_size = blk_rq_sectors(rq);
#endif
rq->rq_flags |= RQF_STATS;
- wbt_issue(q->rq_wb, rq);
+ rq_qos_issue(q, rq);
}
WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
blk_mq_put_driver_tag(rq);
trace_block_rq_requeue(q, rq);
- wbt_requeue(q->rq_wb, rq);
+ rq_qos_requeue(q, rq);
if (blk_mq_request_started(rq)) {
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
- bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
- bool wait)
+ bool blk_mq_get_driver_tag(struct request *rq)
{
struct blk_mq_alloc_data data = {
.q = rq->q,
.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
- .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+ .flags = BLK_MQ_REQ_NOWAIT,
};
-
- might_sleep_if(wait);
+ bool shared;
if (rq->tag != -1)
goto done;
if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
data.flags |= BLK_MQ_REQ_RESERVED;
+ shared = blk_mq_tag_busy(data.hctx);
rq->tag = blk_mq_get_tag(&data);
if (rq->tag >= 0) {
- if (blk_mq_tag_busy(data.hctx)) {
+ if (shared) {
rq->rq_flags |= RQF_MQ_INFLIGHT;
atomic_inc(&data.hctx->nr_active);
}
}
done:
- if (hctx)
- *hctx = data.hctx;
return rq->tag != -1;
}
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+ spin_lock(&hctx->dispatch_wait_lock);
list_del_init(&wait->entry);
+ spin_unlock(&hctx->dispatch_wait_lock);
+
blk_mq_run_hw_queue(hctx, true);
return 1;
}
* restart. For both cases, take care to check the condition again after
* marking us as waiting.
*/
- static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
+ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- struct blk_mq_hw_ctx *this_hctx = *hctx;
- struct sbq_wait_state *ws;
+ struct wait_queue_head *wq;
wait_queue_entry_t *wait;
bool ret;
- if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
- set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
+ if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
+ if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+ set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
/*
* It's possible that a tag was freed in the window between the
* Don't clear RESTART here, someone else could have set it.
* At most this will cost an extra queue run.
*/
- return blk_mq_get_driver_tag(rq, hctx, false);
+ return blk_mq_get_driver_tag(rq);
}
- wait = &this_hctx->dispatch_wait;
+ wait = &hctx->dispatch_wait;
if (!list_empty_careful(&wait->entry))
return false;
- spin_lock(&this_hctx->lock);
+ wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
+
+ spin_lock_irq(&wq->lock);
+ spin_lock(&hctx->dispatch_wait_lock);
if (!list_empty(&wait->entry)) {
- spin_unlock(&this_hctx->lock);
+ spin_unlock(&hctx->dispatch_wait_lock);
+ spin_unlock_irq(&wq->lock);
return false;
}
- ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
- add_wait_queue(&ws->wait, wait);
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue(wq, wait);
/*
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
*/
- ret = blk_mq_get_driver_tag(rq, hctx, false);
+ ret = blk_mq_get_driver_tag(rq);
if (!ret) {
- spin_unlock(&this_hctx->lock);
+ spin_unlock(&hctx->dispatch_wait_lock);
+ spin_unlock_irq(&wq->lock);
return false;
}
* We got a tag, remove ourselves from the wait queue to ensure
* someone else gets the wakeup.
*/
- spin_lock_irq(&ws->wait.lock);
list_del_init(&wait->entry);
- spin_unlock_irq(&ws->wait.lock);
- spin_unlock(&this_hctx->lock);
+ spin_unlock(&hctx->dispatch_wait_lock);
+ spin_unlock_irq(&wq->lock);
return true;
}
+ #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
+ #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
+ /*
+ * Update hctx->dispatch_busy as an Exponential Weighted Moving Average (EWMA):
+ * - EWMA is a simple way to compute a running average
+ * - weights of 7/8 (old value) and 1/8 (new sample) are applied, so the
+ *   value decays exponentially while the queue is not busy
+ * - a busy sample contributes 1 << 4 rather than 1 so that the integer
+ *   result does not collapse to 0 too quickly; the exact factor doesn't
+ *   matter because the EWMA decays exponentially
+ *
+ * i.e. new = (old * 7 + (busy ? 16 : 0)) / 8.
+ *
+ * Nothing is tracked when the queue has an elevator, and a value of 0 is
+ * left untouched by a non-busy sample.
+ */
+ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
+ {
+ unsigned int ewma;
+
+ if (hctx->queue->elevator)
+ return;
+
+ ewma = hctx->dispatch_busy;
+
+ if (!ewma && !busy)
+ return;
+
+ ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
+ if (busy)
+ ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
+ ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
+
+ hctx->dispatch_busy = ewma;
+ }
+
#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
/*
if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
break;
- if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+ if (!blk_mq_get_driver_tag(rq)) {
/*
* The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed. The
* before we add this entry back on the dispatch list,
* we'll re-run it below.
*/
- if (!blk_mq_mark_tag_wait(&hctx, rq)) {
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
blk_mq_put_dispatch_budget(hctx);
/*
* For non-shared tags, the RESTART check
bd.last = true;
else {
nxt = list_first_entry(list, struct request, queuelist);
- bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
+ bd.last = !blk_mq_get_driver_tag(nxt);
}
ret = q->mq_ops->queue_rq(hctx, &bd);
else if (needs_restart && (ret == BLK_STS_RESOURCE))
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
+ blk_mq_update_dispatch_busy(hctx, true);
return false;
- }
+ } else
+ blk_mq_update_dispatch_busy(hctx, false);
/*
* If the host/device is unable to accept more work, inform the
struct list_head *list)
{
+ struct request *rq;
+
/*
* preemption doesn't flush plug list, so it's possible ctx->cpu is
* offline now
*/
- spin_lock(&ctx->lock);
- while (!list_empty(list)) {
- struct request *rq;
-
- rq = list_first_entry(list, struct request, queuelist);
+ list_for_each_entry(rq, list, queuelist) {
BUG_ON(rq->mq_ctx != ctx);
- list_del_init(&rq->queuelist);
- __blk_mq_insert_req_list(hctx, rq, false);
+ trace_block_rq_insert(hctx->queue, rq);
}
+
+ spin_lock(&ctx->lock);
+ list_splice_tail_init(list, &ctx->rq_list);
blk_mq_hctx_mark_pending(hctx, ctx);
spin_unlock(&ctx->lock);
}
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
+ blk_mq_update_dispatch_busy(hctx, false);
*cookie = new_cookie;
break;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
+ blk_mq_update_dispatch_busy(hctx, true);
__blk_mq_requeue_request(rq);
break;
default:
+ blk_mq_update_dispatch_busy(hctx, false);
*cookie = BLK_QC_T_NONE;
break;
}
if (!blk_mq_get_dispatch_budget(hctx))
goto insert;
- if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+ if (!blk_mq_get_driver_tag(rq)) {
blk_mq_put_dispatch_budget(hctx);
goto insert;
}
return ret;
}
+ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list)
+ {
+ while (!list_empty(list)) {
+ blk_status_t ret;
+ struct request *rq = list_first_entry(list, struct request,
+ queuelist);
+ /*
+ * Issue the requests on @list one by one, starting from the head.
+ * Each request is unlinked before being handed to
+ * blk_mq_request_issue_directly(). On BLK_STS_RESOURCE or
+ * BLK_STS_DEV_RESOURCE it is put back at the head of @list and the
+ * loop stops, leaving the remaining requests on @list for the
+ * caller; any other non-OK status ends the request with that status.
+ * NOTE(review): @hctx is not referenced anywhere in this body.
+ */
+ list_del_init(&rq->queuelist);
+ ret = blk_mq_request_issue_directly(rq);
+ if (ret != BLK_STS_OK) {
+ if (ret == BLK_STS_RESOURCE ||
+ ret == BLK_STS_DEV_RESOURCE) {
+ list_add(&rq->queuelist, list);
+ break;
+ }
+ blk_mq_end_request(rq, ret);
+ }
+ }
+ }
+
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
- unsigned int wb_acct;
blk_queue_bounce(q, &bio);
if (blk_mq_sched_bio_merge(q, bio))
return BLK_QC_T_NONE;
- wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+ rq_qos_throttle(q, bio, NULL);
trace_block_getrq(q, bio, bio->bi_opf);
rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
if (unlikely(!rq)) {
- __wbt_done(q->rq_wb, wb_acct);
+ rq_qos_cleanup(q, bio);
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
return BLK_QC_T_NONE;
}
- wbt_track(rq, wb_acct);
+ rq_qos_track(q, rq, bio);
cookie = request_to_qc_t(data.hctx, rq);
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
}
- } else if (q->nr_hw_queues > 1 && is_sync) {
+ } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
+ !data.hctx->dispatch_busy)) {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
hctx->nr_ctx = 0;
+ spin_lock_init(&hctx->dispatch_wait_lock);
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- if (shared) {
- if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_inc(&q->shared_hctx_restart);
+ if (shared)
hctx->flags |= BLK_MQ_F_TAG_SHARED;
- } else {
- if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_dec(&q->shared_hctx_restart);
+ else
hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
- }
}
}
blk_mq_update_tag_set_depth(set, false);
}
mutex_unlock(&set->tag_list_lock);
- synchronize_rcu();
INIT_LIST_HEAD(&q->tag_set_list);
}
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
if (set->ops->map_queues) {
- int cpu;
/*
* transport .map_queues is usually done in the following
* way:
* killing stale mapping since one CPU may not be mapped
* to any hw queue.
*/
- for_each_possible_cpu(cpu)
- set->mq_map[cpu] = 0;
+ blk_mq_clear_mq_map(set);
return set->ops->map_queues(set);
} else
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
- * requested depth down, if if it too large. In that case, the set
+ * requested depth down, if it's too large. In that case, the set
* value will be stored in set->queue_depth.
*/
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
zram->backing_dev = NULL;
zram->old_block_size = 0;
zram->bdev = NULL;
-
+ zram->disk->queue->backing_dev_info->capabilities |=
+ BDI_CAP_SYNCHRONOUS_IO;
kvfree(zram->bitmap);
zram->bitmap = NULL;
}
zram->backing_dev = backing_dev;
zram->bitmap = bitmap;
zram->nr_pages = nr_pages;
+ /*
+ * With the writeback feature, zram does asynchronous IO, so it is no
+ * longer a synchronous device; remove the synchronous IO flag.
+ * Otherwise, the upper layer (e.g., swap) could wait for IO completion
+ * rather than submit and return, which makes the system sluggish.
+ * Furthermore, when the IO function returns (e.g., swap_readpage),
+ * the upper layer expects the IO to be done and may free the page,
+ * while in fact the IO is still in flight; this can end in a
+ * use-after-free when the IO really completes.
+ */
+ zram->disk->queue->backing_dev_info->capabilities &=
+ ~BDI_CAP_SYNCHRONOUS_IO;
up_write(&zram->init_lock);
pr_info("setup backing device %s\n", file_name);
* Returns 1 if IO request was successfully submitted.
*/
static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
- int offset, bool is_write, struct bio *bio)
+ int offset, unsigned int op, struct bio *bio)
{
unsigned long start_time = jiffies;
- int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
struct request_queue *q = zram->disk->queue;
int ret;
- generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
+ generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT,
&zram->disk->part0);
- if (!is_write) {
+ if (!op_is_write(op)) {
atomic64_inc(&zram->stats.num_reads);
ret = zram_bvec_read(zram, bvec, index, offset, bio);
flush_dcache_page(bvec->bv_page);
ret = zram_bvec_write(zram, bvec, index, offset, bio);
}
- generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
+ generic_end_io_acct(q, op, &zram->disk->part0, start_time);
zram_slot_lock(zram, index);
zram_accessed(zram, index);
zram_slot_unlock(zram, index);
if (unlikely(ret < 0)) {
- if (!is_write)
+ if (!op_is_write(op))
atomic64_inc(&zram->stats.failed_reads);
else
atomic64_inc(&zram->stats.failed_writes);
bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
unwritten);
if (zram_bvec_rw(zram, &bv, index, offset,
- op_is_write(bio_op(bio)), bio) < 0)
+ bio_op(bio), bio) < 0)
goto out;
bv.bv_offset += bv.bv_len;
}
static int zram_rw_page(struct block_device *bdev, sector_t sector,
- struct page *page, bool is_write)
+ struct page *page, unsigned int op)
{
int offset, ret;
u32 index;
bv.bv_len = PAGE_SIZE;
bv.bv_offset = 0;
- ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
+ ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
out:
/*
* If I/O fails, just return error(ie, non-zero) without
switch (ret) {
case 0:
- page_endio(page, is_write, 0);
+ page_endio(page, op_is_write(op), 0);
break;
case 1:
ret = 0;
bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
{
- if (ctrl->opts->max_reconnects != -1 &&
+ if (ctrl->opts->max_reconnects == -1 ||
ctrl->nr_reconnects < ctrl->opts->max_reconnects)
return true;
/*
* For something we're not in a state to send to the device the default action
* is to busy it and retry it after the controller state is recovered. However,
- * anything marked for failfast or nvme multipath is immediately failed.
+ * if the controller is deleting or if anything is marked for failfast or
+ * nvme multipath it is immediately failed.
*
* Note: commands used to initialize the controller will be marked for failfast.
* Note: nvme cli/ioctl commands are marked for failfast.
*/
-blk_status_t nvmf_fail_nonready_command(struct request *rq)
+blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
+ struct request *rq)
{
- if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
+ if (ctrl->state != NVME_CTRL_DELETING &&
+ ctrl->state != NVME_CTRL_DEAD &&
+ !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;
nvme_req(rq)->status = NVME_SC_ABORT_REQ;
return BLK_STS_IOERR;
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
struct nvme_fc_queue *queue = &ctrl->queues[queue_idx];
+ nvme_req(rq)->ctrl = &ctrl->ctrl;
return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
}
if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE ||
!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
- return nvmf_fail_nonready_command(rq);
+ return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
ret = nvme_setup_cmd(ns, rq, sqe);
if (ret)
#define NVME_RDMA_MAX_SEGMENTS 256
- #define NVME_RDMA_MAX_INLINE_SEGMENTS 1
+ #define NVME_RDMA_MAX_INLINE_SEGMENTS 4
struct nvme_rdma_device {
struct ib_device *dev;
struct ib_pd *pd;
struct kref ref;
struct list_head entry;
+ unsigned int num_inline_segments;
};
struct nvme_rdma_qe {
struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
+ bool use_inline_data;
};
static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
/* +1 for drain */
init_attr.cap.max_recv_wr = queue->queue_size + 1;
init_attr.cap.max_recv_sge = 1;
- init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
+ init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
init_attr.qp_type = IB_QPT_RC;
init_attr.send_cq = queue->ib_cq;
struct ib_device *ibdev = dev->dev;
int ret;
+ nvme_req(rq)->ctrl = &ctrl->ctrl;
ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
DMA_TO_DEVICE);
if (ret)
goto out_free_pd;
}
+ ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
+ ndev->dev->attrs.max_sge - 1);
list_add(&ndev->entry, &device_list);
out_unlock:
mutex_unlock(&device_list_mutex);
return ret;
}
+ /*
+  * Quiesce the admin request queue, stop RDMA queue 0 and run
+  * nvme_cancel_request() over every busy tag of the admin tag set.  The
+  * request queue is then unquiesced again and the admin queue destroyed;
+  * @remove is forwarded unchanged to nvme_rdma_destroy_admin_queue().
+  */
+ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
+ 		bool remove)
+ {
+ 	struct nvme_ctrl *nctrl = &ctrl->ctrl;
+
+ 	blk_mq_quiesce_queue(nctrl->admin_q);
+ 	nvme_rdma_stop_queue(&ctrl->queues[0]);
+ 	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
+ 			nctrl);
+ 	blk_mq_unquiesce_queue(nctrl->admin_q);
+ 	nvme_rdma_destroy_admin_queue(ctrl, remove);
+ }
+
+ /*
+  * Stop the I/O queues and run nvme_cancel_request() over every busy tag
+  * of the I/O tag set, then destroy the I/O queues.  Nothing is done when
+  * the controller has no I/O queues (queue_count <= 1).  When @remove is
+  * set, nvme_start_queues() is called before the queues are destroyed;
+  * @remove is also forwarded to nvme_rdma_destroy_io_queues().
+  */
+ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
+ 		bool remove)
+ {
+ 	struct nvme_ctrl *nctrl = &ctrl->ctrl;
+
+ 	if (nctrl->queue_count <= 1)
+ 		return;
+
+ 	nvme_stop_queues(nctrl);
+ 	nvme_rdma_stop_io_queues(ctrl);
+ 	blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, nctrl);
+ 	if (remove)
+ 		nvme_start_queues(nctrl);
+ 	nvme_rdma_destroy_io_queues(ctrl, remove);
+ }
+
static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
}
}
- static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+ /*
+ * Common controller bring-up used for both first-time setup and
+ * reset/reconnect: configure the admin queue, sanity-check what the
+ * controller reported, configure the I/O queues (if any) and start the
+ * controller.  @new is forwarded to the queue configure/destroy helpers.
+ *
+ * Returns 0 on success.  On failure the queues set up here are destroyed
+ * again and a non-zero error is returned: the failing helper's return
+ * value, or -EINVAL for a failed sanity check or state change.
+ */
+ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
{
- struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
- struct nvme_rdma_ctrl, reconnect_work);
+ int ret = -EINVAL;
bool changed;
- int ret;
- ++ctrl->ctrl.nr_reconnects;
-
- ret = nvme_rdma_configure_admin_queue(ctrl, false);
+ ret = nvme_rdma_configure_admin_queue(ctrl, new);
if (ret)
- goto requeue;
+ return ret;
+
+ /*
+ * ret is 0 from here on, so every sanity-check failure below must set
+ * an error code before jumping to the unwind labels; otherwise the
+ * caller would see success for a controller that was just torn down.
+ */
+ if (ctrl->ctrl.icdoff) {
+ dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
+ ret = -EINVAL;
+ goto destroy_admin;
+ }
+
+ if (!(ctrl->ctrl.sgls & (1 << 2))) {
+ dev_err(ctrl->ctrl.device,
+ "Mandatory keyed sgls are not supported!\n");
+ ret = -EINVAL;
+ goto destroy_admin;
+ }
+
+ /* only warn here; nothing is modified for an oversized queue_size */
+ if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
+ dev_warn(ctrl->ctrl.device,
+ "queue_size %zu > ctrl sqsize %u, clamping down\n",
+ ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
+ }
+
+ /* sqsize is 0's based here: clamp it so sqsize + 1 fits in maxcmd */
+ if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
+ dev_warn(ctrl->ctrl.device,
+ "sqsize %u > ctrl maxcmd %u, clamping down\n",
+ ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
+ ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
+ }
+
+ /* bit 20 of the reported SGL support field enables inline data */
+ if (ctrl->ctrl.sgls & (1 << 20))
+ ctrl->use_inline_data = true;
if (ctrl->ctrl.queue_count > 1) {
- ret = nvme_rdma_configure_io_queues(ctrl, false);
+ ret = nvme_rdma_configure_io_queues(ctrl, new);
if (ret)
goto destroy_admin;
}
if (!changed) {
/* state change failure is ok if we're in DELETING state */
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
- return;
+ ret = -EINVAL;
+ goto destroy_io;
}
nvme_start_ctrl(&ctrl->ctrl);
+ return 0;
+
+ destroy_io:
+ if (ctrl->ctrl.queue_count > 1)
+ nvme_rdma_destroy_io_queues(ctrl, new);
+ destroy_admin:
+ nvme_rdma_stop_queue(&ctrl->queues[0]);
+ nvme_rdma_destroy_admin_queue(ctrl, new);
+ return ret;
+ }
+
+ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+ {
+ struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
+ struct nvme_rdma_ctrl, reconnect_work);
+
+ ++ctrl->ctrl.nr_reconnects;
+
+ if (nvme_rdma_setup_ctrl(ctrl, false))
+ goto requeue;
dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
ctrl->ctrl.nr_reconnects);
return;
- destroy_admin:
- nvme_rdma_stop_queue(&ctrl->queues[0]);
- nvme_rdma_destroy_admin_queue(ctrl, false);
requeue:
dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
ctrl->ctrl.nr_reconnects);
struct nvme_rdma_ctrl, err_work);
nvme_stop_keep_alive(&ctrl->ctrl);
-
- if (ctrl->ctrl.queue_count > 1) {
- nvme_stop_queues(&ctrl->ctrl);
- nvme_rdma_stop_io_queues(ctrl);
- blk_mq_tagset_busy_iter(&ctrl->tag_set,
- nvme_cancel_request, &ctrl->ctrl);
- nvme_rdma_destroy_io_queues(ctrl, false);
- }
-
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
- nvme_rdma_stop_queue(&ctrl->queues[0]);
- blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
- nvme_cancel_request, &ctrl->ctrl);
- nvme_rdma_destroy_admin_queue(ctrl, false);
-
- /*
- * queues are not a live anymore, so restart the queues to fail fast
- * new IO
- */
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ nvme_rdma_teardown_io_queues(ctrl, false);
nvme_start_queues(&ctrl->ctrl);
+ nvme_rdma_teardown_admin_queue(ctrl, false);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
}
static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
- struct nvme_rdma_request *req, struct nvme_command *c)
+ struct nvme_rdma_request *req, struct nvme_command *c,
+ int count)
{
struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+ struct scatterlist *sgl = req->sg_table.sgl;
+ struct ib_sge *sge = &req->sge[1];
+ u32 len = 0;
+ int i;
- req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
- req->sge[1].length = sg_dma_len(req->sg_table.sgl);
- req->sge[1].lkey = queue->device->pd->local_dma_lkey;
+ for (i = 0; i < count; i++, sgl++, sge++) {
+ sge->addr = sg_dma_address(sgl);
+ sge->length = sg_dma_len(sgl);
+ sge->lkey = queue->device->pd->local_dma_lkey;
+ len += sge->length;
+ }
sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
- sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
+ sg->length = cpu_to_le32(len);
sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
- req->num_sge++;
+ req->num_sge += count;
return 0;
}
goto out_free_table;
}
- if (count == 1) {
+ if (count <= dev->num_inline_segments) {
if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
+ queue->ctrl->use_inline_data &&
blk_rq_payload_bytes(rq) <=
nvme_rdma_inline_data_size(queue)) {
- ret = nvme_rdma_map_sg_inline(queue, req, c);
+ ret = nvme_rdma_map_sg_inline(queue, req, c, count);
goto out;
}
- if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
+ if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
ret = nvme_rdma_map_sg_single(queue, req, c);
goto out;
}
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
nvme_rdma_destroy_queue_ib(queue);
+ /* fall through */
case RDMA_CM_EVENT_ADDR_ERROR:
dev_dbg(queue->ctrl->ctrl.device,
"CM error event %d\n", ev->event);
WARN_ON_ONCE(rq->tag < 0);
if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
- return nvmf_fail_nonready_command(rq);
+ return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
dev = queue->device->dev;
ib_dma_sync_single_for_cpu(dev, sqe->dma,
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
- if (ctrl->ctrl.queue_count > 1) {
- nvme_stop_queues(&ctrl->ctrl);
- nvme_rdma_stop_io_queues(ctrl);
- blk_mq_tagset_busy_iter(&ctrl->tag_set,
- nvme_cancel_request, &ctrl->ctrl);
- nvme_rdma_destroy_io_queues(ctrl, shutdown);
- }
-
+ nvme_rdma_teardown_io_queues(ctrl, shutdown);
if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl);
else
nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
-
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
- nvme_rdma_stop_queue(&ctrl->queues[0]);
- blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
- nvme_cancel_request, &ctrl->ctrl);
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
- nvme_rdma_destroy_admin_queue(ctrl, shutdown);
+ nvme_rdma_teardown_admin_queue(ctrl, shutdown);
}
static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
{
struct nvme_rdma_ctrl *ctrl =
container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
- int ret;
- bool changed;
nvme_stop_ctrl(&ctrl->ctrl);
nvme_rdma_shutdown_ctrl(ctrl, false);
return;
}
- ret = nvme_rdma_configure_admin_queue(ctrl, false);
- if (ret)
+ if (nvme_rdma_setup_ctrl(ctrl, false))
goto out_fail;
- if (ctrl->ctrl.queue_count > 1) {
- ret = nvme_rdma_configure_io_queues(ctrl, false);
- if (ret)
- goto out_fail;
- }
-
- changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- if (!changed) {
- /* state change failure is ok if we're in DELETING state */
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
- return;
- }
-
- nvme_start_ctrl(&ctrl->ctrl);
-
return;
out_fail:
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
WARN_ON_ONCE(!changed);
- ret = nvme_rdma_configure_admin_queue(ctrl, true);
+ ret = nvme_rdma_setup_ctrl(ctrl, true);
if (ret)
goto out_uninit_ctrl;
- /* sanity check icdoff */
- if (ctrl->ctrl.icdoff) {
- dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
- ret = -EINVAL;
- goto out_remove_admin_queue;
- }
-
- /* sanity check keyed sgls */
- if (!(ctrl->ctrl.sgls & (1 << 2))) {
- dev_err(ctrl->ctrl.device,
- "Mandatory keyed sgls are not supported!\n");
- ret = -EINVAL;
- goto out_remove_admin_queue;
- }
-
- /* only warn if argument is too large here, will clamp later */
- if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
- dev_warn(ctrl->ctrl.device,
- "queue_size %zu > ctrl sqsize %u, clamping down\n",
- opts->queue_size, ctrl->ctrl.sqsize + 1);
- }
-
- /* warn if maxcmd is lower than sqsize+1 */
- if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
- dev_warn(ctrl->ctrl.device,
- "sqsize %u > ctrl maxcmd %u, clamping down\n",
- ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
- ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
- }
-
- if (opts->nr_io_queues) {
- ret = nvme_rdma_configure_io_queues(ctrl, true);
- if (ret)
- goto out_remove_admin_queue;
- }
-
- changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- WARN_ON_ONCE(!changed);
-
dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
mutex_unlock(&nvme_rdma_ctrl_mutex);
- nvme_start_ctrl(&ctrl->ctrl);
-
return &ctrl->ctrl;
- out_remove_admin_queue:
- nvme_rdma_stop_queue(&ctrl->queues[0]);
- nvme_rdma_destroy_admin_queue(ctrl, true);
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
CONFIGFS_ATTR(nvmet_, addr_trsvcid);
+ /* configfs show handler: print the port's inline_data_size as "%d\n". */
+ static ssize_t nvmet_param_inline_data_size_show(struct config_item *item,
+ 		char *page)
+ {
+ 	return snprintf(page, PAGE_SIZE, "%d\n",
+ 			to_nvmet_port(item)->inline_data_size);
+ }
+
+ /*
+  * configfs store handler for a port's param_inline_data_size attribute.
+  * The write is refused with -EACCES while the port is enabled and with
+  * -EINVAL when the text does not parse as an int; otherwise the parsed
+  * value is stored in the port and @count is returned.
+  */
+ static ssize_t nvmet_param_inline_data_size_store(struct config_item *item,
+ 		const char *page, size_t count)
+ {
+ 	struct nvmet_port *port = to_nvmet_port(item);
+
+ 	if (port->enabled) {
+ 		pr_err("Cannot modify inline_data_size while port enabled\n");
+ 		pr_err("Disable the port before modifying\n");
+ 		return -EACCES;
+ 	}
+
+ 	if (kstrtoint(page, 0, &port->inline_data_size)) {
+ 		pr_err("Invalid value '%s' for inline_data_size\n", page);
+ 		return -EINVAL;
+ 	}
+
+ 	return count;
+ }
+
+ CONFIGFS_ATTR(nvmet_, param_inline_data_size);
+
static ssize_t nvmet_addr_trtype_show(struct config_item *item,
char *page)
{
{
struct nvmet_ns *ns = to_nvmet_ns(item);
struct nvmet_subsys *subsys = ns->subsys;
+ size_t len;
int ret;
mutex_lock(&subsys->lock);
if (ns->enabled)
goto out_unlock;
- kfree(ns->device_path);
+ ret = -EINVAL;
+ len = strcspn(page, "\n");
+ if (!len)
+ goto out_unlock;
+ kfree(ns->device_path);
ret = -ENOMEM;
- ns->device_path = kstrndup(page, strcspn(page, "\n"), GFP_KERNEL);
+ ns->device_path = kstrndup(page, len, GFP_KERNEL);
if (!ns->device_path)
goto out_unlock;
CONFIGFS_ATTR(nvmet_ns_, device_nguid);
+ /* configfs show handler: print the namespace's ANA group id as "%u\n". */
+ static ssize_t nvmet_ns_ana_grpid_show(struct config_item *item, char *page)
+ {
+ 	struct nvmet_ns *ns = to_nvmet_ns(item);
+
+ 	return sprintf(page, "%u\n", ns->anagrpid);
+ }
+
+ /*
+  * configfs store handler: move a namespace into another ANA group.
+  * The id must parse as a u32 in the range 1..NVMET_MAX_ANAGRPS
+  * (otherwise the parse error or -EINVAL is returned).  Under
+  * nvmet_ana_sem the per-group usage counters are adjusted, the
+  * namespace is retargeted and the ANA change count is bumped; an ANA
+  * event is then sent for the namespace's subsystem.
+  */
+ static ssize_t nvmet_ns_ana_grpid_store(struct config_item *item,
+ 		const char *page, size_t count)
+ {
+ 	struct nvmet_ns *ns = to_nvmet_ns(item);
+ 	u32 prev, grpid;
+ 	int err;
+
+ 	err = kstrtou32(page, 0, &grpid);
+ 	if (err)
+ 		return err;
+
+ 	if (grpid < 1 || grpid > NVMET_MAX_ANAGRPS)
+ 		return -EINVAL;
+
+ 	down_write(&nvmet_ana_sem);
+ 	prev = ns->anagrpid;
+ 	nvmet_ana_group_enabled[grpid]++;
+ 	ns->anagrpid = grpid;
+ 	nvmet_ana_group_enabled[prev]--;
+ 	nvmet_ana_chgcnt++;
+ 	up_write(&nvmet_ana_sem);
+
+ 	nvmet_send_ana_event(ns->subsys, NULL);
+ 	return count;
+ }
+
+ CONFIGFS_ATTR(nvmet_ns_, ana_grpid);
+
static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page)
{
return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled);
CONFIGFS_ATTR(nvmet_ns_, enable);
+ /* configfs show handler: print the namespace's buffered_io flag as "%d\n". */
+ static ssize_t nvmet_ns_buffered_io_show(struct config_item *item, char *page)
+ {
+ 	struct nvmet_ns *ns = to_nvmet_ns(item);
+
+ 	return sprintf(page, "%d\n", ns->buffered_io);
+ }
+
+ /*
+  * configfs store handler for the namespace's buffered_io flag.
+  * Returns -EINVAL when the input is not a boolean or when the namespace
+  * is currently enabled; otherwise stores the value under the subsystem
+  * lock and returns @count.
+  */
+ static ssize_t nvmet_ns_buffered_io_store(struct config_item *item,
+ 		const char *page, size_t count)
+ {
+ 	struct nvmet_ns *ns = to_nvmet_ns(item);
+ 	ssize_t ret = count;
+ 	bool enable;
+
+ 	if (strtobool(page, &enable))
+ 		return -EINVAL;
+
+ 	mutex_lock(&ns->subsys->lock);
+ 	if (ns->enabled) {
+ 		pr_err("disable ns before setting buffered_io value.\n");
+ 		ret = -EINVAL;
+ 	} else {
+ 		ns->buffered_io = enable;
+ 	}
+ 	mutex_unlock(&ns->subsys->lock);
+
+ 	return ret;
+ }
+
+ CONFIGFS_ATTR(nvmet_ns_, buffered_io);
+
static struct configfs_attribute *nvmet_ns_attrs[] = {
&nvmet_ns_attr_device_path,
&nvmet_ns_attr_device_nguid,
&nvmet_ns_attr_device_uuid,
+ &nvmet_ns_attr_ana_grpid,
&nvmet_ns_attr_enable,
+ &nvmet_ns_attr_buffered_io,
NULL,
};
.ct_group_ops = &nvmet_referral_group_ops,
};
+ /*
+  * Mapping between ANA states and the strings accepted and printed by
+  * the ana_state configfs attribute.
+  */
+ static struct {
+ 	enum nvme_ana_state	state;
+ 	const char		*name;
+ } nvmet_ana_state_names[] = {
+ 	{ .state = NVME_ANA_OPTIMIZED,		.name = "optimized" },
+ 	{ .state = NVME_ANA_NONOPTIMIZED,	.name = "non-optimized" },
+ 	{ .state = NVME_ANA_INACCESSIBLE,	.name = "inaccessible" },
+ 	{ .state = NVME_ANA_PERSISTENT_LOSS,	.name = "persistent-loss" },
+ 	{ .state = NVME_ANA_CHANGE,		.name = "change" },
+ };
+
+ /*
+  * configfs show handler: print the name of the group's current ANA
+  * state on its port, or an empty line when the state has no entry in
+  * nvmet_ana_state_names[].
+  */
+ static ssize_t nvmet_ana_group_ana_state_show(struct config_item *item,
+ 		char *page)
+ {
+ 	struct nvmet_ana_group *grp = to_ana_group(item);
+ 	enum nvme_ana_state cur = grp->port->ana_state[grp->grpid];
+ 	int idx;
+
+ 	for (idx = 0; idx < ARRAY_SIZE(nvmet_ana_state_names); idx++) {
+ 		if (nvmet_ana_state_names[idx].state == cur)
+ 			return sprintf(page, "%s\n",
+ 					nvmet_ana_state_names[idx].name);
+ 	}
+
+ 	return sprintf(page, "\n");
+ }
+
+ /*
+  * configfs store handler: set the group's ANA state on its port from
+  * one of the names in nvmet_ana_state_names[].  An unknown name is
+  * logged and rejected with -EINVAL.  On success the state and the ANA
+  * change count are updated under nvmet_ana_sem and an ANA event is
+  * sent for the port.
+  */
+ static ssize_t nvmet_ana_group_ana_state_store(struct config_item *item,
+ 		const char *page, size_t count)
+ {
+ 	struct nvmet_ana_group *grp = to_ana_group(item);
+ 	int idx;
+
+ 	for (idx = 0; idx < ARRAY_SIZE(nvmet_ana_state_names); idx++) {
+ 		if (sysfs_streq(page, nvmet_ana_state_names[idx].name))
+ 			break;
+ 	}
+
+ 	if (idx == ARRAY_SIZE(nvmet_ana_state_names)) {
+ 		pr_err("Invalid value '%s' for ana_state\n", page);
+ 		return -EINVAL;
+ 	}
+
+ 	down_write(&nvmet_ana_sem);
+ 	grp->port->ana_state[grp->grpid] = nvmet_ana_state_names[idx].state;
+ 	nvmet_ana_chgcnt++;
+ 	up_write(&nvmet_ana_sem);
+
+ 	nvmet_port_send_ana_event(grp->port);
+ 	return count;
+ }
+
+ CONFIGFS_ATTR(nvmet_ana_group_, ana_state);
+
+ /* NULL-terminated list of the attributes of an ANA group item. */
+ static struct configfs_attribute *nvmet_ana_group_attrs[] = {
+ &nvmet_ana_group_attr_ana_state,
+ NULL,
+ };
+
+ /*
+  * configfs release callback for an ANA group item.  The port's
+  * built-in default group is embedded in the port and is left alone.
+  * For any other group the port's state for that group id is reset to
+  * inaccessible and the group's usage counter dropped (both under
+  * nvmet_ana_sem), an ANA event is sent for the port and the group is
+  * freed.
+  */
+ static void nvmet_ana_group_release(struct config_item *item)
+ {
+ 	struct nvmet_ana_group *grp = to_ana_group(item);
+ 	struct nvmet_port *port = grp->port;
+
+ 	if (grp == &port->ana_default_group)
+ 		return;
+
+ 	down_write(&nvmet_ana_sem);
+ 	port->ana_state[grp->grpid] = NVME_ANA_INACCESSIBLE;
+ 	nvmet_ana_group_enabled[grp->grpid]--;
+ 	up_write(&nvmet_ana_sem);
+
+ 	nvmet_port_send_ana_event(port);
+ 	kfree(grp);
+ }
+
+ /* Item operations of an ANA group: only a release callback. */
+ static struct configfs_item_operations nvmet_ana_group_item_ops = {
+ .release = nvmet_ana_group_release,
+ };
+
+ /* Item type of one ANA group: ties its attributes to the release op. */
+ static const struct config_item_type nvmet_ana_group_type = {
+ .ct_item_ops = &nvmet_ana_group_item_ops,
+ .ct_attrs = nvmet_ana_group_attrs,
+ .ct_owner = THIS_MODULE,
+ };
+
+ /*
+  * configfs make_group callback for a port's ANA groups directory.
+  * @name must parse as a group id in the range 2..NVMET_MAX_ANAGRPS
+  * (id 1 is rejected here, presumably because it is the port's built-in
+  * default group -- confirm against NVMET_DEFAULT_ANA_GRPID).
+  *
+  * Returns the new group, or an ERR_PTR() carrying the parse error,
+  * -EINVAL for an out-of-range id, or -ENOMEM.
+  */
+ static struct config_group *nvmet_ana_groups_make_group(
+ 		struct config_group *group, const char *name)
+ {
+ 	struct nvmet_port *port = ana_groups_to_port(&group->cg_item);
+ 	struct nvmet_ana_group *grp;
+ 	u32 grpid;
+ 	int err;
+
+ 	err = kstrtou32(name, 0, &grpid);
+ 	if (err)
+ 		return ERR_PTR(err);
+
+ 	if (grpid <= 1 || grpid > NVMET_MAX_ANAGRPS)
+ 		return ERR_PTR(-EINVAL);
+
+ 	grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+ 	if (!grp)
+ 		return ERR_PTR(-ENOMEM);
+
+ 	grp->port = port;
+ 	grp->grpid = grpid;
+
+ 	/* account for the new group before announcing the change */
+ 	down_write(&nvmet_ana_sem);
+ 	nvmet_ana_group_enabled[grpid]++;
+ 	up_write(&nvmet_ana_sem);
+
+ 	nvmet_port_send_ana_event(grp->port);
+
+ 	config_group_init_type_name(&grp->group, name, &nvmet_ana_group_type);
+ 	return &grp->group;
+ }
+
+ /* Group operations of the "ana_groups" directory: creating ANA groups. */
+ static struct configfs_group_operations nvmet_ana_groups_group_ops = {
+ .make_group = nvmet_ana_groups_make_group,
+ };
+
+ /* Item type of the per-port "ana_groups" directory. */
+ static const struct config_item_type nvmet_ana_groups_type = {
+ .ct_group_ops = &nvmet_ana_groups_group_ops,
+ .ct_owner = THIS_MODULE,
+ };
+
/*
* Ports definitions.
*/
{
struct nvmet_port *port = to_nvmet_port(item);
+ kfree(port->ana_state);
kfree(port);
}
&nvmet_attr_addr_traddr,
&nvmet_attr_addr_trsvcid,
&nvmet_attr_addr_trtype,
+ &nvmet_attr_param_inline_data_size,
NULL,
};
{
struct nvmet_port *port;
u16 portid;
+ u32 i;
if (kstrtou16(name, 0, &portid))
return ERR_PTR(-EINVAL);
if (!port)
return ERR_PTR(-ENOMEM);
+ port->ana_state = kcalloc(NVMET_MAX_ANAGRPS + 1,
+ sizeof(*port->ana_state), GFP_KERNEL);
+ if (!port->ana_state) {
+ kfree(port);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) {
+ if (i == NVMET_DEFAULT_ANA_GRPID)
+ port->ana_state[1] = NVME_ANA_OPTIMIZED;
+ else
+ port->ana_state[i] = NVME_ANA_INACCESSIBLE;
+ }
+
INIT_LIST_HEAD(&port->entry);
INIT_LIST_HEAD(&port->subsystems);
INIT_LIST_HEAD(&port->referrals);
+ port->inline_data_size = -1; /* < 0 == let the transport choose */
port->disc_addr.portid = cpu_to_le16(portid);
config_group_init_type_name(&port->group, name, &nvmet_port_type);
"referrals", &nvmet_referrals_type);
configfs_add_default_group(&port->referrals_group, &port->group);
+ config_group_init_type_name(&port->ana_groups_group,
+ "ana_groups", &nvmet_ana_groups_type);
+ configfs_add_default_group(&port->ana_groups_group, &port->group);
+
+ port->ana_default_group.port = port;
+ port->ana_default_group.grpid = NVMET_DEFAULT_ANA_GRPID;
+ config_group_init_type_name(&port->ana_default_group.group,
+ __stringify(NVMET_DEFAULT_ANA_GRPID),
+ &nvmet_ana_group_type);
+ configfs_add_default_group(&port->ana_default_group.group,
+ &port->ana_groups_group);
+
return &port->group;
}
#include "nvmet.h"
+ struct workqueue_struct *buffered_io_wq;
static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
static DEFINE_IDA(cntlid_ida);
*/
DECLARE_RWSEM(nvmet_config_sem);
+ /*
+ * Global ANA bookkeeping.  nvmet_ana_group_enabled[] is indexed directly
+ * by ANA group id (hence the + 1) and holds a per-group usage count;
+ * nvmet_ana_chgcnt counts ANA changes.  Writers update both while holding
+ * nvmet_ana_sem for writing.
+ */
+ u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
+ u64 nvmet_ana_chgcnt;
+ DECLARE_RWSEM(nvmet_ana_sem);
+
static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
const char *subsysnqn);
mutex_unlock(&ctrl->lock);
}
- static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
+ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
{
struct nvmet_ctrl *ctrl;
}
}
+ /*
+  * Queue an ANA-change notice AEN on the controllers of @subsys.  When
+  * @port is non-NULL only controllers attached to that port are
+  * considered; controllers for which nvmet_aen_disabled() reports the
+  * ANA change event as disabled are skipped.  The controller list is
+  * walked under subsys->lock.
+  */
+ void nvmet_send_ana_event(struct nvmet_subsys *subsys,
+ 		struct nvmet_port *port)
+ {
+ 	struct nvmet_ctrl *ctrl;
+
+ 	mutex_lock(&subsys->lock);
+ 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ 		bool port_matches = !port || ctrl->port == port;
+
+ 		/* the AEN check is only evaluated for matching ports */
+ 		if (port_matches &&
+ 		    !nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE)) {
+ 			nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+ 					NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
+ 		}
+ 	}
+ 	mutex_unlock(&subsys->lock);
+ }
+
+ /*
+  * Send an ANA event for @port to every subsystem linked to it.  The
+  * port's subsystem list is walked with nvmet_config_sem held for
+  * reading.
+  */
+ void nvmet_port_send_ana_event(struct nvmet_port *port)
+ {
+ 	struct nvmet_subsys_link *link;
+
+ 	down_read(&nvmet_config_sem);
+ 	list_for_each_entry(link, &port->subsystems, entry) {
+ 		nvmet_send_ana_event(link->subsys, port);
+ 	}
+ 	up_read(&nvmet_config_sem);
+ }
+
int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
{
int ret = 0;
return ret;
}
+ /* If the transport didn't set inline_data_size, then disable it. */
+ if (port->inline_data_size < 0)
+ port->inline_data_size = 0;
+
port->enabled = true;
return 0;
}
int nvmet_ns_enable(struct nvmet_ns *ns)
{
struct nvmet_subsys *subsys = ns->subsys;
- int ret = 0;
+ int ret;
mutex_lock(&subsys->lock);
+ ret = -EMFILE;
+ if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
+ goto out_unlock;
+ ret = 0;
if (ns->enabled)
goto out_unlock;
ret = nvmet_bdev_ns_enable(ns);
- if (ret)
+ if (ret == -ENOTBLK)
ret = nvmet_file_ns_enable(ns);
if (ret)
goto out_unlock;
list_add_tail_rcu(&ns->dev_link, &old->dev_link);
}
+ subsys->nr_namespaces++;
nvmet_ns_changed(subsys, ns->nsid);
ns->enabled = true;
percpu_ref_exit(&ns->ref);
mutex_lock(&subsys->lock);
+ subsys->nr_namespaces--;
nvmet_ns_changed(subsys, ns->nsid);
nvmet_ns_dev_disable(ns);
out_unlock:
{
nvmet_ns_disable(ns);
+ down_write(&nvmet_ana_sem);
+ nvmet_ana_group_enabled[ns->anagrpid]--;
+ up_write(&nvmet_ana_sem);
+
kfree(ns->device_path);
kfree(ns);
}
ns->nsid = nsid;
ns->subsys = subsys;
+
+ down_write(&nvmet_ana_sem);
+ ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
+ nvmet_ana_group_enabled[ns->anagrpid]++;
+ up_write(&nvmet_ana_sem);
+
uuid_gen(&ns->uuid);
+ ns->buffered_io = false;
return ns;
}
}
EXPORT_SYMBOL_GPL(nvmet_sq_init);
+ /*
+  * Translate the ANA state of @ns's group on @port into an NVMe status
+  * code.  Returns 0 when the state does not block I/O, otherwise the
+  * status matching the inaccessible, persistent-loss or change state.
+  */
+ static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
+ 		struct nvmet_ns *ns)
+ {
+ 	enum nvme_ana_state state = port->ana_state[ns->anagrpid];
+
+ 	switch (state) {
+ 	case NVME_ANA_INACCESSIBLE:
+ 		return NVME_SC_ANA_INACCESSIBLE;
+ 	case NVME_ANA_PERSISTENT_LOSS:
+ 		return NVME_SC_ANA_PERSISTENT_LOSS;
+ 	case NVME_ANA_CHANGE:
+ 		return NVME_SC_ANA_TRANSITION;
+ 	default:
+ 		return 0;
+ 	}
+ }
+
+ /*
+  * Check an I/O command against the namespace's read-only flag.  On a
+  * read-only namespace only read and flush pass; every other opcode gets
+  * NVME_SC_NS_WRITE_PROTECTED.  Returns 0 when the command is allowed.
+  */
+ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
+ {
+ 	if (!unlikely(req->ns->readonly))
+ 		return 0;
+
+ 	switch (req->cmd->common.opcode) {
+ 	case nvme_cmd_read:
+ 	case nvme_cmd_flush:
+ 		return 0;
+ 	default:
+ 		return NVME_SC_NS_WRITE_PROTECTED;
+ 	}
+ }
+
static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
if (unlikely(!req->ns))
return NVME_SC_INVALID_NS | NVME_SC_DNR;
+ ret = nvmet_check_ana_state(req->port, req->ns);
+ if (unlikely(ret))
+ return ret;
+ ret = nvmet_io_cmd_check_access(req);
+ if (unlikely(ret))
+ return ret;
if (req->ns->file)
return nvmet_file_parse_io_cmd(req);
nvmet_init_cap(ctrl);
+ ctrl->port = req->port;
+
INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
INIT_LIST_HEAD(&ctrl->async_events);
{
int error;
+ nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
+
+ buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
+ WQ_MEM_RECLAIM, 0);
+ if (!buffered_io_wq) {
+ error = -ENOMEM;
+ goto out;
+ }
+
error = nvmet_init_discovery();
if (error)
goto out;
nvmet_exit_configfs();
nvmet_exit_discovery();
ida_destroy(&cntlid_ida);
+ destroy_workqueue(buffered_io_wq);
BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
blk_status_t ret;
if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready))
- return nvmf_fail_nonready_command(req);
+ return nvmf_fail_nonready_command(&queue->ctrl->ctrl, req);
ret = nvme_setup_cmd(ns, req, &iod->cmd);
if (ret)
{
struct nvme_loop_ctrl *ctrl = set->driver_data;
+ nvme_req(req)->ctrl = &ctrl->ctrl;
return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req),
(set == &ctrl->tag_set) ? hctx_idx + 1 : 0);
}
ret = bio_iov_iter_get_pages(&bio, iter);
if (unlikely(ret))
- return ret;
+ goto out;
ret = bio.bi_iter.bi_size;
if (iov_iter_rw(iter) == READ) {
put_page(bvec->bv_page);
}
- if (vecs != inline_vecs)
- kfree(vecs);
-
if (unlikely(bio.bi_status))
ret = blk_status_to_errno(bio.bi_status);
+out:
+ if (vecs != inline_vecs)
+ kfree(vecs);
+
bio_uninit(&bio);
return ret;
result = blk_queue_enter(bdev->bd_queue, 0);
if (result)
return result;
- result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+ REQ_OP_READ);
blk_queue_exit(bdev->bd_queue);
return result;
}
return result;
set_page_writeback(page);
- result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+ REQ_OP_WRITE);
if (result) {
end_page_writeback(page);
} else {
bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}
+/*
+ * Store the current wall-clock time (in seconds) into a split timestamp:
+ * the low 32 bits go to @lo as little-endian, the next 8 bits to @hi.
+ * The value is clamped to [0, 2^40 - 1] first so that it fits.
+ */
+static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
+{
+	time64_t secs = ktime_get_real_seconds();
+
+	secs = clamp_val(secs, 0, (1ull << 40) - 1);
+
+	*hi = upper_32_bits(secs);
+	*lo = cpu_to_le32(lower_32_bits(secs));
+}
+
+/*
+ * Reassemble a split timestamp: @hi supplies bits 32 and up, @lo the
+ * little-endian low 32 bits.
+ */
+static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
+{
+	time64_t high_part = (time64_t)(*hi) << 32;
+
+	return high_part + le32_to_cpu(*lo);
+}
+/*
+ * Convenience wrappers: for a superblock field named @tstamp, operate on
+ * the pair (es)->tstamp and (es)->tstamp_hi.
+ */
+#define ext4_update_tstamp(es, tstamp) \
+ __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
+#define ext4_get_tstamp(es, tstamp) \
+ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
static void __save_error_info(struct super_block *sb, const char *func,
unsigned int line)
if (bdev_read_only(sb->s_bdev))
return;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- es->s_last_error_time = cpu_to_le32(get_seconds());
+ ext4_update_tstamp(es, s_last_error_time);
strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
es->s_last_error_line = cpu_to_le32(line);
if (!es->s_first_error_time) {
es->s_first_error_time = es->s_last_error_time;
+ es->s_first_error_time_hi = es->s_last_error_time_hi;
strncpy(es->s_first_error_func, func,
sizeof(es->s_first_error_func));
es->s_first_error_line = cpu_to_le32(line);
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+ int ret;
- if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) &&
- !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) {
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
- &grp->bb_state);
+ if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
+ ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+ &grp->bb_state);
+ if (!ret)
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
}
- if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) &&
- !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- if (gdp) {
+ if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
+ ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
+ &grp->bb_state);
+ if (!ret && gdp) {
int count;
count = ext4_free_inodes_count(sb, gdp);
percpu_counter_sub(&sbi->s_freeinodes_counter,
count);
}
- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
- &grp->bb_state);
}
}
"warning: maximal mount count reached, "
"running e2fsck is recommended");
else if (le32_to_cpu(es->s_checkinterval) &&
- (le32_to_cpu(es->s_lastcheck) +
- le32_to_cpu(es->s_checkinterval) <= get_seconds()))
+ (ext4_get_tstamp(es, s_lastcheck) +
+ le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
ext4_msg(sb, KERN_WARNING,
"warning: checktime reached, "
"running e2fsck is recommended");
if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
- es->s_mtime = cpu_to_le32(get_seconds());
+ ext4_update_tstamp(es, s_mtime);
ext4_update_dynamic_rev(sb);
if (sbi->s_journal)
ext4_set_feature_journal_needs_recovery(sb);
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
ext4_fsblk_t last_block;
- ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0) + 1;
+ ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
ext4_fsblk_t block_bitmap;
ext4_fsblk_t inode_bitmap;
ext4_fsblk_t inode_table;
ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
le32_to_cpu(es->s_error_count));
if (es->s_first_error_time) {
- printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
- sb->s_id, le32_to_cpu(es->s_first_error_time),
+ printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
+ sb->s_id,
+ ext4_get_tstamp(es, s_first_error_time),
(int) sizeof(es->s_first_error_func),
es->s_first_error_func,
le32_to_cpu(es->s_first_error_line));
printk(KERN_CONT "\n");
}
if (es->s_last_error_time) {
- printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
- sb->s_id, le32_to_cpu(es->s_last_error_time),
+ printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
+ sb->s_id,
+ ext4_get_tstamp(es, s_last_error_time),
(int) sizeof(es->s_last_error_func),
es->s_last_error_func,
le32_to_cpu(es->s_last_error_line));
if (!gdp)
continue;
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
- continue;
- if (group != 0)
+ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
break;
- ext4_error(sb, "Inode table for bg 0 marked as "
- "needing zeroing");
- if (sb_rdonly(sb))
- return ngroups;
}
return group;
sbi->s_sb_block = sb_block;
if (sb->s_bdev->bd_part)
sbi->s_sectors_written_start =
- part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+ part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
/* Cleanup superblock name */
strreplace(sb->s_id, '/', '!');
goto failed_mount2;
}
}
+ sbi->s_gdb_count = db_count;
if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
ret = -EFSCORRUPTED;
goto failed_mount2;
}
- sbi->s_gdb_count = db_count;
-
timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
/* Register extent status tree shrinker */
* to complain and force a full file system check.
*/
if (!(sb->s_flags & SB_RDONLY))
- es->s_wtime = cpu_to_le32(get_seconds());
+ ext4_update_tstamp(es, s_wtime);
if (sb->s_bdev->bd_part)
es->s_kbytes_written =
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ ((part_stat_read(sb->s_bdev->bd_part,
+ sectors[STAT_WRITE]) -
EXT4_SB(sb)->s_sectors_written_start) >> 1));
else
es->s_kbytes_written =
#endif
char *orig_data = kstrdup(data, GFP_KERNEL);
+ if (data && !orig_data)
+ return -ENOMEM;
+
/* Store the original options */
old_sb_flags = sb->s_flags;
old_opts.s_mount_opt = sbi->s_mount_opt;
if (sbi->s_journal)
ext4_mark_recovery_complete(sb, es);
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
} else {
/* Make sure we can mount this feature set readwrite */
if (ext4_has_feature_readonly(sb) ||
DQUOT_USAGE_ENABLED |
(quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
if (err) {
- for (type--; type >= 0; type--)
- dquot_quota_off(sb, type);
-
ext4_warning(sb,
"Failed to enable quota tracking "
"(type=%d, err=%d). Please run "
"e2fsck to fix.", type, err);
+ for (type--; type >= 0; type--)
+ dquot_quota_off(sb, type);
+
return err;
}
}
attr_reserved_clusters,
attr_inode_readahead,
attr_trigger_test_error,
+ attr_first_error_time,
+ attr_last_error_time,
attr_feature,
attr_pointer_ui,
attr_pointer_atomic,
if (!sb->s_bdev->bd_part)
return snprintf(buf, PAGE_SIZE, "0\n");
return snprintf(buf, PAGE_SIZE, "%lu\n",
- (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ (part_stat_read(sb->s_bdev->bd_part,
+ sectors[STAT_WRITE]) -
sbi->s_sectors_written_start) >> 1);
}
return snprintf(buf, PAGE_SIZE, "0\n");
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)(sbi->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ ((part_stat_read(sb->s_bdev->bd_part,
+ sectors[STAT_WRITE]) -
EXT4_SB(sb)->s_sectors_written_start) >> 1)));
}
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
-EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
-EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
+EXT4_ATTR(first_error_time, 0444, first_error_time);
+EXT4_ATTR(last_error_time, 0444, last_error_time);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
return NULL;
}
+/*
+ * Format a split ext4 timestamp for a sysfs show() method.
+ *
+ * @buf: output buffer; at most PAGE_SIZE bytes are written
+ * @lo:  low 32 bits of the timestamp, little-endian
+ * @hi:  upper 8 bits of the timestamp
+ *
+ * The two halves are combined into one time64_t and printed as a signed
+ * decimal followed by a newline, so the output is terminated the same way
+ * as the other attributes printed by ext4_attr_show().
+ *
+ * Returns the snprintf() result.
+ */
+static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
+{
+	return snprintf(buf, PAGE_SIZE, "%lld\n",
+			((time64_t)hi << 32) + le32_to_cpu(lo));
+}
+
+/*
+ * Print field @tstamp of @es together with its companion "<tstamp>_hi"
+ * field, which supplies the upper bits.
+ */
+#define print_tstamp(buf, es, tstamp) \
+	__print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi)
+
static ssize_t ext4_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
case attr_pointer_ui:
if (!ptr)
return 0;
- return snprintf(buf, PAGE_SIZE, "%u\n",
- *((unsigned int *) ptr));
+ if (a->attr_ptr == ptr_ext4_super_block_offset)
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ le32_to_cpup(ptr));
+ else
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ *((unsigned int *) ptr));
case attr_pointer_atomic:
if (!ptr)
return 0;
atomic_read((atomic_t *) ptr));
case attr_feature:
return snprintf(buf, PAGE_SIZE, "supported\n");
+ case attr_first_error_time:
+ return print_tstamp(buf, sbi->s_es, s_first_error_time);
+ case attr_last_error_time:
+ return print_tstamp(buf, sbi->s_es, s_last_error_time);
}
return 0;
ret = kstrtoul(skip_spaces(buf), 0, &t);
if (ret)
return ret;
- *((unsigned int *) ptr) = t;
+ if (a->attr_ptr == ptr_ext4_super_block_offset)
+ *((__le32 *) ptr) = cpu_to_le32(t);
+ else
+ *((unsigned int *) ptr) = t;
return len;
case attr_inode_readahead:
return inode_readahead_blks_store(sbi, buf, len);
struct sbitmap ctx_map;
struct blk_mq_ctx *dispatch_from;
+ unsigned int dispatch_busy;
- struct blk_mq_ctx **ctxs;
unsigned int nr_ctx;
+ struct blk_mq_ctx **ctxs;
+ spinlock_t dispatch_wait_lock;
wait_queue_entry_t dispatch_wait;
atomic_t wait_index;
void blk_mq_quiesce_queue_nowait(struct request_queue *q);
+/**
+ * blk_mq_mark_complete() - Set request state to complete
+ * @rq: request to set to complete state
+ *
+ * Uses cmpxchg() to move @rq from MQ_RQ_IN_FLIGHT to MQ_RQ_COMPLETE.
+ *
+ * Returns true if this call performed the transition. In that case the
+ * caller is responsible for seeing that the request is ended, because
+ * blk_mq_complete_request will not work again. Returns false if the
+ * request was not in MQ_RQ_IN_FLIGHT state.
+ */
+static inline bool blk_mq_mark_complete(struct request *rq)
+{
+	if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) !=
+	    MQ_RQ_IN_FLIGHT)
+		return false;
+
+	return true;
+}
+
/*
* Driver command data is immediately after the request. So subtract request
* size to get back to the original request, add request size to get the PDU.
* need_sleep = false;
* wake_up_state(p, TASK_UNINTERRUPTIBLE);
*
- * Where wake_up_state() (and all other wakeup primitives) imply enough
- * barriers to order the store of the variable against wakeup.
+ * where wake_up_state() executes a full memory barrier before accessing the
+ * task state.
*
* Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
* once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
/* disallow userland-initiated cgroup migration */
unsigned no_cgroup_migration:1;
#endif
+ #ifdef CONFIG_BLK_CGROUP
+ /* to be used once the psi infrastructure lands upstream. */
+ unsigned use_memdelay:1;
+ #endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
u64 last_sum_exec_runtime;
struct callback_head numa_work;
- struct list_head numa_entry;
struct numa_group *numa_group;
/*
unsigned int memcg_nr_pages_over_high;
#endif
+ #ifdef CONFIG_BLK_CGROUP
+ struct request_queue *throttle_queue;
+ #endif
+
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
{
struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
- if (vma) {
- vma->vm_mm = mm;
- INIT_LIST_HEAD(&vma->anon_vma_chain);
- }
+ if (vma)
+ vma_init(vma, mm);
return vma;
}
tsk->fail_nth = 0;
#endif
+ #ifdef CONFIG_BLK_CGROUP
+ tsk->throttle_queue = NULL;
+ tsk->use_memdelay = 0;
+ #endif
+
return tsk;
free_stack:
void __init proc_caches_init(void)
{
+ unsigned int mm_size;
+
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
+
/*
- * FIXME! The "sizeof(struct mm_struct)" currently includes the
- * whole struct cpumask for the OFFSTACK case. We could change
- * this to *only* allocate as much of it as required by the
- * maximum number of CPU's we can ever have. The cpumask_allocation
- * is at the end of the structure, exactly for that reason.
+ * The mm_cpumask is located at the end of mm_struct, and is
+ * dynamically sized based on the maximum CPU number this system
+ * can have, taking hotplug into account (nr_cpu_ids).
*/
+ mm_size = sizeof(struct mm_struct) + cpumask_size();
+
mm_cachep = kmem_cache_create_usercopy("mm_struct",
- sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+ mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
offsetof(struct mm_struct, saved_auxv),
sizeof_field(struct mm_struct, saved_auxv),
static DEFINE_IDR(mem_cgroup_idr);
+/*
+ * Drop @memcg's entry from mem_cgroup_idr and reset its id to 0.
+ * Does nothing when the id is not positive, so repeated calls on the
+ * same memcg only remove the entry once.
+ */
+static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
+{
+	if (memcg->id.id <= 0)
+		return;
+
+	idr_remove(&mem_cgroup_idr, memcg->id.id);
+	memcg->id.id = 0;
+}
+
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
{
VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
if (atomic_sub_and_test(n, &memcg->id.ref)) {
- idr_remove(&mem_cgroup_idr, memcg->id.id);
- memcg->id.id = 0;
+ mem_cgroup_id_remove(memcg);
/* Memcg ID pins CSS */
css_put(&memcg->css);
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
fail:
- if (memcg->id.id > 0)
- idr_remove(&mem_cgroup_idr, memcg->id.id);
+ mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
return NULL;
}
return &memcg->css;
fail:
+ mem_cgroup_id_remove(memcg);
mem_cgroup_free(memcg);
return ERR_PTR(-ENOMEM);
}
return ret;
}
+ /*
+  * Charge @page via mem_cgroup_try_charge(), then call
+  * mem_cgroup_throttle_swaprate() for the memcg reported through @memcgp
+  * and the node @page lives on.  The throttle call is made whether or not
+  * the charge succeeded; the charge result is returned unchanged.
+  */
+ int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+ 			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ 			  bool compound)
+ {
+ 	int charge_ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp,
+ 					       compound);
+
+ 	mem_cgroup_throttle_swaprate(*memcgp, page_to_nid(page), gfp_mask);
+ 	return charge_ret;
+ }
+
/**
* mem_cgroup_commit_charge - commit a page charge
* @page: page to charge
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-/*
- * See the comment near struct mmu_table_batch.
- */
-
static void tlb_remove_table_smp_sync(void *arg)
{
- /* Simply deliver the interrupt */
+ struct mm_struct __maybe_unused *mm = arg;
+ /*
+ * On most architectures this does nothing. Simply delivering the
+ * interrupt is enough to prevent races with software page table
+ * walking like that done in get_user_pages_fast.
+ *
+ * See the comment near struct mmu_table_batch.
+ */
+ tlb_flush_remove_tables_local(mm);
}
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
* It is however sufficient for software page-table walkers that rely on
* IRQ disabling. See the comment near struct mmu_table_batch.
*/
- smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+ smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
__tlb_remove_table(table);
}
{
struct mmu_table_batch **batch = &tlb->batch;
+ tlb_flush_remove_tables(tlb->mm);
+
if (*batch) {
call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
*batch = NULL;
if (*batch == NULL) {
*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (*batch == NULL) {
- tlb_remove_table_one(table);
+ tlb_remove_table_one(table, tlb);
return;
}
(*batch)->nr = 0;
do {
next = pmd_addr_end(addr, end);
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
- if (next - addr != HPAGE_PMD_SIZE) {
- VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
- !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+ if (next - addr != HPAGE_PMD_SIZE)
__split_huge_pmd(vma, pmd, addr, false, NULL);
- } else if (zap_huge_pmd(tlb, vma, pmd, addr))
+ else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
}
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
+ if (!pfn_modify_allowed(pfn, pgprot))
+ return -EACCES;
+
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
track_pfn_insert(vma, &pgprot, pfn);
+ if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+ return -EACCES;
+
/*
* If we don't have pte special, then we have to use the pfn_valid()
* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
{
pte_t *pte;
spinlock_t *ptl;
+ int err = 0;
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
arch_enter_lazy_mmu_mode();
do {
BUG_ON(!pte_none(*pte));
+ if (!pfn_modify_allowed(pfn, prot)) {
+ err = -EACCES;
+ break;
+ }
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
- return 0;
+ return err;
}
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
{
pmd_t *pmd;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
pmd = pmd_alloc(mm, pud, addr);
VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
- if (remap_pte_range(mm, pmd, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pte_range(mm, pmd, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (pmd++, addr = next, addr != end);
return 0;
}
{
pud_t *pud;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
pud = pud_alloc(mm, p4d, addr);
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (remap_pmd_range(mm, pud, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pmd_range(mm, pud, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (pud++, addr = next, addr != end);
return 0;
}
{
p4d_t *p4d;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
p4d = p4d_alloc(mm, pgd, addr);
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (remap_pud_range(mm, p4d, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pud_range(mm, p4d, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (p4d++, addr = next, addr != end);
return 0;
}
cow_user_page(new_page, old_page, vmf->address, vma);
}
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_new;
__SetPageUptodate(new_page);
goto out_page;
}
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false)) {
ret = VM_FAULT_OOM;
goto out_page;
}
if (!page)
goto oom;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
+ false))
goto oom_free_page;
/*
if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+ if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
&vmf->memcg, false)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
+ if (!maddr)
+ return -ENOMEM;
+
if (write)
memcpy_toio(maddr + offset, buf, len);
else
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
- false);
+ error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
+ &memcg, false);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
{
/* Create a pseudo vma that just contains the policy */
memset(vma, 0, sizeof(*vma));
+ vma_init(vma, NULL);
/* Bias interleave by inode number to distribute better across nodes */
vma->vm_pgoff = index + info->vfs_inode.i_ino;
vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
goto failed;
}
- error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
PageTransHuge(page));
if (error)
goto unacct;
__SetPageSwapBacked(page);
__SetPageUptodate(page);
- ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+ ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
if (ret)
goto out_release;
/* common code */
-static const struct dentry_operations anon_ops = {
- .d_dname = simple_dname
-};
-
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
unsigned long flags, unsigned int i_flags)
{
- struct file *res;
struct inode *inode;
- struct path path;
- struct super_block *sb;
- struct qstr this;
+ struct file *res;
if (IS_ERR(mnt))
return ERR_CAST(mnt);
if (shmem_acct_size(flags, size))
return ERR_PTR(-ENOMEM);
- res = ERR_PTR(-ENOMEM);
- this.name = name;
- this.len = strlen(name);
- this.hash = 0; /* will go */
- sb = mnt->mnt_sb;
- path.mnt = mntget(mnt);
- path.dentry = d_alloc_pseudo(sb, &this);
- if (!path.dentry)
- goto put_memory;
- d_set_d_op(path.dentry, &anon_ops);
-
- res = ERR_PTR(-ENOSPC);
- inode = shmem_get_inode(sb, NULL, S_IFREG | 0777, 0, flags);
- if (!inode)
- goto put_memory;
-
+ inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
+ flags);
+ if (unlikely(!inode)) {
+ shmem_unacct_size(flags, size);
+ return ERR_PTR(-ENOSPC);
+ }
inode->i_flags |= i_flags;
- d_instantiate(path.dentry, inode);
inode->i_size = size;
clear_nlink(inode); /* It is unlinked */
res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
+ if (!IS_ERR(res))
+ res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
+ &shmem_file_operations);
if (IS_ERR(res))
- goto put_path;
-
- res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
- &shmem_file_operations);
- if (IS_ERR(res))
- goto put_path;
-
- return res;
-
-put_memory:
- shmem_unacct_size(flags, size);
-put_path:
- path_put(&path);
+ iput(inode);
return res;
}
return 0;
}
+
+/*
+ * Work out how many pages a single swap device may hold.
+ *
+ * Two things cap the usable swap offset:
+ *   1) the bits available for the offset in the swp_entry_t type, and
+ *   2) the bits the architecture's swap pte format can hold.
+ *
+ * To find the widest offset that survives both, build a swap entry of
+ * type 0 with every offset bit set (~0UL), encode it as a swap pte,
+ * decode it back into a swp_entry_t and read the offset out again.  Any
+ * bit that either representation cannot store is masked off on the way,
+ * leaving the largest offset that can be encoded.
+ */
+unsigned long generic_max_swapfile_size(void)
+{
+	swp_entry_t all_ones = swp_entry(0, ~0UL);
+	pte_t as_pte = swp_entry_to_pte(all_ones);
+	swp_entry_t round_trip = pte_to_swp_entry(as_pte);
+
+	return swp_offset(round_trip) + 1;
+}
+
+/* Weak default; an architecture can override it to add further checks. */
+__weak unsigned long max_swapfile_size(void)
+{
+	return generic_max_swapfile_size();
+}
+
static unsigned long read_swap_header(struct swap_info_struct *p,
union swap_header *swap_header,
struct inode *inode)
p->cluster_next = 1;
p->cluster_nr = 0;
- /*
- * Find out how many pages are allowed for a single swap
- * device. There are two limiting factors: 1) the number
- * of bits for the swap offset in the swp_entry_t type, and
- * 2) the number of bits in the swap pte as defined by the
- * different architectures. In order to find the
- * largest possible bit mask, a swap entry with swap type 0
- * and swap offset ~0UL is created, encoded to a swap pte,
- * decoded to a swp_entry_t again, and finally the swap
- * offset is extracted. This will mask all the bits from
- * the initial ~0UL mask that can't be encoded in either
- * the swp_entry_t or the architecture definition of a
- * swap pte.
- */
- maxpages = swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ maxpages = max_swapfile_size();
last_page = swap_header->info.last_page;
if (!last_page) {
pr_warn("Empty swap-file\n");
}
}
+ #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+ /*
+  * Call blkcg_schedule_throttle() on the request queue of the first swap
+  * device with a block device found on @node's available-swap list.
+  *
+  * Returns early without doing anything when @gfp_mask lacks __GFP_IO,
+  * @memcg is NULL, blk_cgroup_congested() reports no congestion, or
+  * current->throttle_queue is already set.
+  */
+ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+ 				  gfp_t gfp_mask)
+ {
+ 	struct swap_info_struct *cur, *nxt;
+
+ 	if (!(gfp_mask & __GFP_IO))
+ 		return;
+ 	if (!memcg)
+ 		return;
+ 	if (!blk_cgroup_congested())
+ 		return;
+
+ 	/*
+ 	 * A throttle has been scheduled already, so there is no need to
+ 	 * take the global swap lock.
+ 	 */
+ 	if (current->throttle_queue)
+ 		return;
+
+ 	spin_lock(&swap_avail_lock);
+ 	plist_for_each_entry_safe(cur, nxt, &swap_avail_heads[node],
+ 				  avail_lists[node]) {
+ 		/* Skip entries that have no block device. */
+ 		if (!cur->bdev)
+ 			continue;
+
+ 		blkcg_schedule_throttle(bdev_get_queue(cur->bdev), true);
+ 		break;
+ 	}
+ 	spin_unlock(&swap_avail_lock);
+ }
+ #endif
+
static int __init swapfile_init(void)
{
int nid;