--- Documentation/block/queue-sysfs.txt | 13 block/Kconfig | 1 block/Makefile | 2 block/blk-core.c | 22 + block/blk-mq-sysfs.c | 47 ++ block/blk-mq.c | 42 ++ block/blk-mq.h | 3 block/blk-settings.c | 15 block/blk-stat.c | 221 +++++++++++ block/blk-stat.h | 18 block/blk-sysfs.c | 151 ++++++++ block/cfq-iosched.c | 13 drivers/scsi/scsi.c | 3 fs/block_dev.c | 2 fs/buffer.c | 2 fs/f2fs/data.c | 2 fs/f2fs/node.c | 2 fs/gfs2/meta_io.c | 3 fs/mpage.c | 9 fs/xfs/xfs_aops.c | 6 include/linux/backing-dev-defs.h | 2 include/linux/blk_types.h | 16 include/linux/blkdev.h | 19 + include/linux/fs.h | 4 include/linux/wbt.h | 120 ++++++ include/linux/writeback.h | 10 include/trace/events/wbt.h | 153 ++++++++ lib/Kconfig | 3 lib/Makefile | 1 lib/wbt.c | 679 ++++++++++++++++++++++++++++++++++++ mm/backing-dev.c | 1 mm/page-writeback.c | 1 32 files changed, 1565 insertions(+), 21 deletions(-) Index: linux-4.7-ck5/include/linux/blk_types.h =================================================================== --- linux-4.7-ck5.orig/include/linux/blk_types.h 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/include/linux/blk_types.h 2016-09-23 08:35:54.701649331 +1000 @@ -161,6 +161,7 @@ enum rq_flag_bits { __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_FLUSH, /* request for cache flush */ + __REQ_BG, /* background activity */ /* bio only flags */ __REQ_RAHEAD, /* read ahead, can fail anytime */ @@ -208,7 +209,7 @@ enum rq_flag_bits { #define REQ_COMMON_MASK \ (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \ REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \ - REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE) + REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE | REQ_BG) #define REQ_CLONE_MASK REQ_COMMON_MASK #define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME) @@ -235,6 +236,7 @@ enum rq_flag_bits { #define REQ_COPY_USER (1ULL << __REQ_COPY_USER) #define REQ_FLUSH (1ULL << __REQ_FLUSH) #define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) +#define REQ_BG (1ULL << __REQ_BG) #define REQ_IO_STAT (1ULL << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) #define REQ_SECURE (1ULL << __REQ_SECURE) @@ -266,4 +268,16 @@ static inline unsigned int blk_qc_t_to_t return cookie & ((1u << BLK_QC_T_SHIFT) - 1); } +#define BLK_RQ_STAT_BATCH 64 + +struct blk_rq_stat { + s64 mean; + u64 min; + u64 max; + s32 nr_samples; + s32 nr_batch; + u64 batch; + s64 time; +}; + #endif /* __LINUX_BLK_TYPES_H */ Index: linux-4.7-ck5/include/linux/fs.h =================================================================== --- linux-4.7-ck5.orig/include/linux/fs.h 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/include/linux/fs.h 2016-09-23 08:35:54.701649331 +1000 @@ -191,6 +191,9 @@ typedef int (dio_iodone_t)(struct kiocb * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded * by a cache flush and data is guaranteed to be on * non-volatile media on completion. + * WRITE_BG Background write. This is for background activity like + * the periodic flush and background threshold writeback + * * */ #define RW_MASK REQ_WRITE @@ -206,6 +209,7 @@ typedef int (dio_iodone_t)(struct kiocb #define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH) #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA) #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) +#define WRITE_BG (WRITE | REQ_NOIDLE | REQ_BG) /* * Attribute flags. 
These should be or-ed together to figure out what Index: linux-4.7-ck5/fs/block_dev.c =================================================================== --- linux-4.7-ck5.orig/fs/block_dev.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/fs/block_dev.c 2016-09-23 08:35:54.701649331 +1000 @@ -445,7 +445,7 @@ int bdev_write_page(struct block_device struct page *page, struct writeback_control *wbc) { int result; - int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; + int rw = wbc_to_write_cmd(wbc); const struct block_device_operations *ops = bdev->bd_disk->fops; if (!ops->rw_page || bdev_get_integrity(bdev)) Index: linux-4.7-ck5/fs/buffer.c =================================================================== --- linux-4.7-ck5.orig/fs/buffer.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/fs/buffer.c 2016-09-23 08:35:54.702649325 +1000 @@ -1697,7 +1697,7 @@ static int __block_write_full_page(struc struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_op = wbc_to_write_cmd(wbc); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); Index: linux-4.7-ck5/fs/f2fs/data.c =================================================================== --- linux-4.7-ck5.orig/fs/f2fs/data.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/fs/f2fs/data.c 2016-09-23 08:35:54.702649325 +1000 @@ -1221,7 +1221,7 @@ static int f2fs_write_data_page(struct p struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .rw = wbc_to_write_cmd(wbc), .page = page, .encrypted_page = NULL, }; Index: linux-4.7-ck5/fs/f2fs/node.c =================================================================== --- linux-4.7-ck5.orig/fs/f2fs/node.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/fs/f2fs/node.c 2016-09-23 08:35:54.703649319 +1000 @@ -1568,7 +1568,7 @@ static int f2fs_write_node_page(struct p struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .rw = wbc_to_write_cmd(wbc), .page = page, .encrypted_page = NULL, }; Index: linux-4.7-ck5/fs/gfs2/meta_io.c =================================================================== --- linux-4.7-ck5.orig/fs/gfs2/meta_io.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/fs/gfs2/meta_io.c 2016-09-23 08:35:54.703649319 +1000 @@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct { struct buffer_head *bh, *head; int nr_underway = 0; - int write_op = REQ_META | REQ_PRIO | - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_op = REQ_META | REQ_PRIO | wbc_to_write_cmd(wbc); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); Index: linux-4.7-ck5/fs/mpage.c =================================================================== --- linux-4.7-ck5.orig/fs/mpage.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/fs/mpage.c 2016-09-23 08:35:54.703649319 +1000 @@ -486,7 +486,6 @@ static int __mpage_writepage(struct page struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; - int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -595,7 +594,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? 
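 * (We do if the page's first block is not contiguous with the last
 * block already in the bio: a bio covers one physically contiguous
 * range on disk, so a gap in the block map forces a submit first.)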
*/ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); alloc_new: if (bio == NULL) { @@ -622,7 +621,7 @@ alloc_new: wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); goto alloc_new; } @@ -632,7 +631,7 @@ alloc_new: set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -644,7 +643,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); Index: linux-4.7-ck5/fs/xfs/xfs_aops.c =================================================================== --- linux-4.7-ck5.orig/fs/xfs/xfs_aops.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/fs/xfs/xfs_aops.c 2016-09-23 08:35:54.703649319 +1000 @@ -451,8 +451,7 @@ xfs_submit_ioend( return status; } - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, - ioend->io_bio); + submit_bio(wbc_to_write_cmd(wbc), ioend->io_bio); return 0; } @@ -510,8 +509,7 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, - ioend->io_bio); + submit_bio(wbc_to_write_cmd(wbc), ioend->io_bio); ioend->io_bio = new; } Index: linux-4.7-ck5/include/linux/writeback.h =================================================================== --- linux-4.7-ck5.orig/include/linux/writeback.h 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/include/linux/writeback.h 2016-09-23 08:35:54.703649319 +1000 @@ -100,6 +100,16 @@ struct writeback_control { #endif }; +static inline int wbc_to_write_cmd(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL) + return WRITE_SYNC; + else if (wbc->for_kupdate || wbc->for_background) + return WRITE_BG; + + return WRITE; +} + /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to * and are measured against each other in. 
There always is one global Index: linux-4.7-ck5/include/linux/backing-dev-defs.h =================================================================== --- linux-4.7-ck5.orig/include/linux/backing-dev-defs.h 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/include/linux/backing-dev-defs.h 2016-09-23 08:35:54.703649319 +1000 @@ -116,6 +116,8 @@ struct bdi_writeback { struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + unsigned long dirty_sleep; /* last wait */ + struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK Index: linux-4.7-ck5/mm/backing-dev.c =================================================================== --- linux-4.7-ck5.orig/mm/backing-dev.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/mm/backing-dev.c 2016-09-23 08:35:54.703649319 +1000 @@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + wb->dirty_sleep = jiffies; wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); if (!wb->congested) Index: linux-4.7-ck5/mm/page-writeback.c =================================================================== --- linux-4.7-ck5.orig/mm/page-writeback.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/mm/page-writeback.c 2016-09-23 08:35:54.704649313 +1000 @@ -1753,6 +1753,7 @@ pause: pause, start_time); __set_current_state(TASK_KILLABLE); + wb->dirty_sleep = now; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; Index: linux-4.7-ck5/block/blk-settings.c =================================================================== --- linux-4.7-ck5.orig/block/blk-settings.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/block/blk-settings.c 2016-09-23 08:35:54.704649313 +1000 @@ -832,6 +832,19 @@ void blk_queue_flush_queueable(struct re EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); /** + * blk_set_queue_depth - tell the block layer about the device queue depth + * @q: the request queue for the device + * @depth: queue depth + * + */ +void blk_set_queue_depth(struct request_queue *q, unsigned int depth) +{ + q->queue_depth = depth; + wbt_set_queue_depth(q->rq_wb, depth); +} +EXPORT_SYMBOL(blk_set_queue_depth); + +/** * blk_queue_write_cache - configure queue's write cache * @q: the request queue for the device * @wc: write back cache on or off @@ -851,6 +864,8 @@ void blk_queue_write_cache(struct reques else queue_flag_clear(QUEUE_FLAG_FUA, q); spin_unlock_irq(q->queue_lock); + + wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); Index: linux-4.7-ck5/drivers/scsi/scsi.c =================================================================== --- linux-4.7-ck5.orig/drivers/scsi/scsi.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/drivers/scsi/scsi.c 2016-09-23 08:35:54.704649313 +1000 @@ -621,6 +621,9 @@ int scsi_change_queue_depth(struct scsi_ wmb(); } + if (sdev->request_queue) + blk_set_queue_depth(sdev->request_queue, depth); + return sdev->queue_depth; } EXPORT_SYMBOL(scsi_change_queue_depth); Index: linux-4.7-ck5/include/linux/blkdev.h =================================================================== --- linux-4.7-ck5.orig/include/linux/blkdev.h 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/include/linux/blkdev.h 2016-09-23 08:35:54.704649313 +1000 @@ -24,6 +24,7 @@ #include #include #include +#include struct module; struct scsi_ioctl_command; @@ -37,6 +38,7 @@ struct bsg_job; 
struct blkcg_gq; struct blk_flush_queue; struct pr_ops; +struct rq_wb; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -153,6 +155,7 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; unsigned long start_time; + struct wb_issue_stat wb_stat; #ifdef CONFIG_BLK_CGROUP struct request_list *rl; /* rl this rq is alloced from */ unsigned long long start_time_ns; @@ -290,6 +293,8 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct rq_wb *rq_wb; + /* * If blkcg is not used, @q->root_rl serves all requests. If blkcg * is used, root blkg allocates from @q->root_rl and all other @@ -315,6 +320,8 @@ struct request_queue { struct blk_mq_ctx __percpu *queue_ctx; unsigned int nr_queues; + unsigned int queue_depth; + /* hw dispatch queues */ struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; @@ -400,6 +407,9 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; + + struct blk_rq_stat rq_stats[2]; + /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the @@ -681,6 +691,14 @@ static inline bool blk_write_same_mergea return false; } +static inline unsigned int blk_queue_depth(struct request_queue *q) +{ + if (q->queue_depth) + return q->queue_depth; + + return q->nr_requests; +} + /* * q->prep_rq_fn return values */ @@ -995,6 +1013,7 @@ extern void blk_limits_io_min(struct que extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); +extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_default_limits(struct queue_limits *lim); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, Index: linux-4.7-ck5/block/Makefile =================================================================== --- linux-4.7-ck5.orig/block/Makefile 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/block/Makefile 2016-09-23 08:35:54.704649313 +1000 @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ Index: linux-4.7-ck5/block/blk-core.c =================================================================== --- linux-4.7-ck5.orig/block/blk-core.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/block/blk-core.c 2016-09-23 08:35:54.705649307 +1000 @@ -33,6 +33,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -880,6 +881,8 @@ blk_init_allocated_queue(struct request_ fail: blk_free_flush_queue(q->fq); + wbt_exit(q->rq_wb); + q->rq_wb = NULL; return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1395,6 +1398,7 @@ void blk_requeue_request(struct request_ blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->wb_stat); if (rq->cmd_flags & REQ_QUEUED) blk_queue_end_tag(q, rq); @@ -1485,6 +1489,8 @@ void __blk_put_request(struct request_qu /* this is a bio leak */ 
WARN_ON(req->bio != NULL); + wbt_done(q->rq_wb, &req->wb_stat); + /* * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools @@ -1715,6 +1721,7 @@ static blk_qc_t blk_queue_bio(struct req int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; + unsigned int wb_acct; /* * low level driver can indicate that it wants pages above a @@ -1767,6 +1774,8 @@ static blk_qc_t blk_queue_bio(struct req } get_rq: + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, q->queue_lock); + /* * This sync check and mask will be re-done in init_request_from_bio(), * but we need to set it earlier to expose the sync flag to the @@ -1782,11 +1791,15 @@ get_rq: */ req = get_request(q, rw_flags, bio, GFP_NOIO); if (IS_ERR(req)) { + if (wb_acct & WBT_TRACKED) + __wbt_done(q->rq_wb); bio->bi_error = PTR_ERR(req); bio_endio(bio); goto out_unlock; } + wbt_track(&req->wb_stat, wb_acct); + /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). @@ -2515,6 +2528,8 @@ void blk_start_request(struct request *r { blk_dequeue_request(req); + wbt_issue(req->q->rq_wb, &req->wb_stat); + /* * We are now handing the request to the hardware, initialize * resid_len to full count and add the timeout handler. @@ -2582,6 +2597,8 @@ bool blk_update_request(struct request * trace_block_rq_complete(req->q, req, nr_bytes); + blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req); + if (!req->bio) return false; @@ -2749,9 +2766,10 @@ void blk_finish_request(struct request * blk_account_io_done(req); - if (req->end_io) + if (req->end_io) { + wbt_done(req->q->rq_wb, &req->wb_stat); req->end_io(req, error); - else { + } else { if (blk_bidi_rq(req)) __blk_put_request(req->next_rq->q, req->next_rq); Index: linux-4.7-ck5/block/blk-mq-sysfs.c =================================================================== --- linux-4.7-ck5.orig/block/blk-mq-sysfs.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/block/blk-mq-sysfs.c 2016-09-23 08:35:54.705649307 +1000 @@ -247,6 +247,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show return ret; } +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + unsigned int i; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_init(&ctx->stat[0]); + blk_stat_init(&ctx->stat[1]); + } +} + +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t count) +{ + blk_mq_stat_clear(hctx); + return count; +} + +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_stat_init(&stat[0]); + blk_stat_init(&stat[1]); + + blk_hctx_stat_get(hctx, stat); + + ret = print_stat(page, &stat[0], "read :"); + ret += print_stat(page + ret, &stat[1], "write:"); + return ret; +} + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_sysfs_dispatched_show, @@ -304,6 +345,11 @@ static struct blk_mq_hw_ctx_sysfs_entry .attr = {.name = "io_poll", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_poll_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = { + .attr = {.name = 
"stats", .mode = S_IRUGO | S_IWUSR }, + .show = blk_mq_hw_sysfs_stat_show, + .store = blk_mq_hw_sysfs_stat_store, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, @@ -314,6 +360,7 @@ static struct attribute *default_hw_ctx_ &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, &blk_mq_hw_sysfs_poll.attr, + &blk_mq_hw_sysfs_stat.attr, NULL, }; Index: linux-4.7-ck5/block/blk-mq.c =================================================================== --- linux-4.7-ck5.orig/block/blk-mq.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/block/blk-mq.c 2016-09-23 08:35:54.705649307 +1000 @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -29,6 +30,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-stat.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -274,6 +276,8 @@ static void __blk_mq_free_request(struct if (rq->cmd_flags & REQ_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + + wbt_done(q->rq_wb, &rq->wb_stat); rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); @@ -306,6 +310,7 @@ inline void __blk_mq_end_request(struct blk_account_io_done(rq); if (rq->end_io) { + wbt_done(rq->q->rq_wb, &rq->wb_stat); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) @@ -356,10 +361,19 @@ static void blk_mq_ipi_complete_request( put_cpu(); } +static void blk_mq_stat_add(struct request *rq) +{ + struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)]; + + blk_stat_add(stat, rq); +} + static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_stat_add(rq); + if (!q->softirq_done_fn) blk_mq_end_request(rq, rq->errors); else @@ -403,6 +417,8 @@ void blk_mq_start_request(struct request if (unlikely(blk_bidi_rq(rq))) rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + wbt_issue(q->rq_wb, &rq->wb_stat); + blk_add_timer(rq); /* @@ -438,6 +454,7 @@ static void __blk_mq_requeue_request(str struct request_queue *q = rq->q; trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->wb_stat); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { if (q->dma_drain_size && blk_rq_bytes(rq)) @@ -1252,6 +1269,7 @@ static blk_qc_t blk_mq_make_request(stru struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; + unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1266,9 +1284,16 @@ static blk_qc_t blk_mq_make_request(stru blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return BLK_QC_T_NONE; + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct & WBT_TRACKED) + __wbt_done(q->rq_wb); return BLK_QC_T_NONE; + } + + wbt_track(&rq->wb_stat, wb_acct); cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -1345,6 +1370,7 @@ static blk_qc_t blk_sq_make_request(stru struct blk_map_ctx data; struct request *rq; blk_qc_t cookie; + unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1361,9 +1387,16 @@ static blk_qc_t blk_sq_make_request(stru } else request_count = blk_plug_queued_count(q); + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct & WBT_TRACKED) + __wbt_done(q->rq_wb); return BLK_QC_T_NONE; + } + + wbt_track(&rq->wb_stat, wb_acct); cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -1759,6 +1792,8 @@ static void blk_mq_init_cpu_queues(struc spin_lock_init(&__ctx->lock); 
INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; + blk_stat_init(&__ctx->stat[0]); + blk_stat_init(&__ctx->stat[1]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) @@ -2097,6 +2132,9 @@ void blk_mq_free_queue(struct request_qu list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); + wbt_exit(q->rq_wb); + q->rq_wb = NULL; + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); Index: linux-4.7-ck5/block/blk-mq.h =================================================================== --- linux-4.7-ck5.orig/block/blk-mq.h 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/block/blk-mq.h 2016-09-23 08:35:54.705649307 +1000 @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +#include "blk-stat.h" + struct blk_mq_tag_set; struct blk_mq_ctx { @@ -20,6 +22,7 @@ struct blk_mq_ctx { /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; Index: linux-4.7-ck5/block/blk-stat.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-4.7-ck5/block/blk-stat.c 2016-09-23 08:35:54.705649307 +1000 @@ -0,0 +1,221 @@ +/* + * Block stat tracking code + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include + +#include "blk-stat.h" +#include "blk-mq.h" + +static void blk_stat_flush_batch(struct blk_rq_stat *stat) +{ + if (!stat->nr_batch) + return; + if (!stat->nr_samples) + stat->mean = div64_s64(stat->batch, stat->nr_batch); + else { + stat->mean = div64_s64((stat->mean * stat->nr_samples) + + stat->batch, + stat->nr_samples + stat->nr_batch); + } + + stat->nr_samples += stat->nr_batch; + stat->nr_batch = stat->batch = 0; +} + +void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +{ + if (!src->nr_samples) + return; + + blk_stat_flush_batch(src); + + dst->min = min(dst->min, src->min); + dst->max = max(dst->max, src->max); + + if (!dst->nr_samples) + dst->mean = src->mean; + else { + dst->mean = div64_s64((src->mean * src->nr_samples) + + (dst->mean * dst->nr_samples), + dst->nr_samples + src->nr_samples); + } + dst->nr_samples += src->nr_samples; +} + +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + uint64_t latest = 0; + int i, j, nr; + + blk_stat_init(&dst[0]); + blk_stat_init(&dst[1]); + + nr = 0; + do { + uint64_t newest = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (!ctx->stat[0].nr_samples && + !ctx->stat[1].nr_samples) + continue; + if (ctx->stat[0].time > newest) + newest = ctx->stat[0].time; + if (ctx->stat[1].time > newest) + newest = ctx->stat[1].time; + } + } + + /* + * No samples + */ + if (!newest) + break; + + if (newest > latest) + latest = newest; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (ctx->stat[0].time == newest) { + blk_stat_sum(&dst[0], &ctx->stat[0]); + nr++; + } + if (ctx->stat[1].time == newest) { + blk_stat_sum(&dst[1], &ctx->stat[1]); + nr++; + } + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare. 
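+	 * (The race: a completion landing in a newer time window between
+	 * our two scans re-initializes that ctx's stats, so nothing
+	 * matches the 'newest' time we picked and nr stays zero.)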
+ */ + } while (!nr); + + dst[0].time = dst[1].time = latest; +} + +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + if (q->mq_ops) + blk_mq_stat_get(q, dst); + else { + memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat)); + memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat)); + } +} + +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +{ + struct blk_mq_ctx *ctx; + unsigned int i, nr; + + nr = 0; + do { + uint64_t newest = 0; + + hctx_for_each_ctx(hctx, ctx, i) { + if (!ctx->stat[0].nr_samples && + !ctx->stat[1].nr_samples) + continue; + + if (ctx->stat[0].time > newest) + newest = ctx->stat[0].time; + if (ctx->stat[1].time > newest) + newest = ctx->stat[1].time; + } + + if (!newest) + break; + + hctx_for_each_ctx(hctx, ctx, i) { + if (ctx->stat[0].time == newest) { + blk_stat_sum(&dst[0], &ctx->stat[0]); + nr++; + } + if (ctx->stat[1].time == newest) { + blk_stat_sum(&dst[1], &ctx->stat[1]); + nr++; + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare, as the window is only updated + * occasionally + */ + } while (!nr); +} + +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->batch = stat->nr_batch = 0; + stat->time = time_now & BLK_STAT_MASK; +} + +void blk_stat_init(struct blk_rq_stat *stat) +{ + __blk_stat_init(stat, ktime_to_ns(ktime_get())); +} + +static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now) +{ + return (now & BLK_STAT_MASK) == (stat->time & BLK_STAT_MASK); +} + +bool blk_stat_is_current(struct blk_rq_stat *stat) +{ + return __blk_stat_is_current(stat, ktime_to_ns(ktime_get())); +} + +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +{ + s64 now, value; + u64 rq_time = wbt_issue_stat_get_time(&rq->wb_stat); + + now = ktime_to_ns(ktime_get()); + if (now < rq_time) + return; + + if (!__blk_stat_is_current(stat, now)) + __blk_stat_init(stat, now); + + value = now - rq_time; + if (value > stat->max) + stat->max = value; + if (value < stat->min) + stat->min = value; + + if (stat->batch + value < stat->batch || + stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) + blk_stat_flush_batch(stat); + + stat->batch += value; + stat->nr_batch++; +} + +void blk_stat_clear(struct request_queue *q) +{ + if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_init(&ctx->stat[0]); + blk_stat_init(&ctx->stat[1]); + } + } + } else { + blk_stat_init(&q->rq_stats[0]); + blk_stat_init(&q->rq_stats[1]); + } +} Index: linux-4.7-ck5/block/blk-stat.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-4.7-ck5/block/blk-stat.h 2016-09-23 08:35:54.705649307 +1000 @@ -0,0 +1,18 @@ +#ifndef BLK_STAT_H +#define BLK_STAT_H + +/* + * ~0.13s window as a power-of-2 (2^27 nsecs) + */ +#define BLK_STAT_NSEC 134217728ULL +#define BLK_STAT_MASK ~(BLK_STAT_NSEC - 1) + +void blk_stat_add(struct blk_rq_stat *, struct request *); +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); +void blk_stat_clear(struct request_queue *q); +void blk_stat_init(struct blk_rq_stat *); +void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *); +bool blk_stat_is_current(struct blk_rq_stat *); + +#endif Index: 
linux-4.7-ck5/block/blk-sysfs.c =================================================================== --- linux-4.7-ck5.orig/block/blk-sysfs.c 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/block/blk-sysfs.c 2016-09-23 08:35:54.706649301 +1000 @@ -10,6 +10,7 @@ #include #include #include +#include #include "blk.h" #include "blk-mq.h" @@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, cons return count; } +static ssize_t queue_var_store64(u64 *var, const char *page) +{ + int err; + u64 v; + + err = kstrtou64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + static ssize_t queue_requests_show(struct request_queue *q, char *page) { return queue_var_show(q->nr_requests, (page)); @@ -347,6 +361,58 @@ static ssize_t queue_poll_store(struct r return ret; } +static ssize_t queue_wb_win_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000)); +} + +static ssize_t queue_wb_win_store(struct request_queue *q, const char *page, + size_t count) +{ + ssize_t ret; + u64 val; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + + q->rq_wb->win_nsec = val * 1000ULL; + wbt_update_limits(q->rq_wb); + return count; +} + +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + ssize_t ret; + u64 val; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + + q->rq_wb->min_lat_nsec = val * 1000ULL; + wbt_update_limits(q->rq_wb); + return count; +} + static ssize_t queue_wc_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) @@ -379,6 +445,26 @@ static ssize_t queue_wc_store(struct req return count; } +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t queue_stats_show(struct request_queue *q, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_queue_stat_get(q, stat); + + ret = print_stat(page, &stat[0], "read :"); + ret += print_stat(page + ret, &stat[1], "write:"); + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -516,6 +602,23 @@ static struct queue_sysfs_entry queue_wc .store = queue_wc_store, }; +static struct queue_sysfs_entry queue_stats_entry = { + .attr = {.name = "stats", .mode = S_IRUGO }, + .show = queue_stats_show, +}; + +static struct queue_sysfs_entry queue_wb_lat_entry = { + .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_lat_show, + .store = queue_wb_lat_store, +}; + +static struct queue_sysfs_entry queue_wb_win_entry = { + .attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_win_show, + .store = queue_wb_win_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -542,6 +645,9 @@ static struct attribute *default_attrs[] &queue_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, + 
&queue_stats_entry.attr, + &queue_wb_lat_entry.attr, + &queue_wb_win_entry.attr, NULL, }; @@ -656,6 +762,49 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat) +{ + blk_queue_stat_get(data, stat); +} + +static void blk_wb_stat_clear(void *data) +{ + blk_stat_clear(data); +} + +static bool blk_wb_stat_is_current(struct blk_rq_stat *stat) +{ + return blk_stat_is_current(stat); +} + +static struct wb_stat_ops wb_stat_ops = { + .get = blk_wb_stat_get, + .is_current = blk_wb_stat_is_current, + .clear = blk_wb_stat_clear, +}; + +static void blk_wb_init(struct request_queue *q) +{ + struct rq_wb *rwb; + + rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q); + + /* + * If this fails, we don't get throttling + */ + if (IS_ERR(rwb)) + return; + + if (blk_queue_nonrot(q)) + rwb->min_lat_nsec = 2000000ULL; + else + rwb->min_lat_nsec = 75000000ULL; + + wbt_set_queue_depth(rwb, blk_queue_depth(q)); + wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + q->rq_wb = rwb; +} + int blk_register_queue(struct gendisk *disk) { int ret; @@ -695,6 +844,8 @@ int blk_register_queue(struct gendisk *d if (q->mq_ops) blk_mq_register_disk(disk); + blk_wb_init(q); + if (!q->request_fn) return 0; Index: linux-4.7-ck5/include/linux/wbt.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-4.7-ck5/include/linux/wbt.h 2016-09-23 08:35:54.706649301 +1000 @@ -0,0 +1,120 @@ +#ifndef WB_THROTTLE_H +#define WB_THROTTLE_H + +#include +#include +#include +#include + +enum { + ISSUE_STAT_TRACKED = 1ULL << 63, + ISSUE_STAT_READ = 1ULL << 62, + ISSUE_STAT_MASK = ISSUE_STAT_TRACKED | ISSUE_STAT_READ, + ISSUE_STAT_TIME_MASK = ~ISSUE_STAT_MASK, + + WBT_TRACKED = 1, + WBT_READ = 2, +}; + +struct wb_issue_stat { + u64 time; +}; + +static inline void wbt_issue_stat_set_time(struct wb_issue_stat *stat) +{ + stat->time = (stat->time & ISSUE_STAT_MASK) | + (ktime_to_ns(ktime_get()) & ISSUE_STAT_TIME_MASK); +} + +static inline u64 wbt_issue_stat_get_time(struct wb_issue_stat *stat) +{ + return stat->time & ISSUE_STAT_TIME_MASK; +} + +static inline void wbt_mark_tracked(struct wb_issue_stat *stat) +{ + stat->time |= ISSUE_STAT_TRACKED; +} + +static inline void wbt_clear_state(struct wb_issue_stat *stat) +{ + stat->time &= ~(ISSUE_STAT_TRACKED | ISSUE_STAT_READ); +} + +static inline bool wbt_tracked(struct wb_issue_stat *stat) +{ + return (stat->time & ISSUE_STAT_TRACKED) != 0; +} + +static inline void wbt_mark_read(struct wb_issue_stat *stat) +{ + stat->time |= ISSUE_STAT_READ; +} + +static inline bool wbt_is_read(struct wb_issue_stat *stat) +{ + return (stat->time & ISSUE_STAT_READ) != 0; +} + +struct wb_stat_ops { + void (*get)(void *, struct blk_rq_stat *); + bool (*is_current)(struct blk_rq_stat *); + void (*clear)(void *); +}; + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + unsigned int wb_max; /* max throughput writeback */ + int scale_step; + bool scaled_max; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + /* + * Number of consecutive periods where we don't have enough + * information to make a firm scale up/down decision. 
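+	 * Once this reaches RWB_UNKNOWN_BUMP, the window timer nudges
+	 * the scale step back towards zero; see wb_timer_fn().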
+ */ + unsigned int unknown_cnt; + + struct timer_list window_timer; + + s64 sync_issue; + void *sync_cookie; + + unsigned int wc; + unsigned int queue_depth; + + unsigned long last_issue; /* last non-throttled issue */ + unsigned long last_comp; /* last non-throttled comp */ + unsigned long min_lat_nsec; + struct backing_dev_info *bdi; + struct request_queue *q; + wait_queue_head_t wait; + atomic_t inflight; + + struct wb_stat_ops *stat_ops; + void *ops_data; +}; + +struct backing_dev_info; + +void __wbt_done(struct rq_wb *); +void wbt_done(struct rq_wb *, struct wb_issue_stat *); +unsigned int wbt_wait(struct rq_wb *, unsigned int, spinlock_t *); +struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *); +void wbt_exit(struct rq_wb *); +void wbt_update_limits(struct rq_wb *); +void wbt_requeue(struct rq_wb *, struct wb_issue_stat *); +void wbt_issue(struct rq_wb *, struct wb_issue_stat *); +void wbt_disable(struct rq_wb *); +void wbt_track(struct wb_issue_stat *, unsigned int); + +void wbt_set_queue_depth(struct rq_wb *, unsigned int); +void wbt_set_write_cache(struct rq_wb *, bool); + +#endif Index: linux-4.7-ck5/include/trace/events/wbt.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-4.7-ck5/include/trace/events/wbt.h 2016-09-23 08:35:54.706649301 +1000 @@ -0,0 +1,153 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM wbt + +#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WBT_H + +#include +#include + +/** + * wbt_stat - trace stats for blk_wb + * @stat: array of read/write stats + */ +TRACE_EVENT(wbt_stat, + + TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat), + + TP_ARGS(bdi, stat), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(s64, rmean) + __field(u64, rmin) + __field(u64, rmax) + __field(s64, rnr_samples) + __field(s64, rtime) + __field(s64, wmean) + __field(u64, wmin) + __field(u64, wmax) + __field(s64, wnr_samples) + __field(s64, wtime) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->rmean = stat[0].mean; + __entry->rmin = stat[0].min; + __entry->rmax = stat[0].max; + __entry->rnr_samples = stat[0].nr_samples; + __entry->wmean = stat[1].mean; + __entry->wmin = stat[1].min; + __entry->wmax = stat[1].max; + __entry->wnr_samples = stat[1].nr_samples; + ), + + TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, " + "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n", + __entry->name, __entry->rmean, __entry->rmin, __entry->rmax, + __entry->rnr_samples, __entry->wmean, __entry->wmin, + __entry->wmax, __entry->wnr_samples) +); + +/** + * wbt_lat - trace latency event + * @lat: latency trigger + */ +TRACE_EVENT(wbt_lat, + + TP_PROTO(struct backing_dev_info *bdi, unsigned long lat), + + TP_ARGS(bdi, lat), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned long, lat) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->lat = div_u64(lat, 1000); + ), + + TP_printk("%s: latency %lluus\n", __entry->name, + (unsigned long long) __entry->lat) +); + +/** + * wbt_step - trace wb event step + * @msg: context message + * @step: the current scale step count + * @window: the current monitoring window + * @bg: the current background queue limit + * @normal: the current normal writeback limit + * @max: the current max throughput writeback limit + */ +TRACE_EVENT(wbt_step, + + TP_PROTO(struct backing_dev_info *bdi, const char *msg, + int step, 
unsigned long window, unsigned int bg, + unsigned int normal, unsigned int max), + + TP_ARGS(bdi, msg, step, window, bg, normal, max), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(const char *, msg) + __field(int, step) + __field(unsigned long, window) + __field(unsigned int, bg) + __field(unsigned int, normal) + __field(unsigned int, max) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->msg = msg; + __entry->step = step; + __entry->window = div_u64(window, 1000); + __entry->bg = bg; + __entry->normal = normal; + __entry->max = max; + ), + + TP_printk("%s: %s: step=%d, window=%luus, background=%u, normal=%u, max=%u\n", + __entry->name, __entry->msg, __entry->step, __entry->window, + __entry->bg, __entry->normal, __entry->max) +); + +/** + * wbt_timer - trace wb timer event + * @status: timer state status + * @step: the current scale step count + * @inflight: tracked writes inflight + */ +TRACE_EVENT(wbt_timer, + + TP_PROTO(struct backing_dev_info *bdi, unsigned int status, + int step, unsigned int inflight), + + TP_ARGS(bdi, status, step, inflight), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned int, status) + __field(int, step) + __field(unsigned int, inflight) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->status = status; + __entry->step = step; + __entry->inflight = inflight; + ), + + TP_printk("%s: status=%u, step=%d, inflight=%u\n", __entry->name, + __entry->status, __entry->step, __entry->inflight) +); + +#endif /* _TRACE_WBT_H */ + +/* This part must be outside protection */ +#include Index: linux-4.7-ck5/lib/Kconfig =================================================================== --- linux-4.7-ck5.orig/lib/Kconfig 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/lib/Kconfig 2016-09-23 08:35:54.706649301 +1000 @@ -550,4 +550,7 @@ config STACKDEPOT bool select STACKTRACE +config WBT + bool + endmenu Index: linux-4.7-ck5/lib/Makefile =================================================================== --- linux-4.7-ck5.orig/lib/Makefile 2016-09-23 08:35:54.709649282 +1000 +++ linux-4.7-ck5/lib/Makefile 2016-09-23 08:35:54.706649301 +1000 @@ -183,6 +183,7 @@ obj-$(CONFIG_SG_SPLIT) += sg_split.o obj-$(CONFIG_SG_POOL) += sg_pool.o obj-$(CONFIG_STMP_DEVICE) += stmp_device.o obj-$(CONFIG_IRQ_POLL) += irq_poll.o +obj-$(CONFIG_WBT) += wbt.o obj-$(CONFIG_STACKDEPOT) += stackdepot.o KASAN_SANITIZE_stackdepot.o := n Index: linux-4.7-ck5/lib/wbt.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-4.7-ck5/lib/wbt.c 2016-09-23 08:35:54.706649301 +1000 @@ -0,0 +1,679 @@ +/* + * buffered writeback throttling. loosely based on CoDel. We can't drop + * packets for IO scheduling, so the logic is something like this: + * + * - Monitor latencies in a defined window of time. + * - If the minimum latency in the above window exceeds some target, increment + * scaling step and scale down queue depth by a factor of 2x. The monitoring + * window is then shrunk to 100 / sqrt(scaling step + 1). + * - For any window where we don't have solid data on what the latencies + * look like, retain status quo. + * - If latencies look good, decrement scaling step. + * - If we're only doing writes, allow the scaling step to go negative. This + * will temporarily boost write performance, snapping back to a stable + * scaling step of 0 if reads show up or the heavy writers finish. 
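+ *   (With the default 100 msec window, that shrink works out to
+ *   roughly 71 msec at step 1 and 50 msec at step 3.)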
Unlike + * positive scaling steps where we shrink the monitoring window, a negative + * scaling step retains the default step==0 window size. + * + * Copyright (C) 2016 Jens Axboe + * + */ +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +enum { + /* + * Default setting, we'll scale up (to 75% of QD max) or down (min 1) + * from here depending on device stats + */ + RWB_DEF_DEPTH = 16, + + /* + * 100msec window + */ + RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, + + /* + * Disregard stats, if we don't meet this minimum + */ + RWB_MIN_WRITE_SAMPLES = 3, + + /* + * If we have this number of consecutive windows with not enough + * information to scale up or down, scale up. + */ + RWB_UNKNOWN_BUMP = 5, +}; + +static inline bool rwb_enabled(struct rq_wb *rwb) +{ + return rwb && rwb->wb_normal != 0; +} + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. + */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) +{ + if (rwb_enabled(rwb)) { + const unsigned long cur = jiffies; + + if (cur != *var) + *var = cur; + } +} + +/* + * If a task was rate throttled in balance_dirty_pages() within the last + * second or so, use that to indicate a higher cleaning rate. + */ +static bool wb_recent_wait(struct rq_wb *rwb) +{ + struct bdi_writeback *wb = &rwb->bdi->wb; + + return time_before(jiffies, wb->dirty_sleep + HZ); +} + +void __wbt_done(struct rq_wb *rwb) +{ + int inflight, limit; + + inflight = atomic_dec_return(&rwb->inflight); + + /* + * wbt got disabled with IO in flight. Wake up any potential + * waiters, we don't have to do more than that. + */ + if (unlikely(!rwb_enabled(rwb))) { + wake_up_all(&rwb->wait); + return; + } + + /* + * If the device does write back caching, drop further down + * before we wake people up. + */ + if (rwb->wc && !wb_recent_wait(rwb)) + limit = 0; + else + limit = rwb->wb_normal; + + /* + * Don't wake anyone up if we are above the normal limit. + */ + if (inflight && inflight >= limit) + return; + + if (waitqueue_active(&rwb->wait)) { + int diff = limit - inflight; + + if (!inflight || diff >= rwb->wb_background / 2) + wake_up(&rwb->wait); + } +} + +/* + * Called on completion of a request. Note that it's also called when + * a request is merged, when the request gets freed. + */ +void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat) +{ + if (!rwb) + return; + + if (!wbt_tracked(stat)) { + if (rwb->sync_cookie == stat) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } + + if (wbt_is_read(stat)) + wb_timestamp(rwb, &rwb->last_comp); + wbt_clear_state(stat); + } else { + WARN_ON_ONCE(stat == rwb->sync_cookie); + __wbt_done(rwb); + wbt_clear_state(stat); + } +} + +/* + * Return true, if we can't increase the depth further by scaling + */ +static bool calc_wb_limits(struct rq_wb *rwb) +{ + unsigned int depth; + bool ret = false; + + if (!rwb->min_lat_nsec) { + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; + return false; + } + + /* + * For QD=1 devices, this is a special case. It's important for those + * to have one request ready when one completes, so force a depth of + * 2 for those devices. 
On the backend, it'll be a depth of 1 anyway, + * since the device can't have more than that in flight. If we're + * scaling down, then keep a setting of 1/1/1. + */ + if (rwb->queue_depth == 1) { + if (rwb->scale_step > 0) + rwb->wb_max = rwb->wb_normal = 1; + else { + rwb->wb_max = rwb->wb_normal = 2; + ret = true; + } + rwb->wb_background = 1; + } else { + /* + * scale_step == 0 is our default state. If we have suffered + * latency spikes, step will be > 0, and we shrink the + * allowed write depths. If step is < 0, we're only doing + * writes, and we allow a temporarily higher depth to + * increase performance. + */ + depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); + if (rwb->scale_step > 0) + depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); + else if (rwb->scale_step < 0) { + unsigned int maxd = 3 * rwb->queue_depth / 4; + + depth = 1 + ((depth - 1) << -rwb->scale_step); + if (depth > maxd) { + depth = maxd; + ret = true; + } + } + + /* + * Set our max/normal/bg queue depths based on how far + * we have scaled down (->scale_step). + */ + rwb->wb_max = depth; + rwb->wb_normal = (rwb->wb_max + 1) / 2; + rwb->wb_background = (rwb->wb_max + 3) / 4; + } + + return ret; +} + +static bool inline stat_sample_valid(struct blk_rq_stat *stat) +{ + /* + * We need at least one read sample, and a minimum of + * RWB_MIN_WRITE_SAMPLES. We require some write samples to know + * that it's writes impacting us, and not just some sole read on + * a device that is in a lower power state. + */ + return stat[0].nr_samples >= 1 && + stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES; +} + +static u64 rwb_sync_issue_lat(struct rq_wb *rwb) +{ + u64 now, issue = ACCESS_ONCE(rwb->sync_issue); + + if (!issue || !rwb->sync_cookie) + return 0; + + now = ktime_to_ns(ktime_get()); + return now - issue; +} + +enum { + LAT_OK = 1, + LAT_UNKNOWN, + LAT_UNKNOWN_WRITES, + LAT_EXCEEDED, +}; + +static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +{ + u64 thislat; + + /* + * If our stored sync issue exceeds the window size, or it + * exceeds our min target AND we haven't logged any entries, + * flag the latency as exceeded. wbt works off completion latencies, + * but for a flooded device, a single sync IO can take a long time + * to complete after being issued. If this time exceeds our + * monitoring window AND we didn't see any other completions in that + * window, then count that sync IO as a violation of the latency. + */ + thislat = rwb_sync_issue_lat(rwb); + if (thislat > rwb->cur_win_nsec || + (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) { + trace_wbt_lat(rwb->bdi, thislat); + return LAT_EXCEEDED; + } + + /* + * No read/write mix, if stat isn't valid + */ + if (!stat_sample_valid(stat)) { + /* + * If we had writes in this stat window and the window is + * current, we're only doing writes. If a task recently + * waited or still has writes in flights, consider us doing + * just writes as well. + */ + if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) || + wb_recent_wait(rwb) || atomic_read(&rwb->inflight)) + return LAT_UNKNOWN_WRITES; + return LAT_UNKNOWN; + } + + /* + * If the 'min' latency exceeds our target, step down. 
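+	 * Like CoDel, we judge by the windowed minimum rather than the
+	 * mean: if even the fastest read in the window missed the target,
+	 * the queue is genuinely too deep for the device.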
+	 */
+	if (stat[0].min > rwb->min_lat_nsec) {
+		trace_wbt_lat(rwb->bdi, stat[0].min);
+		trace_wbt_stat(rwb->bdi, stat);
+		return LAT_EXCEEDED;
+	}
+
+	if (rwb->scale_step)
+		trace_wbt_stat(rwb->bdi, stat);
+
+	return LAT_OK;
+}
+
+static int latency_exceeded(struct rq_wb *rwb)
+{
+	struct blk_rq_stat stat[2];
+
+	rwb->stat_ops->get(rwb->ops_data, stat);
+	return __latency_exceeded(rwb, stat);
+}
+
+static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
+{
+	trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
+			rwb->wb_background, rwb->wb_normal, rwb->wb_max);
+}
+
+static void scale_up(struct rq_wb *rwb)
+{
+	/*
+	 * Hit max in previous round, stop here
+	 */
+	if (rwb->scaled_max)
+		return;
+
+	rwb->scale_step--;
+	rwb->unknown_cnt = 0;
+	rwb->stat_ops->clear(rwb->ops_data);
+
+	rwb->scaled_max = calc_wb_limits(rwb);
+
+	if (waitqueue_active(&rwb->wait))
+		wake_up_all(&rwb->wait);
+
+	rwb_trace_step(rwb, "step up");
+}
+
+/*
+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
+ * had a latency violation.
+ */
+static void scale_down(struct rq_wb *rwb, bool hard_throttle)
+{
+	/*
+	 * Stop scaling down when we've hit the limit. This also prevents
+	 * ->scale_step from going to crazy values, if the device can't
+	 * keep up.
+	 */
+	if (rwb->wb_max == 1)
+		return;
+
+	if (rwb->scale_step < 0 && hard_throttle)
+		rwb->scale_step = 0;
+	else
+		rwb->scale_step++;
+
+	rwb->scaled_max = false;
+	rwb->unknown_cnt = 0;
+	rwb->stat_ops->clear(rwb->ops_data);
+	calc_wb_limits(rwb);
+	rwb_trace_step(rwb, "step down");
+}
+
+static void rwb_arm_timer(struct rq_wb *rwb)
+{
+	unsigned long expires;
+
+	if (rwb->scale_step > 0) {
+		/*
+		 * We should speed this up, using some variant of a fast
+		 * integer inverse square root calculation. Since we only do
+		 * this for every window expiration, it's not a huge deal,
+		 * though.
+		 */
+		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
+					int_sqrt((rwb->scale_step + 1) << 8));
+	} else {
+		/*
+		 * For step < 0, we don't want to increase/decrease the
+		 * window size.
+		 */
+		rwb->cur_win_nsec = rwb->win_nsec;
+	}
+
+	expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
+	mod_timer(&rwb->window_timer, expires);
+}
+
+static void wb_timer_fn(unsigned long data)
+{
+	struct rq_wb *rwb = (struct rq_wb *) data;
+	int status, inflight;
+
+	inflight = atomic_read(&rwb->inflight);
+
+	status = latency_exceeded(rwb);
+
+	trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight);
+
+	/*
+	 * If we exceeded the latency target, step down. If we did not,
+	 * step one level up. If we don't know enough to say either exceeded
+	 * or ok, then don't do anything.
+	 */
+	switch (status) {
+	case LAT_EXCEEDED:
+		scale_down(rwb, true);
+		break;
+	case LAT_OK:
+		scale_up(rwb);
+		break;
+	case LAT_UNKNOWN_WRITES:
+		scale_up(rwb);
+		break;
+	case LAT_UNKNOWN:
+		if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
+			break;
+		/*
+		 * We get here for two reasons:
+		 *
+		 * 1) We previously scaled down the depth, and we currently
+		 *    don't have a valid read/write sample. For that case,
+		 *    slowly return to center state (step == 0).
+		 * 2) We started at the center step, but don't have a valid
+		 *    read/write sample, but we do have writes going on.
+		 *    Allow step to go negative, to increase write perf.
+ */ + if (rwb->scale_step > 0) + scale_up(rwb); + else if (rwb->scale_step < 0) + scale_down(rwb, false); + break; + default: + break; + } + + /* + * Re-arm timer, if we have IO in flight + */ + if (rwb->scale_step || inflight) + rwb_arm_timer(rwb); +} + +void wbt_update_limits(struct rq_wb *rwb) +{ + rwb->scale_step = 0; + rwb->scaled_max = false; + calc_wb_limits(rwb); + + if (waitqueue_active(&rwb->wait)) + wake_up_all(&rwb->wait); +} + +static bool close_io(struct rq_wb *rwb) +{ + const unsigned long now = jiffies; + + return time_before(now, rwb->last_issue + HZ / 10) || + time_before(now, rwb->last_comp + HZ / 10); +} + +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) + +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) +{ + unsigned int limit; + + /* + * At this point we know it's a buffered write. If REQ_SYNC is + * set, then it's WB_SYNC_ALL writeback, and we'll use the max + * limit for that. If the write is marked as a background write, + * then use the idle limit, or go to normal if we haven't had + * competing IO for a bit. + */ + if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb)) + limit = rwb->wb_max; + else if ((rw & REQ_BG) || close_io(rwb)) { + /* + * If less than 100ms since we completed unrelated IO, + * limit us to half the depth for background writeback. + */ + limit = rwb->wb_background; + } else + limit = rwb->wb_normal; + + return limit; +} + +static inline bool may_queue(struct rq_wb *rwb, unsigned long rw) +{ + /* + * inc it here even if disabled, since we'll dec it at completion. + * this only happens if the task was sleeping in __wbt_wait(), + * and someone turned it off at the same time. + */ + if (!rwb_enabled(rwb)) { + atomic_inc(&rwb->inflight); + return true; + } + + return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw)); +} + +/* + * Block if we will exceed our limit, or if we are currently waiting for + * the timer to kick off queuing again. + */ +static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) +{ + DEFINE_WAIT(wait); + + if (may_queue(rwb, rw)) + return; + + do { + prepare_to_wait_exclusive(&rwb->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (may_queue(rwb, rw)) + break; + + if (lock) + spin_unlock_irq(lock); + + io_schedule(); + + if (lock) + spin_lock_irq(lock); + } while (1); + + finish_wait(&rwb->wait, &wait); +} + +static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw) +{ + /* + * If not a WRITE (or a discard), do nothing + */ + if (!(rw & (REQ_WRITE | REQ_DISCARD))) + return false; + + /* + * Don't throttle WRITE_ODIRECT + */ + if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC) + return false; + + return true; +} + +/* + * Returns true if the IO request should be accounted, false if not. + * May sleep, if we have exceeded the writeback limits. Caller can pass + * in an irq held spinlock, if it holds one when calling this function. + * If we do sleep, we'll release and re-grab it. 
+ */
+unsigned int wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
+{
+	unsigned int ret = 0;
+
+	if (!rwb_enabled(rwb))
+		return 0;
+
+	if (!(rw & (REQ_WRITE | REQ_DISCARD)))
+		ret = WBT_READ;
+
+	if (!wbt_should_throttle(rwb, rw)) {
+		if (ret & WBT_READ)
+			wb_timestamp(rwb, &rwb->last_issue);
+		return ret;
+	}
+
+	__wbt_wait(rwb, rw, lock);
+
+	if (!timer_pending(&rwb->window_timer))
+		rwb_arm_timer(rwb);
+
+	return ret | WBT_TRACKED;
+}
+
+void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
+{
+	if (!rwb_enabled(rwb))
+		return;
+
+	wbt_issue_stat_set_time(stat);
+
+	/*
+	 * Track the issue time of a sync IO, so we can react more quickly
+	 * if it takes a long time to complete. Note that this is just a
+	 * hint. 'stat' can go away when the request completes, so it's
+	 * important we never dereference it. We only use the address to
+	 * compare with, which is why we store the sync_issue time locally.
+	 */
+	if (wbt_is_read(stat) && !rwb->sync_issue) {
+		rwb->sync_cookie = stat;
+		rwb->sync_issue = wbt_issue_stat_get_time(stat);
+	}
+}
+
+void wbt_track(struct wb_issue_stat *stat, unsigned int wb_acct)
+{
+	if (wb_acct & WBT_TRACKED)
+		wbt_mark_tracked(stat);
+	else if (wb_acct & WBT_READ)
+		wbt_mark_read(stat);
+}
+
+void wbt_requeue(struct rq_wb *rwb, struct wb_issue_stat *stat)
+{
+	if (!rwb_enabled(rwb))
+		return;
+	if (stat == rwb->sync_cookie) {
+		rwb->sync_issue = 0;
+		rwb->sync_cookie = NULL;
+	}
+}
+
+void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+{
+	if (rwb) {
+		rwb->queue_depth = depth;
+		wbt_update_limits(rwb);
+	}
+}
+
+void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
+{
+	if (rwb)
+		rwb->wc = write_cache_on;
+}
+
+void wbt_disable(struct rq_wb *rwb)
+{
+	if (rwb) {
+		del_timer_sync(&rwb->window_timer);
+		rwb->win_nsec = rwb->min_lat_nsec = 0;
+		wbt_update_limits(rwb);
+	}
+}
+EXPORT_SYMBOL_GPL(wbt_disable);
+
+struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
+		       void *ops_data)
+{
+	struct rq_wb *rwb;
+
+	if (!ops->get || !ops->is_current || !ops->clear)
+		return ERR_PTR(-EINVAL);
+
+	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+	if (!rwb)
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&rwb->inflight, 0);
+	init_waitqueue_head(&rwb->wait);
+	setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
+	rwb->wc = 1;
+	rwb->queue_depth = RWB_DEF_DEPTH;
+	rwb->last_comp = rwb->last_issue = jiffies;
+	rwb->bdi = bdi;
+	rwb->win_nsec = RWB_WINDOW_NSEC;
+	rwb->stat_ops = ops;
+	rwb->ops_data = ops_data;
+	wbt_update_limits(rwb);
+	return rwb;
+}
+
+void wbt_exit(struct rq_wb *rwb)
+{
+	if (rwb) {
+		del_timer_sync(&rwb->window_timer);
+		kfree(rwb);
+	}
+}
Index: linux-4.7-ck5/Documentation/block/queue-sysfs.txt
===================================================================
--- linux-4.7-ck5.orig/Documentation/block/queue-sysfs.txt	2016-09-23 08:35:54.709649282 +1000
+++ linux-4.7-ck5/Documentation/block/queue-sysfs.txt	2016-09-23 08:35:54.706649301 +1000
@@ -151,5 +151,18 @@ device state. This means that it might n
 setting from "write back" to "write through", since that will also
 eliminate cache flushes issued by the kernel.
 
+wbt_lat_usec (RW)
+-----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wbt_window_usec), then the writeback throttling will start
+scaling back writes.
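+
+For example, the 2 msec default that this patch picks for non-rotational
+devices could be set explicitly with:
+
+  # echo 2000 > /sys/block/<dev>/queue/wbt_lat_usec
+
+Writing 0 effectively disables throttling for the device.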
+
+wbt_window_usec (RW)
+--------------------
+If the device is registered for writeback throttling, then this file shows
+the value of the monitoring window in which we'll look at the target
+latency. See wbt_lat_usec.
+
 
 Jens Axboe <jens.axboe@oracle.com>, February 2009
Index: linux-4.7-ck5/block/Kconfig
===================================================================
--- linux-4.7-ck5.orig/block/Kconfig	2016-09-23 08:35:54.709649282 +1000
+++ linux-4.7-ck5/block/Kconfig	2016-09-23 08:35:54.707649295 +1000
@@ -4,6 +4,7 @@ menuconfig BLOCK
 	bool "Enable the block layer" if EXPERT
 	default y
+	select WBT
 	help
 	 Provide block layer support for the kernel.
Index: linux-4.7-ck5/block/cfq-iosched.c
===================================================================
--- linux-4.7-ck5.orig/block/cfq-iosched.c	2016-09-23 08:35:54.709649282 +1000
+++ linux-4.7-ck5/block/cfq-iosched.c	2016-09-23 08:35:54.707649295 +1000
@@ -3738,9 +3738,11 @@ static void check_blkcg_changed(struct c
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *cfqq;
 	uint64_t serial_nr;
+	bool nonroot_cg;
 
 	rcu_read_lock();
 	serial_nr = bio_blkcg(bio)->css.serial_nr;
+	nonroot_cg = bio_blkcg(bio) != &blkcg_root;
 	rcu_read_unlock();
 
 	/*
@@ -3751,6 +3753,17 @@ static void check_blkcg_changed(struct c
 		return;
 
 	/*
+	 * If we have a non-root cgroup, we can depend on that to
+	 * do proper throttling of writes. Turn off wbt for that
+	 * case.
+	 */
+	if (nonroot_cg) {
+		struct request_queue *q = cfqd->queue;
+
+		wbt_disable(q->rq_wb);
+	}
+
+	/*
 	 * Drop reference to queues. New queues will be assigned in new
 	 * group upon arrival of fresh requests.
	 */
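
As a footnote to the above: the admission decision in lib/wbt.c reduces to
a lock-free "increment if below the current limit" primitive. The sketch
below restates atomic_inc_below() with C11 stdatomic stand-ins for the
kernel atomics; it is illustrative only, not part of the patch:

	#include <stdatomic.h>
	#include <stdbool.h>

	/* Bump *v by one, but only if the current value is below 'below'. */
	static bool atomic_inc_below(atomic_int *v, int below)
	{
		int cur = atomic_load(v);

		for (;;) {
			if (cur >= below)
				return false;
			/* on failure, 'cur' is refreshed with the latest value */
			if (atomic_compare_exchange_weak(v, &cur, cur + 1))
				return true;
		}
	}

may_queue() calls this with get_limit() as the bound; a false return is what
puts the writer to sleep in __wbt_wait() until completions wake it.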