From 2f5c039340968e1fc0e1ca3235f31888c76a4873 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 17 May 2016 08:28:04 +0200 Subject: [PATCH 78/80] Turn BFQ-v7r11 into BFQ-v8r4 for 4.8.0 Signed-off-by: Paolo Valente --- block/Kconfig.iosched | 2 +- block/bfq-cgroup.c | 495 ++++---- block/bfq-iosched.c | 3230 +++++++++++++++++++++++++++++++------------------ block/bfq-sched.c | 480 ++++++-- block/bfq.h | 747 ++++++------ 5 files changed, 3073 insertions(+), 1881 deletions(-) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index f78cd1a..6d92579 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -53,7 +53,7 @@ config IOSCHED_BFQ config BFQ_GROUP_IOSCHED bool "BFQ hierarchical scheduling support" - depends on CGROUPS && IOSCHED_BFQ=y + depends on IOSCHED_BFQ && BLK_CGROUP default n ---help--- Enable hierarchical scheduling in BFQ, using the blkio controller. diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 0367996..b50ae8e 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -7,7 +7,9 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2010 Paolo Valente + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2016 Paolo Valente * * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ * file. @@ -163,8 +165,6 @@ static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) { struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); - BUG_ON(!pd); - return pd_to_bfqg(pd); } @@ -208,59 +208,49 @@ static void bfqg_put(struct bfq_group *bfqg) static void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - int rw) + int op, int op_flags) { - blkg_rwstat_add(&bfqg->stats.queued, rw, 1); + blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, 1); bfqg_stats_end_empty_time(&bfqg->stats); if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } -static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) -{ - blkg_rwstat_add(&bfqg->stats.queued, rw, -1); -} - -static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) +static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, + int op_flags) { - blkg_rwstat_add(&bfqg->stats.merged, rw, 1); + blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, -1); } -static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, - uint64_t bytes, int rw) +static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, + int op_flags) { - blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); + blkg_rwstat_add(&bfqg->stats.merged, op, op_flags, 1); } static void bfqg_stats_update_completion(struct bfq_group *bfqg, - uint64_t start_time, uint64_t io_start_time, int rw) + uint64_t start_time, uint64_t io_start_time, int op, + int op_flags) { struct bfqg_stats *stats = &bfqg->stats; unsigned long long now = sched_clock(); if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); + blkg_rwstat_add(&stats->service_time, op, op_flags, + now - io_start_time); if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, rw, + blkg_rwstat_add(&stats->wait_time, op, op_flags, io_start_time - start_time); } /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { - if (!stats) - return; - /* queued stats shouldn't be cleared */ - 
blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); blkg_stat_reset(&stats->time); - blkg_stat_reset(&stats->unaccounted_time); blkg_stat_reset(&stats->avg_queue_size_sum); blkg_stat_reset(&stats->avg_queue_size_samples); blkg_stat_reset(&stats->dequeue); @@ -270,19 +260,16 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) } /* @to += @from */ -static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) +static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) { if (!to || !from) return; /* queued stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); - blkg_rwstat_add_aux(&to->serviced, &from->serviced); blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); blkg_stat_add_aux(&from->time, &from->time); - blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); @@ -311,10 +298,8 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) if (unlikely(!parent)) return; - bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); - bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); + bfqg_stats_add_aux(&parent->stats, &bfqg->stats); bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); } static void bfq_init_entity(struct bfq_entity *entity, @@ -335,15 +320,11 @@ static void bfq_init_entity(struct bfq_entity *entity, static void bfqg_stats_exit(struct bfqg_stats *stats) { - blkg_rwstat_exit(&stats->service_bytes); - blkg_rwstat_exit(&stats->serviced); blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); blkg_rwstat_exit(&stats->queued); - blkg_stat_exit(&stats->sectors); blkg_stat_exit(&stats->time); - blkg_stat_exit(&stats->unaccounted_time); blkg_stat_exit(&stats->avg_queue_size_sum); blkg_stat_exit(&stats->avg_queue_size_samples); blkg_stat_exit(&stats->dequeue); @@ -354,15 +335,11 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { - if (blkg_rwstat_init(&stats->service_bytes, gfp) || - blkg_rwstat_init(&stats->serviced, gfp) || - blkg_rwstat_init(&stats->merged, gfp) || + if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || - blkg_stat_init(&stats->sectors, gfp) || blkg_stat_init(&stats->time, gfp) || - blkg_stat_init(&stats->unaccounted_time, gfp) || blkg_stat_init(&stats->avg_queue_size_sum, gfp) || blkg_stat_init(&stats->avg_queue_size_samples, gfp) || blkg_stat_init(&stats->dequeue, gfp) || @@ -386,11 +363,27 @@ static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); } +static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) +{ + struct bfq_group_data *bgd; + + bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); + if (!bgd) + return NULL; + return &bgd->pd; +} + static void bfq_cpd_init(struct blkcg_policy_data *cpd) { struct bfq_group_data *d = cpd_to_bfqgd(cpd); - d->weight = BFQ_DEFAULT_GRP_WEIGHT; + d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; +} + +static void bfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_bfqgd(cpd)); } static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) @@ -401,8 +394,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) if (!bfqg) return NULL; - if (bfqg_stats_init(&bfqg->stats, gfp) || - bfqg_stats_init(&bfqg->dead_stats, gfp)) { + if (bfqg_stats_init(&bfqg->stats, gfp)) { kfree(bfqg); return NULL; } @@ -410,27 +402,20 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) return &bfqg->pd; } -static void bfq_group_set_parent(struct bfq_group *bfqg, - struct bfq_group *parent) +static void bfq_pd_init(struct blkg_policy_data *pd) { + struct blkcg_gq *blkg; + struct bfq_group *bfqg; + struct bfq_data *bfqd; struct bfq_entity *entity; + struct bfq_group_data *d; - BUG_ON(!parent); - BUG_ON(!bfqg); - BUG_ON(bfqg == parent); - + blkg = pd_to_blkg(pd); + BUG_ON(!blkg); + bfqg = blkg_to_bfqg(blkg); + bfqd = blkg->q->elevator->elevator_data; entity = &bfqg->entity; - entity->parent = parent->my_entity; - entity->sched_data = &parent->sched_data; -} - -static void bfq_pd_init(struct blkg_policy_data *pd) -{ - struct blkcg_gq *blkg = pd_to_blkg(pd); - struct bfq_group *bfqg = blkg_to_bfqg(blkg); - struct bfq_data *bfqd = blkg->q->elevator->elevator_data; - struct bfq_entity *entity = &bfqg->entity; - struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); + d = blkcg_to_bfqgd(blkg->blkcg); entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; @@ -448,70 +433,53 @@ static void bfq_pd_free(struct blkg_policy_data *pd) struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_exit(&bfqg->stats); - bfqg_stats_exit(&bfqg->dead_stats); - return kfree(bfqg); } -/* offset delta from bfqg->stats to bfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - - offsetof(struct bfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +static void bfq_pd_reset_stats(struct blkg_policy_data *pd) { - u64 sum = 0; + struct bfq_group *bfqg = pd_to_bfqg(pd); - sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); - sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, - off + dead_stats_off_delta); - return sum; + bfqg_stats_reset(&bfqg->stats); } -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat -bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +static void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) { - struct blkg_rwstat a, b; + struct bfq_entity *entity; - a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); - b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, - off + dead_stats_off_delta); - blkg_rwstat_add_aux(&a, &b); - return a; + BUG_ON(!parent); + BUG_ON(!bfqg); + BUG_ON(bfqg == parent); + + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; } -static void bfq_pd_reset_stats(struct blkg_policy_data *pd) +static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, + struct blkcg *blkcg) { - struct bfq_group *bfqg = pd_to_bfqg(pd); + struct blkcg_gq *blkg; - bfqg_stats_reset(&bfqg->stats); - bfqg_stats_reset(&bfqg->dead_stats); + blkg = blkg_lookup(blkcg, bfqd->queue); + if (likely(blkg)) + 
return blkg_to_bfqg(blkg); + return NULL; } -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) { - struct request_queue *q = bfqd->queue; - struct bfq_group *bfqg = NULL, *parent; - struct bfq_entity *entity = NULL; + struct bfq_group *bfqg, *parent; + struct bfq_entity *entity; assert_spin_locked(bfqd->queue->queue_lock); - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - bfqg = bfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - bfqg = blkg_to_bfqg(blkg); - else /* fallback to root_group */ - bfqg = bfqd->root_group; - } + bfqg = bfq_lookup_bfqg(bfqd, blkcg); - BUG_ON(!bfqg); + if (unlikely(!bfqg)) + return NULL; /* * Update chain of bfq_groups as we might be handling a leaf group @@ -537,11 +505,15 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); + /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. * @bfqq: the queue to move. - * @entity: @bfqq's entity. * @bfqg: the group to move to. * * Move @bfqq to @bfqg, deactivating it from its old group and reactivating @@ -552,26 +524,40 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, * rcu_read_lock()). */ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_entity *entity, struct bfq_group *bfqg) + struct bfq_group *bfqg) { - int busy, resume; - - busy = bfq_bfqq_busy(bfqq); - resume = !RB_EMPTY_ROOT(&bfqq->sort_list); + struct bfq_entity *entity = &bfqq->entity; - BUG_ON(resume && !entity->on_st); - BUG_ON(busy && !resume && entity->on_st && + BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); + BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) + && entity->on_st && bfqq != bfqd->in_service_queue); + BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); - if (busy) { - BUG_ON(atomic_read(&bfqq->ref) < 2); + /* If bfqq is empty, then bfq_bfqq_expire also invokes + * bfq_del_bfqq_busy, thereby removing bfqq and its entity + * from data structures related to current group. Otherwise we + * need to remove bfqq explicitly with bfq_deactivate_bfqq, as + * we do below. 
+ */ + if (bfqq == bfqd->in_service_queue) + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQ_BFQQ_PREEMPTED); + + BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) + && &bfq_entity_service_tree(entity)->idle != + entity->tree); + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); - if (!resume) - bfq_del_bfqq_busy(bfqd, bfqq, 0); - else - bfq_deactivate_bfqq(bfqd, bfqq, 0); - } else if (entity->on_st) + if (bfq_bfqq_busy(bfqq)) + bfq_deactivate_bfqq(bfqd, bfqq, 0); + else if (entity->on_st) { + BUG_ON(&bfq_entity_service_tree(entity)->idle != + entity->tree); bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + } bfqg_put(bfqq_group(bfqq)); /* @@ -583,14 +569,17 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, entity->sched_data = &bfqg->sched_data; bfqg_get(bfqg); - if (busy) { + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); + if (bfq_bfqq_busy(bfqq)) { bfq_pos_tree_add_move(bfqd, bfqq); - if (resume) - bfq_activate_bfqq(bfqd, bfqq); + bfq_activate_bfqq(bfqd, bfqq); } if (!bfqd->in_service_queue && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); + BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) + && &bfq_entity_service_tree(entity)->idle != + entity->tree); } /** @@ -617,7 +606,11 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, lockdep_assert_held(bfqd->queue->queue_lock); - bfqg = bfq_find_alloc_group(bfqd, blkcg); + bfqg = bfq_find_set_group(bfqd, blkcg); + + if (unlikely(!bfqg)) + bfqg = bfqd->root_group; + if (async_bfqq) { entity = &async_bfqq->entity; @@ -625,7 +618,8 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, bic_set_bfqq(bic, NULL, 0); bfq_log_bfqq(bfqd, async_bfqq, "bic_change_group: %p %d", - async_bfqq, atomic_read(&async_bfqq->ref)); + async_bfqq, + async_bfqq->ref); bfq_put_queue(async_bfqq); } } @@ -633,7 +627,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, if (sync_bfqq) { entity = &sync_bfqq->entity; if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); } return bfqg; @@ -642,25 +636,23 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); - struct blkcg *blkcg; struct bfq_group *bfqg = NULL; - uint64_t id; + uint64_t serial_nr; rcu_read_lock(); - blkcg = bio_blkcg(bio); - id = blkcg->css.serial_nr; - rcu_read_unlock(); + serial_nr = bio_blkcg(bio)->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ - if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) - return; + if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) + goto out; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); - BUG_ON(!bfqg); - bic->blkcg_id = id; + bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); + bic->blkcg_serial_nr = serial_nr; +out: + rcu_read_unlock(); } /** @@ -686,7 +678,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); BUG_ON(!bfqq); - bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); + bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** @@ -717,11 +709,12 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, } /** - * bfq_destroy_group - destroy @bfqg. - * @bfqg: the group being destroyed. 
+ * bfq_pd_offline - deactivate the entity associated with @pd, + * and reparent its children entities. + * @pd: descriptor of the policy going offline. * - * Destroy @bfqg, making sure that it is not referenced from its parent. - * blkio already grabs the queue_lock for us, so no need to use RCU-based magic + * blkio already grabs the queue_lock for us, so no need to use + * RCU-based magic */ static void bfq_pd_offline(struct blkg_policy_data *pd) { @@ -780,6 +773,12 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) bfq_put_async_queues(bfqd, bfqg); BUG_ON(entity->tree); + /* + * @blkg is going offline and will be ignored by + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so + * that they don't get lost. If IOs complete after this point, the + * stats for them will be lost. Oh well... + */ bfqg_stats_xfer_dead(bfqg); } @@ -789,46 +788,35 @@ static void bfq_end_wr_async(struct bfq_data *bfqd) list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); + BUG_ON(!bfqg); bfq_end_wr_async_queues(bfqd, bfqg); } bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, - struct cftype *cftype) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - int ret = -EINVAL; - - spin_lock_irq(&blkcg->lock); - ret = bfqgd->weight; - spin_unlock_irq(&blkcg->lock); - - return ret; -} - -static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) +static int bfq_io_show_weight(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + unsigned int val = 0; - spin_lock_irq(&blkcg->lock); - seq_printf(sf, "%u\n", bfqgd->weight); - spin_unlock_irq(&blkcg->lock); + if (bfqgd) + val = bfqgd->weight; + + seq_printf(sf, "%u\n", val); return 0; } -static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, - struct cftype *cftype, - u64 val) +static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); struct blkcg_gq *blkg; - int ret = -EINVAL; + int ret = -ERANGE; if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) return ret; @@ -873,13 +861,18 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, return ret; } -static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) +static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { + u64 weight; /* First unsigned long found in the file is used */ - return bfqio_cgroup_weight_write(of_css(of), NULL, - simple_strtoull(strim(buf), NULL, 0)); + int ret = kstrtoull(strim(buf), 0, &weight); + + if (ret) + return ret; + + return bfq_io_set_weight_legacy(of_css(of), NULL, weight); } static int bfqg_print_stat(struct seq_file *sf, void *v) @@ -899,16 +892,17 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = bfqg_stat_pd_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, 
off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, + off); return __blkg_prfill_rwstat(sf, pd, &sum); } @@ -928,6 +922,41 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); + return 0; +} + +static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, + false); + return 0; +} + + static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -964,38 +993,15 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) return blkg_to_bfqg(bfqd->queue->root_blkg); } -static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -{ - struct bfq_group_data *bgd; - - bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); - if (!bgd) - return NULL; - return &bgd->pd; -} - -static void bfq_cpd_free(struct blkcg_policy_data *cpd) -{ - kfree(cpd_to_bfqgd(cpd)); -} - -static struct cftype bfqio_files_dfl[] = { +static struct cftype bfq_blkcg_legacy_files[] = { { - .name = "weight", + .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfqio_cgroup_weight_read_dfl, - .write = bfqio_cgroup_weight_write_dfl, + .seq_show = bfq_io_show_weight, + .write_u64 = bfq_io_set_weight_legacy, }, - {} /* terminate */ -}; -static struct cftype bfqio_files[] = { - { - .name = "bfq.weight", - .read_u64 = bfqio_cgroup_weight_read, - .write_u64 = bfqio_cgroup_weight_write, - }, - /* statistics, cover only the tasks in the bfqg */ + /* statistics, covers only the tasks in the bfqg */ { .name = "bfq.time", .private = offsetof(struct bfq_group, stats.time), @@ -1003,18 +1009,17 @@ static struct cftype bfqio_files[] = { }, { .name = "bfq.sectors", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat, + .seq_show = bfqg_print_stat_sectors, }, { .name = "bfq.io_service_bytes", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "bfq.io_serviced", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios, }, { .name = "bfq.io_service_time", @@ -1045,18 +1050,17 @@ static struct cftype bfqio_files[] = { }, { .name = "bfq.sectors_recursive", - .private = offsetof(struct bfq_group, stats.sectors), - .seq_show = bfqg_print_stat_recursive, + .seq_show = bfqg_print_stat_sectors_recursive, }, { .name = "bfq.io_service_bytes_recursive", - .private = offsetof(struct bfq_group, stats.service_bytes), - .seq_show = bfqg_print_rwstat_recursive, + .private = (unsigned 
long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "bfq.io_serviced_recursive", - .private = offsetof(struct bfq_group, stats.serviced), - .seq_show = bfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "bfq.io_service_time_recursive", @@ -1102,31 +1106,39 @@ static struct cftype bfqio_files[] = { .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, - { - .name = "bfq.unaccounted_time", - .private = offsetof(struct bfq_group, stats.unaccounted_time), - .seq_show = bfqg_print_stat, - }, { } /* terminate */ }; -static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfqio_files_dfl, - .legacy_cftypes = bfqio_files, - - .pd_alloc_fn = bfq_pd_alloc, - .pd_init_fn = bfq_pd_init, - .pd_offline_fn = bfq_pd_offline, - .pd_free_fn = bfq_pd_free, - .pd_reset_stats_fn = bfq_pd_reset_stats, - - .cpd_alloc_fn = bfq_cpd_alloc, - .cpd_init_fn = bfq_cpd_init, - .cpd_bind_fn = bfq_cpd_init, - .cpd_free_fn = bfq_cpd_free, +static struct cftype bfq_blkg_files[] = { + { + .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write = bfq_io_set_weight, + }, + {} /* terminate */ }; -#else +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, int op, int op_flags) { } +static inline void +bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, int op_flags) { } +static inline void +bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, int op_flags) { } +static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, int op, + int op_flags) { } +static inline void +bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) { } +static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } +static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } static void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) @@ -1150,27 +1162,20 @@ bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) return bfqd->root_group; } -static void bfq_bfqq_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_entity *entity, - struct bfq_group *bfqg) -{ -} - static void bfq_end_wr_async(struct bfq_data *bfqd) { bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static void bfq_disconnect_groups(struct bfq_data *bfqd) +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) { - bfq_put_async_queues(bfqd, bfqd->root_group); + return bfqd->root_group; } -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct blkcg *blkcg) +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) { - return bfqd->root_group; + return bfqq->bfqd->root_group; } static struct bfq_group * diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cf3e9b1..eef6ff4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7,25 +7,28 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2010 Paolo Valente + * Copyright (C) 2015 Paolo 
Valente + * + * Copyright (C) 2016 Paolo Valente * * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ * file. * - * BFQ is a proportional-share storage-I/O scheduling algorithm based on - * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, - * measured in number of sectors, to processes instead of time slices. The - * device is not granted to the in-service process for a given time slice, - * but until it has exhausted its assigned budget. This change from the time - * to the service domain allows BFQ to distribute the device throughput - * among processes as desired, without any distortion due to ZBR, workload - * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, - * called B-WF2Q+, to schedule processes according to their budgets. More - * precisely, BFQ schedules queues associated to processes. Thanks to the - * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to - * I/O-bound processes issuing sequential requests (to boost the - * throughput), and yet guarantee a low latency to interactive and soft - * real-time applications. + * BFQ is a proportional-share storage-I/O scheduling algorithm based + * on the slice-by-slice service scheme of CFQ. But BFQ assigns + * budgets, measured in number of sectors, to processes instead of + * time slices. The device is not granted to the in-service process + * for a given time slice, but until it has exhausted its assigned + * budget. This change from the time to the service domain enables BFQ + * to distribute the device throughput among processes as desired, + * without any distortion due to throughput fluctuations, or to device + * internal queueing. BFQ uses an ad hoc internal scheduler, called + * B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated with processes. Thanks to + * the accurate policy of B-WF2Q+, BFQ can afford to assign high + * budgets to I/O-bound processes issuing sequential requests (to + * boost the throughput), and yet guarantee a low latency to + * interactive and soft real-time applications. * * BFQ is described in [1], where also a reference to the initial, more * theoretical paper on BFQ can be found. The interested reader can find @@ -70,8 +73,8 @@ #include "bfq.h" #include "blk.h" -/* Expiration time of sync (0) and async (1) requests, in jiffies. */ -static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; +/* Expiration time of sync (0) and async (1) requests, in ns. */ +static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; /* Maximum backwards seek, in KiB. */ static const int bfq_back_max = 16 * 1024; @@ -79,15 +82,14 @@ static const int bfq_back_max = 16 * 1024; /* Penalty of a backwards seek, in number of sectors. */ static const int bfq_back_penalty = 2; -/* Idling period duration, in jiffies. */ -static int bfq_slice_idle = HZ / 125; +/* Idling period duration, in ns. */ +static u32 bfq_slice_idle = NSEC_PER_SEC / 125; /* Minimum number of assigned budgets for which stats are safe to compute. */ static const int bfq_stats_min_budgets = 194; /* Default maximum budget values, in sectors and number of requests. */ static const int bfq_default_max_budget = 16 * 1024; -static const int bfq_max_budget_async_rq = 4; /* * Async to sync throughput distribution is controlled as follows: @@ -97,23 +99,27 @@ static const int bfq_max_budget_async_rq = 4; static const int bfq_async_charge_factor = 10; /* Default timeout values, in jiffies, approximating CFQ defaults. 
*/ -static const int bfq_timeout_sync = HZ / 8; -static int bfq_timeout_async = HZ / 25; +static const int bfq_timeout = HZ / 8; struct kmem_cache *bfq_pool; -/* Below this threshold (in ms), we consider thinktime immediate. */ -#define BFQ_MIN_TT 2 +/* Below this threshold (in ns), we consider thinktime immediate. */ +#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) /* hw_tag detection: parallel requests threshold and min samples needed. */ #define BFQ_HW_QUEUE_THRESHOLD 4 #define BFQ_HW_QUEUE_SAMPLES 32 -#define BFQQ_SEEK_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) +#define BFQQ_SEEK_THR (sector_t)(8 * 100) +#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) +#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -/* Min samples used for peak rate estimation (for autotuning). */ -#define BFQ_PEAK_RATE_SAMPLES 32 +/* Min number of samples required to perform peak-rate update */ +#define BFQ_RATE_MIN_SAMPLES 32 +/* Min observation time interval required to perform a peak-rate update (ns) */ +#define BFQ_RATE_MIN_INTERVAL 300*NSEC_PER_MSEC +/* Target observation time interval for a peak-rate update (ns) */ +#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC /* Shift used for peak rate fixed precision calculations. */ #define BFQ_RATE_SHIFT 16 @@ -141,16 +147,24 @@ struct kmem_cache *bfq_pool; * The device's speed class is dynamically (re)detected in * bfq_update_peak_rate() every time the estimated peak rate is updated. * - * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] - * are the reference values for a slow/fast rotational device, whereas - * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for - * a slow/fast non-rotational device. Finally, device_speed_thresh are the - * thresholds used to switch between speed classes. + * In the following definitions, R_slow[0]/R_fast[0] and + * T_slow[0]/T_fast[0] are the reference values for a slow/fast + * rotational device, whereas R_slow[1]/R_fast[1] and + * T_slow[1]/T_fast[1] are the reference values for a slow/fast + * non-rotational device. Finally, device_speed_thresh are the + * thresholds used to switch between speed classes. The reference + * rates are not the actual peak rates of the devices used as a + * reference, but slightly lower values. The reason for using these + * slightly lower values is that the peak-rate estimator tends to + * yield slightly lower values than the actual peak rate (it can yield + * the actual peak rate only if there is only one process doing I/O, + * and the process does sequential I/O). + * * Both the reference peak rates and the thresholds are measured in * sectors/usec, left-shifted by BFQ_RATE_SHIFT. 
*/ -static int R_slow[2] = {1536, 10752}; -static int R_fast[2] = {17415, 34791}; +static int R_slow[2] = {1000, 10700}; +static int R_fast[2] = {14000, 33000}; /* * To improve readability, a conversion function is used to initialize the * following arrays, which entails that they can be initialized only in a @@ -183,10 +197,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd); */ static int bfq_bio_sync(struct bio *bio) { - if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) - return 1; - - return 0; + return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC); } /* @@ -409,11 +420,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) */ static bool bfq_symmetric_scenario(struct bfq_data *bfqd) { - return -#ifdef CONFIG_BFQ_GROUP_IOSCHED - !bfqd->active_numerous_groups && -#endif - !bfq_differentiated_weights(bfqd); + return !bfq_differentiated_weights(bfqd); } /* @@ -533,9 +540,19 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, static unsigned long bfq_serv_to_charge(struct request *rq, struct bfq_queue *bfqq) { - return blk_rq_sectors(rq) * - (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * - bfq_async_charge_factor)); + if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) + return blk_rq_sectors(rq); + + /* + * If there are no weight-raised queues, then amplify service + * by just the async charge factor; otherwise amplify service + * by twice the async charge factor, to further reduce latency + * for weight-raised queues. + */ + if (bfqq->bfqd->wr_busy_queues == 0) + return blk_rq_sectors(rq) * bfq_async_charge_factor; + + return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; } /** @@ -590,12 +607,23 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) dur = bfqd->RT_prod; do_div(dur, bfqd->peak_rate); - return dur; -} + /* + * Limit duration between 3 and 13 seconds. Tests show that + * higher values than 13 seconds often yield the opposite of + * the desired result, i.e., worsen responsiveness by letting + * non-interactive and non-soft-real-time applications + * preserve weight raising for a too long time interval. + * + * On the other end, lower values than 3 seconds make it + * difficult for most interactive tasks to complete their jobs + * before weight-raising finishes. + */ + if (dur > msecs_to_jiffies(13000)) + dur = msecs_to_jiffies(13000); + else if (dur < msecs_to_jiffies(3000)) + dur = msecs_to_jiffies(3000); -static unsigned int bfq_bfqq_cooperations(struct bfq_queue *bfqq) -{ - return bfqq->bic ? bfqq->bic->cooperations : 0; + return dur; } static void @@ -605,31 +633,28 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfq_mark_bfqq_idle_window(bfqq); else bfq_clear_bfqq_idle_window(bfqq); + if (bic->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); else bfq_clear_bfqq_IO_bound(bfqq); - /* Assuming that the flag in_large_burst is already correctly set */ - if (bic->wr_time_left && bfqq->bfqd->low_latency && - !bfq_bfqq_in_large_burst(bfqq) && - bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { - /* - * Start a weight raising period with the duration given by - * the raising_time_left snapshot. 
- */ - if (bfq_bfqq_busy(bfqq)) - bfqq->bfqd->wr_busy_queues++; - bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bic->wr_time_left; - bfqq->last_wr_start_finish = jiffies; - bfqq->entity.prio_changed = 1; + + bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); + bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time))) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr"); + + bfqq->wr_coeff = 1; } - /* - * Clear wr_time_left to prevent bfq_bfqq_save_state() from - * getting confused about the queue's need of a weight-raising - * period. - */ - bic->wr_time_left = 0; + /* make sure weight will be updated, however we got here */ + bfqq->entity.prio_changed = 1; } static int bfqq_process_refs(struct bfq_queue *bfqq) @@ -639,7 +664,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) lockdep_assert_held(bfqq->bfqd->queue->queue_lock); io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; - process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; + process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; BUG_ON(process_refs < 0); return process_refs; } @@ -654,6 +679,7 @@ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) hlist_del_init(&item->burst_list_node); hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); bfqd->burst_size = 1; + bfqd->burst_parent_entity = bfqq->entity.parent; } /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ @@ -662,6 +688,10 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* Increment burst size to take into account also bfqq */ bfqd->burst_size++; + bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); + + BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); + if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { struct bfq_queue *pos, *bfqq_item; struct hlist_node *n; @@ -671,15 +701,19 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * other to consider this burst as large. */ bfqd->large_burst = true; + bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); /* * We can now mark all queues in the burst list as * belonging to a large burst. */ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, - burst_list_node) + burst_list_node) { bfq_mark_bfqq_in_large_burst(bfqq_item); + bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); + } bfq_mark_bfqq_in_large_burst(bfqq); + bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); /* * From now on, and until the current burst finishes, any @@ -691,67 +725,79 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, burst_list_node) hlist_del_init(&pos->burst_list_node); - } else /* burst not yet large: add bfqq to the burst list */ + } else /* + * Burst not yet large: add bfqq to the burst list. Do + * not increment the ref counter for bfqq, because bfqq + * is removed from the burst list before freeing bfqq + * in put_queue. 
+ */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } /* - * If many queues happen to become active shortly after each other, then, - * to help the processes associated to these queues get their job done as - * soon as possible, it is usually better to not grant either weight-raising - * or device idling to these queues. In this comment we describe, firstly, - * the reasons why this fact holds, and, secondly, the next function, which - * implements the main steps needed to properly mark these queues so that - * they can then be treated in a different way. + * If many queues belonging to the same group happen to be created + * shortly after each other, then the processes associated with these + * queues have typically a common goal. In particular, bursts of queue + * creations are usually caused by services or applications that spawn + * many parallel threads/processes. Examples are systemd during boot, + * or git grep. To help these processes get their job done as soon as + * possible, it is usually better to not grant either weight-raising + * or device idling to their queues. * - * As for the terminology, we say that a queue becomes active, i.e., - * switches from idle to backlogged, either when it is created (as a - * consequence of the arrival of an I/O request), or, if already existing, - * when a new request for the queue arrives while the queue is idle. - * Bursts of activations, i.e., activations of different queues occurring - * shortly after each other, are typically caused by services or applications - * that spawn or reactivate many parallel threads/processes. Examples are - * systemd during boot or git grep. + * In this comment we describe, firstly, the reasons why this fact + * holds, and, secondly, the next function, which implements the main + * steps needed to properly mark these queues so that they can then be + * treated in a different way. * - * These services or applications benefit mostly from a high throughput: - * the quicker the requests of the activated queues are cumulatively served, - * the sooner the target job of these queues gets completed. As a consequence, - * weight-raising any of these queues, which also implies idling the device - * for it, is almost always counterproductive: in most cases it just lowers - * throughput. + * The above services or applications benefit mostly from a high + * throughput: the quicker the requests of the activated queues are + * cumulatively served, the sooner the target job of these queues gets + * completed. As a consequence, weight-raising any of these queues, + * which also implies idling the device for it, is almost always + * counterproductive. In most cases it just lowers throughput. * - * On the other hand, a burst of activations may be also caused by the start - * of an application that does not consist in a lot of parallel I/O-bound - * threads. In fact, with a complex application, the burst may be just a - * consequence of the fact that several processes need to be executed to - * start-up the application. To start an application as quickly as possible, - * the best thing to do is to privilege the I/O related to the application - * with respect to all other I/O. Therefore, the best strategy to start as - * quickly as possible an application that causes a burst of activations is - * to weight-raise all the queues activated during the burst. 
This is the + * On the other hand, a burst of queue creations may be caused also by + * the start of an application that does not consist of a lot of + * parallel I/O-bound threads. In fact, with a complex application, + * several short processes may need to be executed to start-up the + * application. In this respect, to start an application as quickly as + * possible, the best thing to do is in any case to privilege the I/O + * related to the application with respect to all other + * I/O. Therefore, the best strategy to start as quickly as possible + * an application that causes a burst of queue creations is to + * weight-raise all the queues created during the burst. This is the * exact opposite of the best strategy for the other type of bursts. * - * In the end, to take the best action for each of the two cases, the two - * types of bursts need to be distinguished. Fortunately, this seems - * relatively easy to do, by looking at the sizes of the bursts. In - * particular, we found a threshold such that bursts with a larger size - * than that threshold are apparently caused only by services or commands - * such as systemd or git grep. For brevity, hereafter we call just 'large' - * these bursts. BFQ *does not* weight-raise queues whose activations occur - * in a large burst. In addition, for each of these queues BFQ performs or - * does not perform idling depending on which choice boosts the throughput - * most. The exact choice depends on the device and request pattern at + * In the end, to take the best action for each of the two cases, the + * two types of bursts need to be distinguished. Fortunately, this + * seems relatively easy, by looking at the sizes of the bursts. In + * particular, we found a threshold such that only bursts with a + * larger size than that threshold are apparently caused by + * services or commands such as systemd or git grep. For brevity, + * hereafter we call just 'large' these bursts. BFQ *does not* + * weight-raise queues whose creation occurs in a large burst. In + * addition, for each of these queues BFQ performs or does not perform + * idling depending on which choice boosts the throughput more. The + * exact choice depends on the device and request pattern at * hand. * - * Turning back to the next function, it implements all the steps needed - * to detect the occurrence of a large burst and to properly mark all the - * queues belonging to it (so that they can then be treated in a different - * way). This goal is achieved by maintaining a special "burst list" that - * holds, temporarily, the queues that belong to the burst in progress. The - * list is then used to mark these queues as belonging to a large burst if - * the burst does become large. The main steps are the following. + * Unfortunately, false positives may occur while an interactive task + * is starting (e.g., an application is being started). The + * consequence is that the queues associated with the task do not + * enjoy weight raising as expected. Fortunately these false positives + * are very rare. They typically occur if some service happens to + * start doing I/O exactly when the interactive task starts. * - * . when the very first queue is activated, the queue is inserted into the + * Turning back to the next function, it implements all the steps + * needed to detect the occurrence of a large burst and to properly + * mark all the queues belonging to it (so that they can then be + * treated in a different way). 
This goal is achieved by maintaining a + * "burst list" that holds, temporarily, the queues that belong to the + * burst in progress. The list is then used to mark these queues as + * belonging to a large burst if the burst does become large. The main + * steps are the following. + * + * . when the very first queue is created, the queue is inserted into the * list (as it could be the first queue in a possible burst) * * . if the current burst has not yet become large, and a queue Q that does @@ -772,13 +818,13 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * * . the device enters a large-burst mode * - * . if a queue Q that does not belong to the burst is activated while + * . if a queue Q that does not belong to the burst is created while * the device is in large-burst mode and shortly after the last time * at which a queue either entered the burst list or was marked as * belonging to the current large burst, then Q is immediately marked * as belonging to a large burst. * - * . if a queue Q that does not belong to the burst is activated a while + * . if a queue Q that does not belong to the burst is created a while * later, i.e., not shortly after, than the last time at which a queue * either entered the burst list or was marked as belonging to the * current large burst, then the current burst is deemed as finished and: @@ -791,52 +837,44 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) * in a possible new burst (then the burst list contains just Q * after this step). */ -static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool idle_for_long_time) +static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* - * If bfqq happened to be activated in a burst, but has been idle - * for at least as long as an interactive queue, then we assume - * that, in the overall I/O initiated in the burst, the I/O - * associated to bfqq is finished. So bfqq does not need to be - * treated as a queue belonging to a burst anymore. Accordingly, - * we reset bfqq's in_large_burst flag if set, and remove bfqq - * from the burst list if it's there. We do not decrement instead - * burst_size, because the fact that bfqq does not need to belong - * to the burst list any more does not invalidate the fact that - * bfqq may have been activated during the current burst. - */ - if (idle_for_long_time) { - hlist_del_init(&bfqq->burst_list_node); - bfq_clear_bfqq_in_large_burst(bfqq); - } - - /* * If bfqq is already in the burst list or is part of a large - * burst, then there is nothing else to do. + * burst, or finally has just been split, then there is + * nothing else to do. */ if (!hlist_unhashed(&bfqq->burst_list_node) || - bfq_bfqq_in_large_burst(bfqq)) + bfq_bfqq_in_large_burst(bfqq) || + time_is_after_eq_jiffies(bfqq->split_time + + msecs_to_jiffies(10))) return; /* - * If bfqq's activation happens late enough, then the current - * burst is finished, and related data structures must be reset. + * If bfqq's creation happens late enough, or bfqq belongs to + * a different group than the burst group, then the current + * burst is finished, and related data structures must be + * reset. * - * In this respect, consider the special case where bfqq is the very - * first queue being activated. In this case, last_ins_in_burst is - * not yet significant when we get here. But it is easy to verify - * that, whether or not the following condition is true, bfqq will - * end up being inserted into the burst list. 
In particular the - * list will happen to contain only bfqq. And this is exactly what - * has to happen, as bfqq may be the first queue in a possible + * In this respect, consider the special case where bfqq is + * the very first queue created after BFQ is selected for this + * device. In this case, last_ins_in_burst and + * burst_parent_entity are not yet significant when we get + * here. But it is easy to verify that, whether or not the + * following condition is true, bfqq will end up being + * inserted into the burst list. In particular the list will + * happen to contain only bfqq. And this is exactly what has + * to happen, as bfqq may be the first queue of the first * burst. */ if (time_is_before_jiffies(bfqd->last_ins_in_burst + - bfqd->bfq_burst_interval)) { + bfqd->bfq_burst_interval) || + bfqq->entity.parent != bfqd->burst_parent_entity) { bfqd->large_burst = false; bfq_reset_burst_list(bfqd, bfqq); - return; + bfq_log_bfqq(bfqd, bfqq, + "handle_burst: late activation or different group"); + goto end; } /* @@ -845,8 +883,9 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, * bfqq as belonging to this large burst immediately. */ if (bfqd->large_burst) { + bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); bfq_mark_bfqq_in_large_burst(bfqq); - return; + goto end; } /* @@ -855,25 +894,491 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, * queue. Then we add bfqq to the burst. */ bfq_add_to_burst(bfqd, bfqq); +end: + /* + * At this point, bfqq either has been added to the current + * burst or has caused the current burst to terminate and a + * possible new burst to start. In particular, in the second + * case, bfqq has become the first queue in the possible new + * burst. In both cases last_ins_in_burst needs to be moved + * forward. + */ + bfqd->last_ins_in_burst = jiffies; + +} + +static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + return entity->budget - entity->service; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static int bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static int bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); + +/* + * The next function, invoked after the input queue bfqq switches from + * idle to busy, updates the budget of bfqq. The function also tells + * whether the in-service queue should be expired, by returning + * true. The purpose of expiring the in-service queue is to give bfqq + * the chance to possibly preempt the in-service queue, and the reason + * for preempting the in-service queue is to achieve one of the two + * goals below. + * + * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has + * expired because it has remained idle. 
In particular, bfqq may have + * expired for one of the following two reasons: + * + * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and + * did not make it to issue a new request before its last request + * was served; + * + * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue + * a new request before the expiration of the idling-time. + * + * Even if bfqq has expired for one of the above reasons, the process + * associated with the queue may be however issuing requests greedily, + * and thus be sensitive to the bandwidth it receives (bfqq may have + * remained idle for other reasons: CPU high load, bfqq not enjoying + * idling, I/O throttling somewhere in the path from the process to + * the I/O scheduler, ...). But if, after every expiration for one of + * the above two reasons, bfqq has to wait for the service of at least + * one full budget of another queue before being served again, then + * bfqq is likely to get a much lower bandwidth or resource time than + * its reserved ones. To address this issue, two countermeasures need + * to be taken. + * + * First, the budget and the timestamps of bfqq need to be updated in + * a special way on bfqq reactivation: they need to be updated as if + * bfqq did not remain idle and did not expire. In fact, if they are + * computed as if bfqq expired and remained idle until reactivation, + * then the process associated with bfqq is treated as if, instead of + * being greedy, it stopped issuing requests when bfqq remained idle, + * and restarts issuing requests only on this reactivation. In other + * words, the scheduler does not help the process recover the "service + * hole" between bfqq expiration and reactivation. As a consequence, + * the process receives a lower bandwidth than its reserved one. In + * contrast, to recover this hole, the budget must be updated as if + * bfqq was not expired at all before this reactivation, i.e., it must + * be set to the value of the remaining budget when bfqq was + * expired. Along the same line, timestamps need to be assigned the + * value they had the last time bfqq was selected for service, i.e., + * before last expiration. Thus timestamps need to be back-shifted + * with respect to their normal computation (see [1] for more details + * on this tricky aspect). + * + * Secondly, to allow the process to recover the hole, the in-service + * queue must be expired too, to give bfqq the chance to preempt it + * immediately. In fact, if bfqq has to wait for a full budget of the + * in-service queue to be completed, then it may become impossible to + * let the process recover the hole, even if the back-shifted + * timestamps of bfqq are lower than those of the in-service queue. If + * this happens for most or all of the holes, then the process may not + * receive its reserved bandwidth. In this respect, it is worth noting + * that, being the service of outstanding requests unpreemptible, a + * little fraction of the holes may however be unrecoverable, thereby + * causing a little loss of bandwidth. + * + * The last important point is detecting whether bfqq does need this + * bandwidth recovery. In this respect, the next function deems the + * process associated with bfqq greedy, and thus allows it to recover + * the hole, if: 1) the process is waiting for the arrival of a new + * request (which implies that bfqq expired for one of the above two + * reasons), and 2) such a request has arrived soon. 
The first + * condition is controlled through the flag non_blocking_wait_rq, + * while the second through the flag arrived_in_time. If both + * conditions hold, then the function computes the budget in the + * above-described special way, and signals that the in-service queue + * should be expired. Timestamp back-shifting is done later in + * __bfq_activate_entity. + * + * 2. Reduce latency. Even if timestamps are not backshifted to let + * the process associated with bfqq recover a service hole, bfqq may + * however happen to have, after being (re)activated, a lower finish + * timestamp than the in-service queue. That is, the next budget of + * bfqq may have to be completed before the one of the in-service + * queue. If this is the case, then preempting the in-service queue + * allows this goal to be achieved, apart from the unpreemptible, + * outstanding requests mentioned above. + * + * Unfortunately, regardless of which of the above two goals one wants + * to achieve, service trees need first to be updated to know whether + * the in-service queue must be preempted. To have service trees + * correctly updated, the in-service queue must be expired and + * rescheduled, and bfqq must be scheduled too. This is one of the + * most costly operations (in future versions, the scheduling + * mechanism may be re-designed in such a way to make it possible to + * know whether preemption is needed without needing to update service + * trees). In addition, queue preemptions almost always cause random + * I/O, and thus loss of throughput. Because of these facts, the next + * function adopts the following simple scheme to avoid both costly + * operations and too frequent preemptions: it requests the expiration + * of the in-service queue (unconditionally) only for queues that need + * to recover a hole, or that either are weight-raised or deserve to + * be weight-raised. + */ +static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool arrived_in_time, + bool wr_or_deserves_wr) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { + /* + * We do not clear the flag non_blocking_wait_rq here, as + * the latter is used in bfq_activate_bfqq to signal + * that timestamps need to be back-shifted (and is + * cleared right after). + */ + + /* + * In next assignment we rely on that either + * entity->service or entity->budget are not updated + * on expiration if bfqq is empty (see + * __bfq_bfqq_recalc_budget). Thus both quantities + * remain unchanged after such an expiration, and the + * following statement therefore assigns to + * entity->budget the remaining budget on such an + * expiration. For clarity, entity->service is not + * updated on expiration in any case, and, in normal + * operation, is reset only when bfqq is selected for + * service (see bfq_get_next_queue). 
+ */ + BUG_ON(bfqq->max_budget < 0); + entity->budget = min_t(unsigned long, + bfq_bfqq_budget_left(bfqq), + bfqq->max_budget); + + BUG_ON(entity->budget < 0); + return true; + } + + BUG_ON(bfqq->max_budget < 0); + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(bfqq->next_rq, bfqq)); + BUG_ON(entity->budget < 0); + + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); + return wr_or_deserves_wr; +} + +static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + unsigned int old_wr_coeff, + bool wr_or_deserves_wr, + bool interactive, + bool in_burst, + bool soft_rt) +{ + if (old_wr_coeff == 1 && wr_or_deserves_wr) { + /* start a weight-raising period */ + if (interactive) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else { + bfqq->wr_start_at_switch_to_srt = jiffies; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + } + /* + * If needed, further reduce budget to make sure it is + * close to bfqq's backlog, so as to reduce the + * scheduling-error component due to a too large + * budget. Do not care about throughput consequences, + * but only about latency. Finally, do not assign a + * too small budget either, to avoid increasing + * latency by causing too frequent expirations. + */ + bfqq->entity.budget = min_t(unsigned long, + bfqq->entity.budget, + 2 * bfq_min_budget(bfqd)); + + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } else if (old_wr_coeff > 1) { + if (interactive) { /* update wr coeff and duration */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else if (in_burst) { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq-> + wr_cur_max_time)); + } else if (soft_rt) { + /* + * The application is now or still meeting the + * requirements for being deemed soft rt. We + * can then correctly and safely (re)charge + * the weight-raising duration for the + * application with the weight-raising + * duration for soft rt applications. + * + * In particular, doing this recharge now, i.e., + * before the weight-raising period for the + * application finishes, reduces the probability + * of the following negative scenario: + * 1) the weight of a soft rt application is + * raised at startup (as for any newly + * created application), + * 2) since the application is not interactive, + * at a certain time weight-raising is + * stopped for the application, + * 3) at that time the application happens to + * still have pending requests, and hence + * is destined to not have a chance to be + * deemed soft rt before these requests are + * completed (see the comments to the + * function bfq_bfqq_softrt_next_start() + * for details on soft rt detection), + * 4) these pending requests experience a high + * latency because the application is not + * weight-raised while they are pending. 
+ */ + if (bfqq->wr_cur_max_time != + bfqd->bfq_wr_rt_max_time) { + bfqq->wr_start_at_switch_to_srt = + bfqq->last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfq_log_bfqq(bfqd, bfqq, + "switching to soft_rt wr"); + } else + bfq_log_bfqq(bfqd, bfqq, + "moving forward soft_rt wr duration"); + bfqq->last_wr_start_finish = jiffies; + } + } +} + +static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return bfqq->dispatched == 0 && + time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); +} + +static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + int old_wr_coeff, + struct request *rq, + bool *interactive) +{ + bool soft_rt, in_burst, wr_or_deserves_wr, + bfqq_wants_to_preempt, + idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), + /* + * See the comments on + * bfq_bfqq_update_budg_for_activation for + * details on the usage of the next variable. + */ + arrived_in_time = ktime_get_ns() <= + RQ_BIC(rq)->ttime.last_end_request + + bfqd->bfq_slice_idle * 3; + + bfq_log_bfqq(bfqd, bfqq, + "bfq_add_request non-busy: " + "jiffies %lu, in_time %d, idle_long %d busyw %d " + "wr_coeff %u", + jiffies, arrived_in_time, + idle_for_long_time, + bfq_bfqq_non_blocking_wait_rq(bfqq), + old_wr_coeff); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + BUG_ON(bfqq == bfqd->in_service_queue); + bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, + req_op(rq), rq->cmd_flags); + + /* + * bfqq deserves to be weight-raised if: + * - it is sync, + * - it does not belong to a large burst, + * - it has been idle for enough time or is soft real-time, + * - is linked to a bfq_io_cq (it is not shared in any sense) + */ + in_burst = bfq_bfqq_in_large_burst(bfqq); + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + !in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); + *interactive = + !in_burst && + idle_for_long_time; + wr_or_deserves_wr = bfqd->low_latency && + (bfqq->wr_coeff > 1 || + (bfq_bfqq_sync(bfqq) && + bfqq->bic && (*interactive || soft_rt))); + + bfq_log_bfqq(bfqd, bfqq, + "bfq_add_request: " + "in_burst %d, " + "soft_rt %d (next %lu), inter %d, bic %p", + bfq_bfqq_in_large_burst(bfqq), soft_rt, + bfqq->soft_rt_next_start, + *interactive, + bfqq->bic); + + /* + * Using the last flag, update budget and check whether bfqq + * may want to preempt the in-service queue. + */ + bfqq_wants_to_preempt = + bfq_bfqq_update_budg_for_activation(bfqd, bfqq, + arrived_in_time, + wr_or_deserves_wr); + + /* + * If bfqq happened to be activated in a burst, but has been + * idle for much more than an interactive queue, then we + * assume that, in the overall I/O initiated in the burst, the + * I/O associated with bfqq is finished. So bfqq does not need + * to be treated as a queue belonging to a burst + * anymore. Accordingly, we reset bfqq's in_large_burst flag + * if set, and remove bfqq from the burst list if it's + * there. We do not decrement burst_size, because the fact + * that bfqq does not need to belong to the burst list any + * more does not invalidate the fact that bfqq was created in + * a burst. 
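For intuition about what raising wr_coeff in the code above buys, here is a rough user-space sketch of how a raised weight translates into a larger share of the throughput under weighted fair queueing. The coefficient and weight values are placeholders, not the constants used by the patch:

	#include <stdio.h>

	/* Effective weight of a queue while it is weight-raised. */
	static unsigned int effective_weight(unsigned int orig_weight,
					     unsigned int wr_coeff)
	{
		return orig_weight * wr_coeff;
	}

	int main(void)
	{
		/* two queues with the same original weight */
		unsigned int w_raised = effective_weight(100, 30); /* wr_coeff > 1 */
		unsigned int w_normal = effective_weight(100, 1);  /* not raised   */
		unsigned int total = w_raised + w_normal;

		/* the share of throughput is proportional to the effective weight */
		printf("raised: %u%%  normal: %u%%\n",
		       100 * w_raised / total, 100 * w_normal / total);
		return 0;
	}

While the raised period lasts, the raised queue is scheduled almost as if it were alone, which is what keeps its latency low.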
+ */ + if (likely(!bfq_bfqq_just_created(bfqq)) && + idle_for_long_time && + time_is_before_jiffies( + bfqq->budget_timeout + + msecs_to_jiffies(10000))) { + hlist_del_init(&bfqq->burst_list_node); + bfq_clear_bfqq_in_large_burst(bfqq); + } + + bfq_clear_bfqq_just_created(bfqq); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (arrived_in_time) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + bfq_log_bfqq(bfqd, bfqq, "requests in time %d", + bfqq->requests_within_timer); + } + + if (bfqd->low_latency) { + if (unlikely(time_is_after_jiffies(bfqq->split_time))) + /* wraparound */ + bfqq->split_time = + jiffies - bfqd->bfq_wr_min_idle_time - 1; + + if (time_is_before_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) { + bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, + old_wr_coeff, + wr_or_deserves_wr, + *interactive, + in_burst, + soft_rt); + + if (old_wr_coeff != bfqq->wr_coeff) + bfqq->entity.prio_changed = 1; + } + } + + bfqq->last_idle_bklogged = jiffies; + bfqq->service_from_backlogged = 0; + bfq_clear_bfqq_softrt_update(bfqq); + + bfq_add_bfqq_busy(bfqd, bfqq); + + /* + * Expire in-service queue only if preemption may be needed + * for guarantees. In this respect, the function + * next_queue_may_preempt just checks a simple, necessary + * condition, and not a sufficient condition based on + * timestamps. In fact, for the latter condition to be + * evaluated, timestamps would need first to be updated, and + * this operation is quite costly (see the comments on the + * function bfq_bfqq_update_budg_for_activation). + */ + if (bfqd->in_service_queue && bfqq_wants_to_preempt && + bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && + next_queue_may_preempt(bfqd)) { + struct bfq_queue *in_serv = + bfqd->in_service_queue; + BUG_ON(in_serv == bfqq); + + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQ_BFQQ_PREEMPTED); + BUG_ON(in_serv->entity.budget < 0); + } } static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_entity *entity = &bfqq->entity; struct bfq_data *bfqd = bfqq->bfqd; struct request *next_rq, *prev; - unsigned long old_wr_coeff = bfqq->wr_coeff; + unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; - bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); + bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", + blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); + + if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + bfqq->queued[rq_is_sync(rq)]++; bfqd->queued++; elv_rb_add(&bfqq->sort_list, rq); /* - * Check if this request is a better next-serve candidate. + * Check if this request is a better next-to-serve candidate. 
*/ prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); @@ -886,160 +1391,10 @@ static void bfq_add_request(struct request *rq) if (prev != bfqq->next_rq) bfq_pos_tree_add_move(bfqd, bfqq); - if (!bfq_bfqq_busy(bfqq)) { - bool soft_rt, coop_or_in_burst, - idle_for_long_time = time_is_before_jiffies( - bfqq->budget_timeout + - bfqd->bfq_wr_min_idle_time); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, - rq->cmd_flags); -#endif - if (bfq_bfqq_sync(bfqq)) { - bool already_in_burst = - !hlist_unhashed(&bfqq->burst_list_node) || - bfq_bfqq_in_large_burst(bfqq); - bfq_handle_burst(bfqd, bfqq, idle_for_long_time); - /* - * If bfqq was not already in the current burst, - * then, at this point, bfqq either has been - * added to the current burst or has caused the - * current burst to terminate. In particular, in - * the second case, bfqq has become the first - * queue in a possible new burst. - * In both cases last_ins_in_burst needs to be - * moved forward. - */ - if (!already_in_burst) - bfqd->last_ins_in_burst = jiffies; - } - - coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || - bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; - soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && - !coop_or_in_burst && - time_is_before_jiffies(bfqq->soft_rt_next_start); - interactive = !coop_or_in_burst && idle_for_long_time; - entity->budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - - if (!bfq_bfqq_IO_bound(bfqq)) { - if (time_before(jiffies, - RQ_BIC(rq)->ttime.last_end_request + - bfqd->bfq_slice_idle)) { - bfqq->requests_within_timer++; - if (bfqq->requests_within_timer >= - bfqd->bfq_requests_within_timer) - bfq_mark_bfqq_IO_bound(bfqq); - } else - bfqq->requests_within_timer = 0; - } - - if (!bfqd->low_latency) - goto add_bfqq_busy; - - if (bfq_bfqq_just_split(bfqq)) - goto set_prio_changed; - - /* - * If the queue: - * - is not being boosted, - * - has been idle for enough time, - * - is not a sync queue or is linked to a bfq_io_cq (it is - * shared "for its nature" or it is not shared and its - * requests have not been redirected to a shared queue) - * start a weight-raising period. - */ - if (old_wr_coeff == 1 && (interactive || soft_rt) && - (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - if (interactive) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - else - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - bfq_log_bfqq(bfqd, bfqq, - "wrais starting at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } else if (old_wr_coeff > 1) { - if (interactive) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - else if (coop_or_in_burst || - (bfqq->wr_cur_max_time == - bfqd->bfq_wr_rt_max_time && - !soft_rt)) { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq-> - wr_cur_max_time)); - } else if (time_before( - bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time, - jiffies + - bfqd->bfq_wr_rt_max_time) && - soft_rt) { - /* - * - * The remaining weight-raising time is lower - * than bfqd->bfq_wr_rt_max_time, which means - * that the application is enjoying weight - * raising either because deemed soft-rt in - * the near past, or because deemed interactive - * a long ago. 
- * In both cases, resetting now the current - * remaining weight-raising time for the - * application to the weight-raising duration - * for soft rt applications would not cause any - * latency increase for the application (as the - * new duration would be higher than the - * remaining time). - * - * In addition, the application is now meeting - * the requirements for being deemed soft rt. - * In the end we can correctly and safely - * (re)charge the weight-raising duration for - * the application with the weight-raising - * duration for soft rt applications. - * - * In particular, doing this recharge now, i.e., - * before the weight-raising period for the - * application finishes, reduces the probability - * of the following negative scenario: - * 1) the weight of a soft rt application is - * raised at startup (as for any newly - * created application), - * 2) since the application is not interactive, - * at a certain time weight-raising is - * stopped for the application, - * 3) at that time the application happens to - * still have pending requests, and hence - * is destined to not have a chance to be - * deemed soft rt before these requests are - * completed (see the comments to the - * function bfq_bfqq_softrt_next_start() - * for details on soft rt detection), - * 4) these pending requests experience a high - * latency because the application is not - * weight-raised while they are pending. - */ - bfqq->last_wr_start_finish = jiffies; - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - } - } -set_prio_changed: - if (old_wr_coeff != bfqq->wr_coeff) - entity->prio_changed = 1; -add_bfqq_busy: - bfqq->last_idle_bklogged = jiffies; - bfqq->service_from_backlogged = 0; - bfq_clear_bfqq_softrt_update(bfqq); - bfq_add_bfqq_busy(bfqd, bfqq); - } else { + if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ + bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, + rq, &interactive); + else { if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && time_is_before_jiffies( bfqq->last_wr_start_finish + @@ -1048,16 +1403,43 @@ add_bfqq_busy: bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); bfqd->wr_busy_queues++; - entity->prio_changed = 1; + bfqq->entity.prio_changed = 1; bfq_log_bfqq(bfqd, bfqq, - "non-idle wrais starting at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq->wr_cur_max_time)); + "non-idle wrais starting, " + "wr_max_time %u wr_busy %d", + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqd->wr_busy_queues); } if (prev != bfqq->next_rq) bfq_updated_next_req(bfqd, bfqq); } + /* + * Assign jiffies to last_wr_start_finish in the following + * cases: + * + * . if bfqq is not going to be weight-raised, because, for + * non weight-raised queues, last_wr_start_finish stores the + * arrival time of the last request; as of now, this piece + * of information is used only for deciding whether to + * weight-raise async queues + * + * . if bfqq is not weight-raised, because, if bfqq is now + * switching to weight-raised, then last_wr_start_finish + * stores the time when weight-raising starts + * + * . 
if bfqq is interactive, because, regardless of whether + * bfqq is currently weight-raised, the weight-raising + * period must start or restart (this case is considered + * separately because it is not detected by the above + * conditions, if bfqq is already weight-raised) + * + * last_wr_start_finish has to be updated also if bfqq is soft + * real-time, because the weight-raising period is constantly + * restarted on idle-to-busy transitions for these queues, but + * this is already done in bfq_bfqq_handle_idle_busy_switch if + * needed. + */ if (bfqd->low_latency && (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) bfqq->last_wr_start_finish = jiffies; @@ -1081,14 +1463,24 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, return NULL; } +static sector_t get_sdist(sector_t last_pos, struct request *rq) +{ + sector_t sdist = 0; + + if (last_pos) { + if (last_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - last_pos; + else + sdist = last_pos - blk_rq_pos(rq); + } + + return sdist; +} + static void bfq_activate_request(struct request_queue *q, struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; - bfqd->rq_in_driver++; - bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); - bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", - (unsigned long long) bfqd->last_position); } static void bfq_deactivate_request(struct request_queue *q, struct request *rq) @@ -1105,6 +1497,9 @@ static void bfq_remove_request(struct request *rq) struct bfq_data *bfqd = bfqq->bfqd; const int sync = rq_is_sync(rq); + BUG_ON(bfqq->entity.service > bfqq->entity.budget && + bfqq == bfqd->in_service_queue); + if (bfqq->next_rq == rq) { bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); bfq_updated_next_req(bfqd, bfqq); @@ -1118,8 +1513,25 @@ static void bfq_remove_request(struct request *rq) elv_rb_del(&bfqq->sort_list, rq); if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) + BUG_ON(bfqq->entity.budget < 0); + + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { bfq_del_bfqq_busy(bfqd, bfqq, 1); + + /* bfqq emptied. In normal operation, when + * bfqq is empty, bfqq->entity.service and + * bfqq->entity.budget must contain, + * respectively, the service received and the + * budget used last time bfqq emptied. These + * facts do not hold in this case, as at least + * this last removal occurred while bfqq is + * not in service. To avoid inconsistencies, + * reset both bfqq->entity.service and + * bfqq->entity.budget. + */ + bfqq->entity.budget = bfqq->entity.service = 0; + } + /* * Remove queue from request-position tree as it is empty. 
*/ @@ -1133,9 +1545,8 @@ static void bfq_remove_request(struct request *rq) BUG_ON(bfqq->meta_pending == 0); bfqq->meta_pending--; } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -#endif + bfqg_stats_update_io_remove(bfqq_group(bfqq), req_op(rq), + rq->cmd_flags); } static int bfq_merge(struct request_queue *q, struct request **req, @@ -1145,7 +1556,7 @@ static int bfq_merge(struct request_queue *q, struct request **req, struct request *__rq; __rq = bfq_find_rq_fmerge(bfqd, bio); - if (__rq && elv_rq_merge_ok(__rq, bio)) { + if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_FRONT_MERGE; } @@ -1190,7 +1601,8 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, static void bfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); + bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio_op(bio), + bio->bi_opf); } #endif @@ -1210,7 +1622,7 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, */ if (bfqq == next_bfqq && !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && - time_before(next->fifo_time, rq->fifo_time)) { + next->fifo_time < rq->fifo_time) { list_del_init(&rq->queuelist); list_replace_init(&next->queuelist, &rq->queuelist); rq->fifo_time = next->fifo_time; @@ -1220,21 +1632,31 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfq_remove_request(next); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -#endif + bfqg_stats_update_io_merged(bfqq_group(bfqq), req_op(next), + next->cmd_flags); } /* Must be called with bfqq != NULL */ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { BUG_ON(!bfqq); + if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; bfqq->wr_cur_max_time = 0; - /* Trigger a weight change on the next activation of the queue */ + bfqq->last_wr_start_finish = jiffies; + /* + * Trigger a weight change on the next invocation of + * __bfq_entity_update_weight_prio. + */ bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "end_wr: wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", + bfqq->bfqd->wr_busy_queues); } static void bfq_end_wr_async_queues(struct bfq_data *bfqd, @@ -1277,7 +1699,7 @@ static int bfq_rq_close_to_sector(void *io_struct, bool request, sector_t sector) { return abs(bfq_io_struct_pos(io_struct, request) - sector) <= - BFQQ_SEEK_THR; + BFQQ_CLOSE_THR; } static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, @@ -1399,7 +1821,7 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * throughput. */ bfqq->new_bfqq = new_bfqq; - atomic_add(process_refs, &new_bfqq->ref); + new_bfqq->ref += process_refs; return new_bfqq; } @@ -1430,9 +1852,23 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, } /* - * Attempt to schedule a merge of bfqq with the currently in-service queue - * or with a close queue among the scheduled queues. - * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue + * If this function returns true, then bfqq cannot be merged. The idea + * is that true cooperation happens very early after processes start + * to do I/O. Usually, late cooperations are just accidental false + * positives. 
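The "too late to cooperate" filter described here boils down to a single timestamp check. A minimal user-space rendition, with jiffies replaced by a plain millisecond clock and the threshold hard-coded to the 100 ms used by the patch:

	#include <stdbool.h>
	#include <stdio.h>

	/*
	 * A queue may take part in a merge only if it is not weight-raised,
	 * or if its weight-raising period started no more than ~100 ms ago.
	 */
	static bool wr_started_too_long_ago(unsigned int wr_coeff,
					    unsigned long wr_start_ms,
					    unsigned long now_ms)
	{
		return wr_coeff > 1 && now_ms - wr_start_ms > 100;
	}

	int main(void)
	{
		unsigned long now = 10000;

		printf("just raised, mergeable:   %d\n",
		       !wr_started_too_long_ago(30, now - 50, now));
		printf("raised long ago, blocked: %d\n",
		       wr_started_too_long_ago(30, now - 5000, now));
		return 0;
	}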
In case bfqq is weight-raised, such false positives + * would evidently degrade latency guarantees for bfqq. + */ +bool wr_from_too_long(struct bfq_queue *bfqq) +{ + return bfqq->wr_coeff > 1 && + time_is_before_jiffies(bfqq->last_wr_start_finish + + msecs_to_jiffies(100)); +} + +/* + * Attempt to schedule a merge of bfqq with the currently in-service + * queue or with a close queue among the scheduled queues. Return + * NULL if no merge was scheduled, a pointer to the shared bfq_queue * structure otherwise. * * The OOM queue is not allowed to participate to cooperation: in fact, since @@ -1441,6 +1877,18 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, * handle merging with the OOM queue would be quite complex and expensive * to maintain. Besides, in such a critical condition as an out of memory, * the benefits of queue merging may be little relevant, or even negligible. + * + * Weight-raised queues can be merged only if their weight-raising + * period has just started. In fact cooperating processes are usually + * started together. Thus, with this filter we avoid false positives + * that would jeopardize low-latency guarantees. + * + * WARNING: queue merging may impair fairness among non-weight raised + * queues, for at least two reasons: 1) the original weight of a + * merged queue may change during the merged state, 2) even being the + * weight the same, a merged queue may be bloated with many more + * requests than the ones produced by its originally-associated + * process. */ static struct bfq_queue * bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -1450,16 +1898,32 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->new_bfqq) return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + + if (io_struct && wr_from_too_long(bfqq) && + likely(bfqq != &bfqd->oom_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have looked for coop, but bfq%d wr", + bfqq->pid); + + if (!io_struct || + wr_from_too_long(bfqq) || + unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; - /* If device has only one backlogged bfq_queue, don't search. */ + + /* If there is only one backlogged queue, don't search. 
*/ if (bfqd->busy_queues == 1) return NULL; in_service_bfqq = bfqd->in_service_queue; + if (in_service_bfqq && in_service_bfqq != bfqq && + bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) + && likely(in_service_bfqq == &bfqd->oom_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have tried merge with in-service-queue, but wr"); + if (!in_service_bfqq || in_service_bfqq == bfqq || - !bfqd->in_service_bic || + !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || unlikely(in_service_bfqq == &bfqd->oom_bfqq)) goto check_scheduled; @@ -1481,7 +1945,15 @@ check_scheduled: BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && + if (new_bfqq && wr_from_too_long(new_bfqq) && + likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have merged with bfq%d, but wr", + new_bfqq->pid); + + if (new_bfqq && !wr_from_too_long(new_bfqq) && + likely(new_bfqq != &bfqd->oom_bfqq) && bfq_may_be_close_cooperator(bfqq, new_bfqq)) return bfq_setup_merge(bfqq, new_bfqq); @@ -1490,53 +1962,24 @@ check_scheduled: static void bfq_bfqq_save_state(struct bfq_queue *bfqq) { + struct bfq_io_cq *bic = bfqq->bic; + /* * If !bfqq->bic, the queue is already shared or its requests * have already been redirected to a shared queue; both idle window * and weight raising state have already been saved. Do nothing. */ - if (!bfqq->bic) + if (!bic) return; - if (bfqq->bic->wr_time_left) - /* - * This is the queue of a just-started process, and would - * deserve weight raising: we set wr_time_left to the full - * weight-raising duration to trigger weight-raising when - * and if the queue is split and the first request of the - * queue is enqueued. - */ - bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); - else if (bfqq->wr_coeff > 1) { - unsigned long wr_duration = - jiffies - bfqq->last_wr_start_finish; - /* - * It may happen that a queue's weight raising period lasts - * longer than its wr_cur_max_time, as weight raising is - * handled only when a request is enqueued or dispatched (it - * does not use any timer). If the weight raising period is - * about to end, don't save it. - */ - if (bfqq->wr_cur_max_time <= wr_duration) - bfqq->bic->wr_time_left = 0; - else - bfqq->bic->wr_time_left = - bfqq->wr_cur_max_time - wr_duration; - /* - * The bfq_queue is becoming shared or the requests of the - * process owning the queue are being redirected to a shared - * queue. Stop the weight raising period of the queue, as in - * both cases it should not be owned by an interactive or - * soft real-time application. 
- */ - bfq_bfqq_end_wr(bfqq); - } else - bfqq->bic->wr_time_left = 0; - bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); - bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bfqq->bic->cooperations++; - bfqq->bic->failed_cooperations = 0; + + bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); } static void bfq_get_bic_reference(struct bfq_queue *bfqq) @@ -1561,6 +2004,40 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, if (bfq_bfqq_IO_bound(bfqq)) bfq_mark_bfqq_IO_bound(new_bfqq); bfq_clear_bfqq_IO_bound(bfqq); + + /* + * If bfqq is weight-raised, then let new_bfqq inherit + * weight-raising. To reduce false positives, neglect the case + * where bfqq has just been created, but has not yet made it + * to be weight-raised (which may happen because EQM may merge + * bfqq even before bfq_add_request is executed for the first + * time for bfqq). Handling this case would however be very + * easy, thanks to the flag just_created. + */ + if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { + new_bfqq->wr_coeff = bfqq->wr_coeff; + new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; + new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; + new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + if (bfq_bfqq_busy(new_bfqq)) + bfqd->wr_busy_queues++; + new_bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, new_bfqq, + "wr start after merge with %d, rais_max_time %u", + bfqq->pid, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ + bfqq->wr_coeff = 1; + bfqq->entity.prio_changed = 1; + if (bfq_bfqq_busy(bfqq)) + bfqd->wr_busy_queues--; + } + + bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", + bfqd->wr_busy_queues); + /* * Grab a reference to the bic, to prevent it from being destroyed * before being possibly touched by a bfq_split_bfqq(). @@ -1587,20 +2064,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_put_queue(bfqq); } -static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) -{ - struct bfq_io_cq *bic = bfqq->bic; - struct bfq_data *bfqd = bfqq->bfqd; - - if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { - bic->failed_cooperations++; - if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) - bic->cooperations = 0; - } -} - -static int bfq_allow_merge(struct request_queue *q, struct request *rq, - struct bio *bio) +static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + struct bio *bio) { struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_io_cq *bic; @@ -1610,7 +2075,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, * Disallow merge of a sync bio into an async request. */ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) - return 0; + return false; /* * Lookup the bfqq that this bio will be queued with. 
Allow @@ -1619,7 +2084,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, */ bic = bfq_bic_lookup(bfqd, current->io_context); if (!bic) - return 0; + return false; bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); /* @@ -1636,30 +2101,107 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, * to decide whether bio and rq can be merged. */ bfqq = new_bfqq; - } else - bfq_bfqq_increase_failed_cooperations(bfqq); + } } return bfqq == RQ_BFQQ(rq); } +static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, + struct request *next) +{ + return RQ_BFQQ(rq) == RQ_BFQQ(next); +} + +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the throughput. + * In practice, a time-slice service scheme is used with seeky + * processes. + */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + unsigned int timeout_coeff; + + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout * timeout_coeff; + + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); +} + static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { if (bfqq) { -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -#endif bfq_mark_bfqq_must_alloc(bfqq); - bfq_mark_bfqq_budget_new(bfqq); bfq_clear_bfqq_fifo_expire(bfqq); bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + BUG_ON(bfqq == bfqd->in_service_queue); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + if (time_is_before_jiffies(bfqq->last_wr_start_finish) && + bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_before_jiffies(bfqq->budget_timeout)) { + /* + * For soft real-time queues, move the start + * of the weight-raising period forward by the + * time the queue has not received any + * service. Otherwise, a relatively long + * service delay is likely to cause the + * weight-raising period of the queue to end, + * because of the short duration of the + * weight-raising period of a soft real-time + * queue. It is worth noting that this move + * is not so dangerous for the other queues, + * because soft real-time queues are not + * greedy. + * + * To not add a further variable, we use the + * overloaded field budget_timeout to + * determine for how long the queue has not + * received service, i.e., how much time has + * elapsed since the queue expired. However, + * this is a little imprecise, because + * budget_timeout is set to jiffies if bfqq + * not only expires, but also remains with no + * request. 
+ */ + bfqq->last_wr_start_finish += jiffies - + max_t(unsigned long, bfqq->last_wr_start_finish, + bfqq->budget_timeout); + if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { + pr_crit( + "BFQ WARNING:last %lu budget %lu jiffies %lu", + bfqq->last_wr_start_finish, + bfqq->budget_timeout, + jiffies); + pr_crit("diff %lu", jiffies - + max_t(unsigned long, + bfqq->last_wr_start_finish, + bfqq->budget_timeout)); + bfqq->last_wr_start_finish = jiffies; + } + } + + bfq_set_budget_timeout(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, "set_in_service_queue, cur-budget = %d", bfqq->entity.budget); - } + } else + bfq_log(bfqd, "set_in_service_queue: NULL"); bfqd->in_service_queue = bfqq; } @@ -1675,36 +2217,11 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) return bfqq; } -/* - * If enough samples have been computed, return the current max budget - * stored in bfqd, which is dynamically updated according to the - * estimated disk peak rate; otherwise return the default max budget - */ -static int bfq_max_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget; - else - return bfqd->bfq_max_budget; -} - -/* - * Return min budget, which is a fraction of the current or default - * max budget (trying with 1/32) - */ -static int bfq_min_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < bfq_stats_min_budgets) - return bfq_default_max_budget / 32; - else - return bfqd->bfq_max_budget / 32; -} - static void bfq_arm_slice_timer(struct bfq_data *bfqd) { struct bfq_queue *bfqq = bfqd->in_service_queue; struct bfq_io_cq *bic; - unsigned long sl; + u32 sl; BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); @@ -1728,59 +2245,343 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) sl = bfqd->bfq_slice_idle; /* * Unless the queue is being weight-raised or the scenario is - * asymmetric, grant only minimum idle time if the queue either - * has been seeky for long enough or has already proved to be - * constantly seeky. + * asymmetric, grant only minimum idle time if the queue + * is seeky. A long idling is preserved for a weight-raised + * queue, or, more in general, in an asymemtric scenario, + * because a long idling is needed for guaranteeing to a queue + * its reserved share of the throughput (in particular, it is + * needed if the queue has a higher weight than some other + * queue). */ - if (bfq_sample_valid(bfqq->seek_samples) && - ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > - bfq_max_budget(bfqq->bfqd) / 8) || - bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && + if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && bfq_symmetric_scenario(bfqd)) - sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); - else if (bfqq->wr_coeff > 1) - sl = sl * 3; + sl = min_t(u32, sl, BFQ_MIN_TT); + bfqd->last_idling_start = ktime_get(); - mod_timer(&bfqd->idle_slice_timer, jiffies + sl); -#ifdef CONFIG_BFQ_GROUP_IOSCHED + hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), + HRTIMER_MODE_REL); bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -#endif - bfq_log(bfqd, "arm idle: %u/%u ms", - jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); + bfq_log(bfqd, "arm idle: %ld/%ld ms", + sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); } /* - * Set the maximum time for the in-service queue to consume its - * budget. This prevents seeky processes from lowering the disk - * throughput (always guaranteed with a time slice scheme as in CFQ). 
+ * In autotuning mode, max_budget is dynamically recomputed as the + * amount of sectors transferred in timeout at the estimated peak + * rate. This enables BFQ to utilize a full timeslice with a full + * budget, even if the in-service queue is served at peak rate. And + * this maximises throughput with sequential workloads. */ -static void bfq_set_budget_timeout(struct bfq_data *bfqd) +static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) { - struct bfq_queue *bfqq = bfqd->in_service_queue; - unsigned int timeout_coeff; + return (u64)bfqd->peak_rate * USEC_PER_MSEC * + jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; +} - if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) - timeout_coeff = 1; +/* + * Update parameters related to throughput and responsiveness, as a + * function of the estimated peak rate. See comments on + * bfq_calc_max_budget(), and on T_slow and T_fast arrays. + */ +void update_thr_responsiveness_params(struct bfq_data *bfqd) +{ + int dev_type = blk_queue_nonrot(bfqd->queue); + + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd); + BUG_ON(bfqd->bfq_max_budget < 0); + bfq_log(bfqd, "new max_budget = %d", + bfqd->bfq_max_budget); + } + + if (bfqd->device_speed == BFQ_BFQD_FAST && + bfqd->peak_rate < device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_SLOW; + bfqd->RT_prod = R_slow[dev_type] * + T_slow[dev_type]; + } else if (bfqd->device_speed == BFQ_BFQD_SLOW && + bfqd->peak_rate > device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } + + bfq_log(bfqd, +"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", + dev_type == 0 ? "ROT" : "NONROT", + bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", + bfqd->device_speed == BFQ_BFQD_FAST ? + (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : + (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> + BFQ_RATE_SHIFT); +} + +void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) +{ + if (rq != NULL) { /* new rq dispatch now, reset accordingly */ + bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; + bfqd->peak_rate_samples = 1; + bfqd->sequential_samples = 0; + bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = + blk_rq_sectors(rq); + } else /* no new rq dispatched, just reset the number of samples */ + bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ + + bfq_log(bfqd, + "reset_rate_computation at end, sample %u/%u tot_sects %llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched); +} + +void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) +{ + u32 rate, weight, divisor; + + /* + * For the convergence property to hold (see comments on + * bfq_update_peak_rate()) and for the assessment to be + * reliable, a minimum number of samples must be present, and + * a minimum amount of time must have elapsed. If not so, do + * not compute new rate. Just reset parameters, to get ready + * for a new evaluation attempt. 
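A back-of-the-envelope rendition of the autotuned budget above (user-space C; the fixed-point shift is assumed to be 16 bits and the 125 ms slice is only an example, so treat both as assumptions rather than values taken from this hunk):

	#include <stdio.h>
	#include <stdint.h>

	#define RATE_SHIFT 16 /* assumed fixed-point shift for the peak rate */

	/*
	 * max_budget = peak_rate [sectors/usec, fixed point] * timeout [ms]
	 *              * 1000 [usec/ms], scaled back by RATE_SHIFT.
	 */
	static uint64_t calc_max_budget(uint64_t peak_rate_fp,
					unsigned int timeout_ms)
	{
		return (peak_rate_fp * 1000ULL * timeout_ms) >> RATE_SHIFT;
	}

	int main(void)
	{
		/* ~500 MB/s with 512 B sectors is about 1 sector/usec */
		uint64_t peak_rate_fp = 1ULL << RATE_SHIFT;
		unsigned int timeout_ms = 125;

		printf("max_budget = %llu sectors\n",
		       (unsigned long long)calc_max_budget(peak_rate_fp,
							   timeout_ms));
		return 0;
	}

With these example numbers the budget comes out to 125000 sectors, i.e., exactly what the device could transfer in one full timeout at the estimated peak rate.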
+	 */
+	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
+	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
+		bfq_log(bfqd,
+	"update_rate_reset: only resetting, delta_first %lluus samples %d",
+			bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
+		goto reset_computation;
+	}
+
+	/*
+	 * If a new request completion has occurred after last
+	 * dispatch, then, to approximate the rate at which requests
+	 * have been served by the device, it is more precise to
+	 * extend the observation interval to the last completion.
+	 */
+	bfqd->delta_from_first =
+		max_t(u64, bfqd->delta_from_first,
+		      bfqd->last_completion - bfqd->first_dispatch);
+
+	BUG_ON(bfqd->delta_from_first == 0);
+	/*
+	 * Rate computed in sects/usec, and not sects/nsec, for
+	 * precision issues.
+	 */
+	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
+			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
+
+	bfq_log(bfqd,
+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
+		bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
+		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+		rate > 20<<BFQ_RATE_SHIFT);
+
+	/*
+	 * Peak rate not updated if:
+	 * - the percentage of sequential dispatches is below 3/4 of the
+	 *   total, and rate is below the current estimated peak rate
+	 * - rate is unreasonably high (> 20M sectors/sec)
+	 */
+	if ((bfqd->peak_rate_samples > (3 * bfqd->sequential_samples)>>2 &&
+	     rate <= bfqd->peak_rate) ||
+		rate > 20<<BFQ_RATE_SHIFT) {
+		bfq_log(bfqd,
+	"update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
+			bfqd->peak_rate_samples, bfqd->sequential_samples,
+			((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+			((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+		goto reset_computation;
+	} else {
+		bfq_log(bfqd,
+	"update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
+			bfqd->peak_rate_samples, bfqd->sequential_samples,
+			((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+			((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+	}
+
+	/*
+	 * We have to update the peak rate, at last! To this purpose,
+	 * we use a low-pass filter. We compute the smoothing constant
+	 * of the filter as a function of the 'weight' of the new
+	 * measured rate.
+	 *
+	 * As can be seen in next formulas, we define this weight as a
+	 * quantity proportional to how sequential the workload is,
+	 * and to how long the observation time interval is.
+	 *
+	 * The weight runs from 0 to 8. The maximum value of the
+	 * weight, 8, yields the minimum value for the smoothing
+	 * constant. At this minimum value for the smoothing constant,
+	 * the measured rate contributes for half of the next value of
+	 * the estimated peak rate.
+	 *
+	 * So, the first step is to compute the weight as a function
+	 * of how sequential the workload is. Note that the weight
+	 * cannot reach 9, because bfqd->sequential_samples cannot
+	 * become equal to bfqd->peak_rate_samples, which, in its
+	 * turn, holds true because bfqd->sequential_samples is not
+	 * incremented for the first sample.
+	 */
+	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
+
+	/*
+	 * Second step: further refine the weight as a function of the
+	 * duration of the observation interval.
+	 */
+	weight = min_t(u32, 8,
+		       div_u64(weight * bfqd->delta_from_first,
+			       BFQ_RATE_REF_INTERVAL));
+
+	/*
+	 * Divisor ranging from 10, for minimum weight, to 2, for
+	 * maximum weight.
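The smoothing arithmetic that the next few lines implement can be tried out in isolation. A small user-space sketch of the same low-pass filter (sample counts and rates are made up, and the second refinement of the weight by interval length is omitted):

	#include <stdio.h>
	#include <stdint.h>

	/* EWMA update with a divisor derived from how trustworthy the sample is. */
	static uint32_t update_peak_rate(uint32_t peak_rate, uint32_t rate,
					 uint32_t sequential_samples,
					 uint32_t total_samples)
	{
		/* weight in [0, 8]: larger when the observed I/O was more sequential */
		uint32_t weight = (9 * sequential_samples) / total_samples;
		if (weight > 8)
			weight = 8;

		/* divisor in [2, 10]: alpha = 1/divisor is the smoothing constant */
		uint32_t divisor = 10 - weight;

		return peak_rate * (divisor - 1) / divisor + rate / divisor;
	}

	int main(void)
	{
		/* mostly-sequential sample: the new measurement counts for 1/2 */
		printf("%u\n", update_peak_rate(1000, 2000, 31, 32));
		/* mostly-random sample: the new measurement counts for 1/10 */
		printf("%u\n", update_peak_rate(1000, 2000, 4, 32));
		return 0;
	}

The first call moves the estimate from 1000 to 1500, the second only to 1110, showing how unreliable samples barely perturb the estimated peak rate.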
+	 */
+	divisor = 10 - weight;
+	BUG_ON(divisor == 0);
+
+	/*
+	 * Finally, update peak rate:
+	 *
+	 * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
+	 */
+	bfqd->peak_rate *= divisor-1;
+	bfqd->peak_rate /= divisor;
+	rate /= divisor; /* smoothing constant alpha = 1/divisor */
+
+	bfq_log(bfqd,
+		"update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
+		divisor,
+		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
+		(u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
+
+	BUG_ON(bfqd->peak_rate == 0);
+	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
+	bfqd->peak_rate += rate;
+	update_thr_responsiveness_params(bfqd);
+	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
+
+reset_computation:
+	bfq_reset_rate_computation(bfqd, rq);
+}
+
+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+{
+	u64 now_ns = ktime_get_ns();
+
+	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
+		bfq_log(bfqd,
+			"update_peak_rate: goto reset, samples %d",
+			bfqd->peak_rate_samples) ;
+		bfq_reset_rate_computation(bfqd, rq);
+		goto update_last_values; /* will add one sample */
+	}
+
+	/*
+	 * Device idle for very long: the observation interval lasting
+	 * up to this dispatch cannot be a valid observation interval
+	 * for computing a new peak rate (similarly to the late-
+	 * completion event in bfq_completed_request()). Go to
+	 * update_rate_and_reset to have the following three steps
+	 * taken:
+	 * - close the observation interval at the last (previous)
+	 *   request dispatch or completion
+	 * - compute rate, if possible, for that observation interval
+	 * - start a new observation interval with this dispatch
+	 */
+	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
+	    bfqd->rq_in_driver == 0) {
+		bfq_log(bfqd,
+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
+			(now_ns - bfqd->last_dispatch)>>10,
+			bfqd->peak_rate_samples) ;
+		goto update_rate_and_reset;
+	}
+
+	/* Update sampling information */
+	bfqd->peak_rate_samples++;
+
+	if ((bfqd->rq_in_driver > 0 ||
+	     now_ns - bfqd->last_completion < BFQ_MIN_TT)
+	    && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+		bfqd->sequential_samples++;
+
+	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
+
+	/* Reset max observed rq size every 32 dispatches */
+	if (likely(bfqd->peak_rate_samples % 32))
+		bfqd->last_rq_max_size =
+			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
 	else
-		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
+		bfqd->last_rq_max_size = blk_rq_sectors(rq);
 
-	bfqd->last_budget_start = ktime_get();
+	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
 
-	bfq_clear_bfqq_budget_new(bfqq);
-	bfqq->budget_timeout = jiffies +
-		bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
+	bfq_log(bfqd,
+	"update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
+		bfqd->peak_rate_samples, bfqd->sequential_samples,
+		bfqd->tot_sectors_dispatched,
+		bfqd->delta_from_first>>10);
 
-	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
-		jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
-		timeout_coeff));
+	/* Target observation interval not yet reached, go on sampling */
+	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
+		goto update_last_values;
+
+update_rate_and_reset:
+	bfq_update_rate_reset(bfqd, rq);
+update_last_values:
+	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+	bfqd->last_dispatch = now_ns;
+
+	bfq_log(bfqd,
+	"update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
+		(now_ns - bfqd->first_dispatch)>>10,
+		(unsigned long long) bfqd->last_position,
+		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+	bfq_log(bfqd,
+		"update_peak_rate: samples at end %d", bfqd->peak_rate_samples);
 }
 
 /*
- * Move request from internal lists 
to the request queue dispatch list. + * Move request from internal lists to the dispatch list of the request queue */ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) { - struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq = RQ_BFQQ(rq); /* @@ -1794,15 +2595,10 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) * incrementing bfqq->dispatched. */ bfqq->dispatched++; + bfq_update_peak_rate(q->elevator->elevator_data, rq); + bfq_remove_request(rq); elv_dispatch_sort(q, rq); - - if (bfq_bfqq_sync(bfqq)) - bfqd->sync_flight++; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), - rq->cmd_flags); -#endif } /* @@ -1822,19 +2618,12 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) rq = rq_entry_fifo(bfqq->fifo.next); - if (time_before(jiffies, rq->fifo_time)) + if (ktime_get_ns() < rq->fifo_time) return NULL; return rq; } -static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - return entity->budget - entity->service; -} - static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) { BUG_ON(bfqq != bfqd->in_service_queue); @@ -1851,12 +2640,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_mark_bfqq_split_coop(bfqq); if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - /* - * Overloading budget_timeout field to store the time - * at which the queue remains with no backlog; used by - * the weight-raising mechanism. - */ - bfqq->budget_timeout = jiffies; + if (bfqq->dispatched == 0) + /* + * Overloading budget_timeout field to store + * the time at which the queue remains with no + * backlog and no outstanding request; used by + * the weight-raising mechanism. + */ + bfqq->budget_timeout = jiffies; + bfq_del_bfqq_busy(bfqd, bfqq, 1); } else { bfq_activate_bfqq(bfqd, bfqq); @@ -1883,10 +2675,19 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, struct request *next_rq; int budget, min_budget; - budget = bfqq->max_budget; + BUG_ON(bfqq != bfqd->in_service_queue); + min_budget = bfq_min_budget(bfqd); - BUG_ON(bfqq != bfqd->in_service_queue); + if (bfqq->wr_coeff == 1) + budget = bfqq->max_budget; + else /* + * Use a constant, low budget for weight-raised queues, + * to help achieve a low latency. Keep it slightly higher + * than the minimum possible budget, to cause a little + * bit fewer expirations. + */ + budget = 2 * min_budget; bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); @@ -1895,7 +2696,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - if (bfq_bfqq_sync(bfqq)) { + if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { switch (reason) { /* * Caveat: in all the following cases we trade latency @@ -1937,14 +2738,10 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, break; case BFQ_BFQQ_BUDGET_TIMEOUT: /* - * We double the budget here because: 1) it - * gives the chance to boost the throughput if - * this is not a seeky process (which may have - * bumped into this timeout because of, e.g., - * ZBR), 2) together with charge_full_budget - * it helps give seeky processes higher - * timestamps, and hence be served less - * frequently. 
+ * We double the budget here because it gives + * the chance to boost the throughput if this + * is not a seeky process (and has bumped into + * this timeout because of, e.g., ZBR). */ budget = min(budget * 2, bfqd->bfq_max_budget); break; @@ -1961,17 +2758,49 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, budget = min(budget * 4, bfqd->bfq_max_budget); break; case BFQ_BFQQ_NO_MORE_REQUESTS: - /* - * Leave the budget unchanged. - */ + /* + * For queues that expire for this reason, it + * is particularly important to keep the + * budget close to the actual service they + * need. Doing so reduces the timestamp + * misalignment problem described in the + * comments in the body of + * __bfq_activate_entity. In fact, suppose + * that a queue systematically expires for + * BFQ_BFQQ_NO_MORE_REQUESTS and presents a + * new request in time to enjoy timestamp + * back-shifting. The larger the budget of the + * queue is with respect to the service the + * queue actually requests in each service + * slot, the more times the queue can be + * reactivated with the same virtual finish + * time. It follows that, even if this finish + * time is pushed to the system virtual time + * to reduce the consequent timestamp + * misalignment, the queue unjustly enjoys for + * many re-activations a lower finish time + * than all newly activated queues. + * + * The service needed by bfqq is measured + * quite precisely by bfqq->entity.service. + * Since bfqq does not enjoy device idling, + * bfqq->entity.service is equal to the number + * of sectors that the process associated with + * bfqq requested to read/write before waiting + * for request completions, or blocking for + * other reasons. + */ + budget = max_t(int, bfqq->entity.service, min_budget); + break; default: return; } - } else + } else if (!bfq_bfqq_sync(bfqq)) /* - * Async queues get always the maximum possible budget - * (their ability to dispatch is limited by - * @bfqd->bfq_max_budget_async_rq). + * Async queues get always the maximum possible + * budget, as for them we do not care about latency + * (in addition, their ability to dispatch is limited + * by the charging factor). */ budget = bfqd->bfq_max_budget; @@ -1982,160 +2811,120 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); /* - * Make sure that we have enough budget for the next request. - * Since the finish time of the bfqq must be kept in sync with - * the budget, be sure to call __bfq_bfqq_expire() after the + * If there is still backlog, then assign a new budget, making + * sure that it is large enough for the next request. Since + * the finish time of bfqq must be kept in sync with the + * budget, be sure to call __bfq_bfqq_expire() *after* this * update. + * + * If there is no backlog, then no need to update the budget; + * it will be updated on the arrival of a new request. */ next_rq = bfqq->next_rq; - if (next_rq) + if (next_rq) { + BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || + reason == BFQ_BFQQ_NO_MORE_REQUESTS); bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); - else - bfqq->entity.budget = bfqq->max_budget; + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + } bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", next_rq ? 
blk_rq_sectors(next_rq) : 0, bfqq->entity.budget); } -static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) -{ - unsigned long max_budget; - - /* - * The max_budget calculated when autotuning is equal to the - * amount of sectors transfered in timeout_sync at the - * estimated peak rate. - */ - max_budget = (unsigned long)(peak_rate * 1000 * - timeout >> BFQ_RATE_SHIFT); - - return max_budget; -} - /* - * In addition to updating the peak rate, checks whether the process - * is "slow", and returns 1 if so. This slow flag is used, in addition - * to the budget timeout, to reduce the amount of service provided to - * seeky processes, and hence reduce their chances to lower the - * throughput. See the code for more details. + * Return true if the process associated with bfqq is "slow". The slow + * flag is used, in addition to the budget timeout, to reduce the + * amount of service provided to seeky processes, and thus reduce + * their chances to lower the throughput. More details in the comments + * on the function bfq_bfqq_expire(). + * + * An important observation is in order: as discussed in the comments + * on the function bfq_update_peak_rate(), with devices with internal + * queues, it is hard if ever possible to know when and for how long + * an I/O request is processed by the device (apart from the trivial + * I/O pattern where a new request is dispatched only after the + * previous one has been completed). This makes it hard to evaluate + * the real rate at which the I/O requests of each bfq_queue are + * served. In fact, for an I/O scheduler like BFQ, serving a + * bfq_queue means just dispatching its requests during its service + * slot (i.e., until the budget of the queue is exhausted, or the + * queue remains idle, or, finally, a timeout fires). But, during the + * service slot of a bfq_queue, around 100 ms at most, the device may + * be even still processing requests of bfq_queues served in previous + * service slots. On the opposite end, the requests of the in-service + * bfq_queue may be completed after the service slot of the queue + * finishes. + * + * Anyway, unless more sophisticated solutions are used + * (where possible), the sum of the sizes of the requests dispatched + * during the service slot of a bfq_queue is probably the only + * approximation available for the service received by the bfq_queue + * during its service slot. And this sum is the quantity used in this + * function to evaluate the I/O speed of a process. */ -static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason) +static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool compensate, enum bfqq_expiration reason, + unsigned long *delta_ms) { - u64 bw, usecs, expected, timeout; - ktime_t delta; - int update = 0; + ktime_t delta_ktime; + u32 delta_usecs; + bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ - if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) + if (!bfq_bfqq_sync(bfqq)) return false; if (compensate) - delta = bfqd->last_idling_start; - else - delta = ktime_get(); - delta = ktime_sub(delta, bfqd->last_budget_start); - usecs = ktime_to_us(delta); - - /* Don't trust short/unrealistic values. */ - if (usecs < 100 || usecs >= LONG_MAX) - return false; - - /* - * Calculate the bandwidth for the last slice. We use a 64 bit - * value to store the peak rate, in sectors per usec in fixed - * point math. 
We do so to have enough precision in the estimate - * and to avoid overflows. - */ - bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; - do_div(bw, (unsigned long)usecs); - - timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); - - /* - * Use only long (> 20ms) intervals to filter out spikes for - * the peak rate estimation. - */ - if (usecs > 20000) { - if (bw > bfqd->peak_rate || - (!BFQQ_SEEKY(bfqq) && - reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { - bfq_log(bfqd, "measured bw =%llu", bw); - /* - * To smooth oscillations use a low-pass filter with - * alpha=7/8, i.e., - * new_rate = (7/8) * old_rate + (1/8) * bw - */ - do_div(bw, 8); - if (bw == 0) - return 0; - bfqd->peak_rate *= 7; - do_div(bfqd->peak_rate, 8); - bfqd->peak_rate += bw; - update = 1; - bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); - } - - update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; - - if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) - bfqd->peak_rate_samples++; - - if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && - update) { - int dev_type = blk_queue_nonrot(bfqd->queue); - - if (bfqd->bfq_user_max_budget == 0) { - bfqd->bfq_max_budget = - bfq_calc_max_budget(bfqd->peak_rate, - timeout); - bfq_log(bfqd, "new max_budget=%d", - bfqd->bfq_max_budget); - } - if (bfqd->device_speed == BFQ_BFQD_FAST && - bfqd->peak_rate < device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_SLOW; - bfqd->RT_prod = R_slow[dev_type] * - T_slow[dev_type]; - } else if (bfqd->device_speed == BFQ_BFQD_SLOW && - bfqd->peak_rate > device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_FAST; - bfqd->RT_prod = R_fast[dev_type] * - T_fast[dev_type]; - } - } + delta_ktime = bfqd->last_idling_start; + else + delta_ktime = ktime_get(); + delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); + delta_usecs = ktime_to_us(delta_ktime); + + /* don't trust short/unrealistic values. */ + if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) { + if (blk_queue_nonrot(bfqd->queue)) + /* + * give same worst-case guarantees as idling + * for seeky + */ + *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; + else /* charge at least one seek */ + *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; + + bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs); + + return slow; } - /* - * If the process has been served for a too short time - * interval to let its possible sequential accesses prevail on - * the initial seek time needed to move the disk head on the - * first sector it requested, then give the process a chance - * and for the moment return false. - */ - if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) - return false; + *delta_ms = delta_usecs / USEC_PER_MSEC; /* - * A process is considered ``slow'' (i.e., seeky, so that we - * cannot treat it fairly in the service domain, as it would - * slow down too much the other processes) if, when a slice - * ends for whatever reason, it has received service at a - * rate that would not be high enough to complete the budget - * before the budget timeout expiration. + * Use only long (> 20ms) intervals to filter out excessive + * spikes in service rate estimation. */ - expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; + if (delta_usecs > 20000) { + /* + * Caveat for rotational devices: processes doing I/O + * in the slower disk zones tend to be slow(er) even + * if not seeky. In this respect, the estimated peak + * rate is likely to be an average over the disk + * surface. 
Accordingly, to not be too harsh with + * unlucky processes, a process is deemed slow only if + * its rate has been lower than half of the estimated + * peak rate. + */ + slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; + bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", + bfqq->entity.service, bfqd->bfq_max_budget); + } - /* - * Caveat: processes doing IO in the slower disk zones will - * tend to be slow(er) even if not seeky. And the estimated - * peak rate will actually be an average over the disk - * surface. Hence, to not be too harsh with unlucky processes, - * we keep a budget/3 margin of safety before declaring a - * process slow. - */ - return expected > (4 * bfqq->entity.budget) / 3; + bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); + + return slow; } /* @@ -2193,20 +2982,35 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, struct bfq_queue *bfqq) { + bfq_log_bfqq(bfqd, bfqq, +"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", + bfqq->service_from_backlogged, + bfqd->bfq_wr_max_softrt_rate, + jiffies_to_msecs(HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate)); + return max(bfqq->last_idle_bklogged + HZ * bfqq->service_from_backlogged / bfqd->bfq_wr_max_softrt_rate, - jiffies + bfqq->bfqd->bfq_slice_idle + 4); + jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); +} + +/* + * Return the farthest future time instant according to jiffies + * macros. + */ +static unsigned long bfq_greatest_from_now(void) +{ + return jiffies + MAX_JIFFY_OFFSET; } /* - * Return the largest-possible time instant such that, for as long as possible, - * the current time will be lower than this time instant according to the macro - * time_is_before_jiffies(). + * Return the farthest past time instant according to jiffies + * macros. */ -static unsigned long bfq_infinity_from_now(unsigned long now) +static unsigned long bfq_smallest_from_now(void) { - return now + ULONG_MAX / 2; + return jiffies - MAX_JIFFY_OFFSET; } /** @@ -2216,28 +3020,24 @@ static unsigned long bfq_infinity_from_now(unsigned long now) * @compensate: if true, compensate for the time spent idling. * @reason: the reason causing the expiration. * + * If the process associated with bfqq does slow I/O (e.g., because it + * issues random requests), we charge bfqq with the time it has been + * in service instead of the service it has received (see + * bfq_bfqq_charge_time for details on how this goal is achieved). As + * a consequence, bfqq will typically get higher timestamps upon + * reactivation, and hence it will be rescheduled as if it had + * received more service than what it has actually received. In the + * end, bfqq receives less service in proportion to how slowly its + * associated process consumes its budgets (and hence how seriously it + * tends to lower the throughput). In addition, this time-charging + * strategy guarantees time fairness among slow processes. In + * contrast, if the process associated with bfqq is not slow, we + * charge bfqq exactly with the service it has received. * - * If the process associated to the queue is slow (i.e., seeky), or in - * case of budget timeout, or, finally, if it is async, we - * artificially charge it an entire budget (independently of the - * actual service it received). 
As a consequence, the queue will get - * higher timestamps than the correct ones upon reactivation, and - * hence it will be rescheduled as if it had received more service - * than what it actually received. In the end, this class of processes - * will receive less service in proportion to how slowly they consume - * their budgets (and hence how seriously they tend to lower the - * throughput). - * - * In contrast, when a queue expires because it has been idling for - * too much or because it exhausted its budget, we do not touch the - * amount of service it has received. Hence when the queue will be - * reactivated and its timestamps updated, the latter will be in sync - * with the actual service received by the queue until expiration. - * - * Charging a full budget to the first type of queues and the exact - * service to the others has the effect of using the WF2Q+ policy to - * schedule the former on a timeslice basis, without violating the - * service domain guarantees of the latter. + * Charging time to the first type of queues and the exact service to + * the other has the effect of using the WF2Q+ policy to schedule the + * former on a timeslice basis, without violating service domain + * guarantees among the latter. */ static void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -2245,41 +3045,52 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, enum bfqq_expiration reason) { bool slow; + unsigned long delta = 0; + struct bfq_entity *entity = &bfqq->entity; BUG_ON(bfqq != bfqd->in_service_queue); /* - * Update disk peak rate for autotuning and check whether the - * process is slow (see bfq_update_peak_rate). + * Check whether the process is slow (see bfq_bfqq_is_slow). */ - slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); /* - * As above explained, 'punish' slow (i.e., seeky), timed-out - * and async queues, to favor sequential sync workloads. - * - * Processes doing I/O in the slower disk zones will tend to be - * slow(er) even if not seeky. Hence, since the estimated peak - * rate is actually an average over the disk surface, these - * processes may timeout just for bad luck. To avoid punishing - * them we do not charge a full budget to a process that - * succeeded in consuming at least 2/3 of its budget. + * Increase service_from_backlogged before next statement, + * because the possible next invocation of + * bfq_bfqq_charge_time would likely inflate + * entity->service. In contrast, service_from_backlogged must + * contain real service, to enable the soft real-time + * heuristic to correctly compute the bandwidth consumed by + * bfqq. */ - if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) - bfq_bfqq_charge_full_budget(bfqq); + bfqq->service_from_backlogged += entity->service; - bfqq->service_from_backlogged += bfqq->entity.service; + /* + * As above explained, charge slow (typically seeky) and + * timed-out queues with the time and not the service + * received, to favor sequential workloads. + * + * Processes doing I/O in the slower disk zones will tend to + * be slow(er) even if not seeky. Therefore, since the + * estimated peak rate is actually an average over the disk + * surface, these processes may timeout just for bad luck. To + * avoid punishing them, do not charge time to processes that + * succeeded in consuming at least 2/3 of their budget. 
This + * allows BFQ to preserve enough elasticity to still perform + * bandwidth, and not time, distribution with little unlucky + * or quasi-sequential processes. + */ + if (bfqq->wr_coeff == 1 && + (slow || + (reason == BFQ_BFQQ_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) + bfq_bfqq_charge_time(bfqd, bfqq, delta); - if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && - !bfq_bfqq_constantly_seeky(bfqq)) { - bfq_mark_bfqq_constantly_seeky(bfqq); - if (!blk_queue_nonrot(bfqd->queue)) - bfqd->const_seeky_busy_in_flight_queues++; - } + BUG_ON(bfqq->entity.budget < bfqq->entity.service); if (reason == BFQ_BFQQ_TOO_IDLE && - bfqq->entity.service <= 2 * bfqq->entity.budget / 10) + entity->service <= 2 * entity->budget / 10) bfq_clear_bfqq_IO_bound(bfqq); if (bfqd->low_latency && bfqq->wr_coeff == 1) @@ -2288,19 +3099,23 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && RB_EMPTY_ROOT(&bfqq->sort_list)) { /* - * If we get here, and there are no outstanding requests, - * then the request pattern is isochronous (see the comments - * to the function bfq_bfqq_softrt_next_start()). Hence we - * can compute soft_rt_next_start. If, instead, the queue - * still has outstanding requests, then we have to wait - * for the completion of all the outstanding requests to + * If we get here, and there are no outstanding + * requests, then the request pattern is isochronous + * (see the comments on the function + * bfq_bfqq_softrt_next_start()). Thus we can compute + * soft_rt_next_start. If, instead, the queue still + * has outstanding requests, then we have to wait for + * the completion of all the outstanding requests to * discover whether the request pattern is actually * isochronous. */ - if (bfqq->dispatched == 0) + BUG_ON(bfqd->busy_queues < 1); + if (bfqq->dispatched == 0) { bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); - else { + bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", + bfqq->soft_rt_next_start); + } else { /* * The application is still waiting for the * completion of one or more requests: @@ -2317,7 +3132,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * happened to be in the past. */ bfqq->soft_rt_next_start = - bfq_infinity_from_now(jiffies); + bfq_greatest_from_now(); /* * Schedule an update of soft_rt_next_start to when * the task may be discovered to be isochronous. @@ -2327,15 +3142,27 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, } bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, - slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", + reason, slow, bfqq->dispatched, + bfq_bfqq_idle_window(bfqq), entity->weight); /* * Increase, decrease or leave budget unchanged according to * reason. 
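To make the effect of the time charging described above concrete: in the WF2Q+ engine a queue is rescheduled according to a finish timestamp of the form F = S + charged_service / weight, so inflating the charge of a slow queue directly pushes its next service further into the virtual future. The user-space sketch below is illustrative only; none of its names or numbers come from the patch.

#include <stdio.h>

/*
 * Toy model of how the charge fed to the scheduler changes a
 * WF2Q+-style finish timestamp F = S + charge / weight.
 */
static unsigned long finish_ts(unsigned long start, unsigned long charge,
			       unsigned int weight)
{
	return start + charge / weight;
}

int main(void)
{
	unsigned long start = 1000;	/* virtual start time */
	unsigned int weight = 10;
	unsigned long received = 64;	/* sectors actually served */
	unsigned long time_equiv = 2048; /* charge equivalent to the time a
					  * seeky queue kept the device busy */

	printf("charging received service: F = %lu\n",
	       finish_ts(start, received, weight));
	printf("charging time in service:  F = %lu\n",
	       finish_ts(start, time_equiv, weight));
	/*
	 * The second, larger finish time is what delays the slow queue's
	 * next service slot, yielding the time fairness described above.
	 */
	return 0;
}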
*/ + BUG_ON(bfqq->entity.budget < bfqq->entity.service); __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); __bfq_bfqq_expire(bfqd, bfqq); + + BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && + !bfq_class_idle(bfqq)); + + if (!bfq_bfqq_busy(bfqq) && + reason != BFQ_BFQQ_BUDGET_TIMEOUT && + reason != BFQ_BFQQ_BUDGET_EXHAUSTED) + bfq_mark_bfqq_non_blocking_wait_rq(bfqq); } /* @@ -2345,20 +3172,17 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, */ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) { - if (bfq_bfqq_budget_new(bfqq) || - time_before(jiffies, bfqq->budget_timeout)) - return false; - return true; + return time_is_before_eq_jiffies(bfqq->budget_timeout); } /* - * If we expire a queue that is waiting for the arrival of a new - * request, we may prevent the fictitious timestamp back-shifting that - * allows the guarantees of the queue to be preserved (see [1] for - * this tricky aspect). Hence we return true only if this condition - * does not hold, or if the queue is slow enough to deserve only to be - * kicked off for preserving a high throughput. -*/ + * If we expire a queue that is actively waiting (i.e., with the + * device idled) for the arrival of a new request, then we may incur + * the timestamp misalignment problem described in the body of the + * function __bfq_activate_entity. Hence we return true only if this + * condition does not hold, or if the queue is slow enough to deserve + * only to be kicked off for preserving a high throughput. + */ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, @@ -2400,10 +3224,12 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; bool idling_boosts_thr, idling_boosts_thr_without_issues, - all_queues_seeky, on_hdd_and_not_all_queues_seeky, idling_needed_for_service_guarantees, asymmetric_scenario; + if (bfqd->strict_guarantees) + return true; + /* * The next variable takes into account the cases where idling * boosts the throughput. @@ -2466,74 +3292,27 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) bfqd->wr_busy_queues == 0; /* - * There are then two cases where idling must be performed not + * There is then a case where idling must be performed not * for throughput concerns, but to preserve service - * guarantees. In the description of these cases, we say, for - * short, that a queue is sequential/random if the process - * associated to the queue issues sequential/random requests - * (in the second case the queue may be tagged as seeky or - * even constantly_seeky). - * - * To introduce the first case, we note that, since - * bfq_bfqq_idle_window(bfqq) is false if the device is - * NCQ-capable and bfqq is random (see - * bfq_update_idle_window()), then, from the above two - * assignments it follows that - * idling_boosts_thr_without_issues is false if the device is - * NCQ-capable and bfqq is random. Therefore, for this case, - * device idling would never be allowed if we used just - * idling_boosts_thr_without_issues to decide whether to allow - * it. And, beneficially, this would imply that throughput - * would always be boosted also with random I/O on NCQ-capable - * HDDs. + * guarantees. * - * But we must be careful on this point, to avoid an unfair - * treatment for bfqq. 
In fact, because of the same above - * assignments, idling_boosts_thr_without_issues is, on the - * other hand, true if 1) the device is an HDD and bfqq is - * sequential, and 2) there are no busy weight-raised - * queues. As a consequence, if we used just - * idling_boosts_thr_without_issues to decide whether to idle - * the device, then with an HDD we might easily bump into a - * scenario where queues that are sequential and I/O-bound - * would enjoy idling, whereas random queues would not. The - * latter might then get a low share of the device throughput, - * simply because the former would get many requests served - * after being set as in service, while the latter would not. - * - * To address this issue, we start by setting to true a - * sentinel variable, on_hdd_and_not_all_queues_seeky, if the - * device is rotational and not all queues with pending or - * in-flight requests are constantly seeky (i.e., there are - * active sequential queues, and bfqq might then be mistreated - * if it does not enjoy idling because it is random). - */ - all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && - bfqd->busy_in_flight_queues == - bfqd->const_seeky_busy_in_flight_queues; - - on_hdd_and_not_all_queues_seeky = - !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; - - /* - * To introduce the second case where idling needs to be - * performed to preserve service guarantees, we can note that - * allowing the drive to enqueue more than one request at a - * time, and hence delegating de facto final scheduling - * decisions to the drive's internal scheduler, causes loss of - * control on the actual request service order. In particular, - * the critical situation is when requests from different - * processes happens to be present, at the same time, in the - * internal queue(s) of the drive. In such a situation, the - * drive, by deciding the service order of the - * internally-queued requests, does determine also the actual - * throughput distribution among these processes. But the - * drive typically has no notion or concern about per-process - * throughput distribution, and makes its decisions only on a - * per-request basis. Therefore, the service distribution - * enforced by the drive's internal scheduler is likely to - * coincide with the desired device-throughput distribution - * only in a completely symmetric scenario where: + * To introduce this case, we can note that allowing the drive + * to enqueue more than one request at a time, and hence + * delegating de facto final scheduling decisions to the + * drive's internal scheduler, entails loss of control on the + * actual request service order. In particular, the critical + * situation is when requests from different processes happen + * to be present, at the same time, in the internal queue(s) + * of the drive. In such a situation, the drive, by deciding + * the service order of the internally-queued requests, does + * determine also the actual throughput distribution among + * these processes. But the drive typically has no notion or + * concern about per-process throughput distribution, and + * makes its decisions only on a per-request basis. 
Therefore, + * the service distribution enforced by the drive's internal + * scheduler is likely to coincide with the desired + * device-throughput distribution only in a completely + * symmetric scenario where: * (i) each of these processes must get the same throughput as * the others; * (ii) all these processes have the same I/O pattern @@ -2555,26 +3334,53 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * words, only if sub-condition (i) holds, then idling is * allowed, and the device tends to be prevented from queueing * many requests, possibly of several processes. The reason - * for not controlling also sub-condition (ii) is that, first, - * in the case of an HDD, the asymmetry in terms of types of - * I/O patterns is already taken in to account in the above - * sentinel variable - * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a - * flash-based device, we prefer however to privilege - * throughput (and idling lowers throughput for this type of - * devices), for the following reasons: - * 1) differently from HDDs, the service time of random - * requests is not orders of magnitudes lower than the service - * time of sequential requests; thus, even if processes doing - * sequential I/O get a preferential treatment with respect to - * others doing random I/O, the consequences are not as - * dramatic as with HDDs; - * 2) if a process doing random I/O does need strong - * throughput guarantees, it is hopefully already being - * weight-raised, or the user is likely to have assigned it a - * higher weight than the other processes (and thus - * sub-condition (i) is likely to be false, which triggers - * idling). + * for not controlling also sub-condition (ii) is that we + * exploit preemption to preserve guarantees in case of + * symmetric scenarios, even if (ii) does not hold, as + * explained in the next two paragraphs. + * + * Even if a queue, say Q, is expired when it remains idle, Q + * can still preempt the new in-service queue if the next + * request of Q arrives soon (see the comments on + * bfq_bfqq_update_budg_for_activation). If all queues and + * groups have the same weight, this form of preemption, + * combined with the hole-recovery heuristic described in the + * comments on function bfq_bfqq_update_budg_for_activation, + * are enough to preserve a correct bandwidth distribution in + * the mid term, even without idling. In fact, even if not + * idling allows the internal queues of the device to contain + * many requests, and thus to reorder requests, we can rather + * safely assume that the internal scheduler still preserves a + * minimum of mid-term fairness. The motivation for using + * preemption instead of idling is that, by not idling, + * service guarantees are preserved without minimally + * sacrificing throughput. In other words, both a high + * throughput and its desired distribution are obtained. + * + * More precisely, this preemption-based, idleless approach + * provides fairness in terms of IOPS, and not sectors per + * second. This can be seen with a simple example. Suppose + * that there are two queues with the same weight, but that + * the first queue receives requests of 8 sectors, while the + * second queue receives requests of 1024 sectors. In + * addition, suppose that each of the two queues contains at + * most one request at a time, which implies that each queue + * always remains idle after it is served. Finally, after + * remaining idle, each queue receives very quickly a new + * request. 
It follows that the two queues are served + * alternatively, preempting each other if needed. This + * implies that, although both queues have the same weight, + * the queue with large requests receives a service that is + * 1024/8 times as high as the service received by the other + * queue. + * + * On the other hand, device idling is performed, and thus + * pure sector-domain guarantees are provided, for the + * following queues, which are likely to need stronger + * throughput guarantees: weight-raised queues, and queues + * with a higher weight than other queues. When such queues + * are active, sub-condition (i) is false, which triggers + * device idling. * * According to the above considerations, the next variable is * true (only) if sub-condition (i) holds. To compute the @@ -2582,7 +3388,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * the function bfq_symmetric_scenario(), but also check * whether bfqq is being weight-raised, because * bfq_symmetric_scenario() does not take into account also - * weight-raised queues (see comments to + * weight-raised queues (see comments on * bfq_weights_tree_add()). * * As a side note, it is worth considering that the above @@ -2604,17 +3410,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * bfqq. Such a case is when bfqq became active in a burst of * queue activations. Queues that became active during a large * burst benefit only from throughput, as discussed in the - * comments to bfq_handle_burst. Thus, if bfqq became active + * comments on bfq_handle_burst. Thus, if bfqq became active * in a burst and not idling the device maximizes throughput, * then the device must no be idled, because not idling the * device provides bfqq and all other queues in the burst with - * maximum benefit. Combining this and the two cases above, we - * can now establish when idling is actually needed to - * preserve service guarantees. + * maximum benefit. Combining this and the above case, we can + * now establish when idling is actually needed to preserve + * service guarantees. */ idling_needed_for_service_guarantees = - (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && - !bfq_bfqq_in_large_burst(bfqq); + asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); /* * We have now all the components we need to compute the return @@ -2624,6 +3429,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * 2) idling either boosts the throughput (without issues), or * is necessary to preserve service guarantees. */ + bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", + bfq_bfqq_sync(bfqq), idling_boosts_thr); + + bfq_log_bfqq(bfqd, bfqq, + "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", + bfqd->wr_busy_queues, + idling_boosts_thr_without_issues, + bfq_bfqq_IO_bound(bfqq), + idling_needed_for_service_guarantees); + return bfq_bfqq_sync(bfqq) && (idling_boosts_thr_without_issues || idling_needed_for_service_guarantees); @@ -2635,7 +3450,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) * 1) the queue must remain in service and cannot be expired, and * 2) the device must be idled to wait for the possible arrival of a new * request for the queue. - * See the comments to the function bfq_bfqq_may_idle for the reasons + * See the comments on the function bfq_bfqq_may_idle for the reasons * why performing device idling is the best choice to boost the throughput * and preserve service guarantees when bfq_bfqq_may_idle itself * returns true. 
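The 8-sector versus 1024-sector example above reduces to a couple of lines of arithmetic. The sketch below (plain user-space C, illustrative only) shows that strict request-by-request alternation equalizes IOPS between the two queues while leaving a 1024/8 = 128x gap in sector throughput, which is exactly the limitation that motivates idling for weight-raised or higher-weight queues.

#include <stdio.h>

int main(void)
{
	unsigned int rq_a = 8;		/* sectors per request, queue A */
	unsigned int rq_b = 1024;	/* sectors per request, queue B */
	unsigned int rounds = 1000;	/* A and B alternate, one request each */

	unsigned long sectors_a = (unsigned long)rq_a * rounds;
	unsigned long sectors_b = (unsigned long)rq_b * rounds;

	/* both queues dispatch the same number of requests per round */
	printf("requests served: A = %u, B = %u\n", rounds, rounds);
	printf("sectors served:  A = %lu, B = %lu\n", sectors_a, sectors_b);
	printf("throughput ratio B/A = %lu\n", sectors_b / sectors_a);
	return 0;
}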
@@ -2665,7 +3480,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); if (bfq_may_expire_for_budg_timeout(bfqq) && - !timer_pending(&bfqd->idle_slice_timer) && + !hrtimer_active(&bfqd->idle_slice_timer) && !bfq_bfqq_must_idle(bfqq)) goto expire; @@ -2685,7 +3500,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * not disable disk idling even when a new request * arrives. */ - if (timer_pending(&bfqd->idle_slice_timer)) { + if (bfq_bfqq_wait_request(bfqq)) { + BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); /* * If we get here: 1) at least a new request * has arrived but we have not disabled the @@ -2700,10 +3516,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * So we disable idling. */ bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif } goto keep_queue; } @@ -2714,7 +3528,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * for a new request, or has requests waiting for a completion and * may idle after their completion, then keep it anyway. */ - if (timer_pending(&bfqd->idle_slice_timer) || + if (hrtimer_active(&bfqd->idle_slice_timer) || (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { bfqq = NULL; goto keep_queue; @@ -2736,6 +3550,9 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) struct bfq_entity *entity = &bfqq->entity; if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ + BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_after_jiffies(bfqq->last_wr_start_finish)); + bfq_log_bfqq(bfqd, bfqq, "raising period dur %u/%u msec, old coeff %u, w %d(%d)", jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), @@ -2749,22 +3566,30 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); /* - * If the queue was activated in a burst, or - * too much time has elapsed from the beginning - * of this weight-raising period, or the queue has - * exceeded the acceptable number of cooperations, - * then end weight raising. + * If the queue was activated in a burst, or too much + * time has elapsed from the beginning of this + * weight-raising period, then end weight raising. 
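The check for a weight-raising period that has fully elapsed is implemented just below with time_is_before_jiffies(last_wr_start_finish + wr_cur_max_time). The stand-alone sketch that follows mimics the wrap-safe signed-subtraction comparison that the jiffies helpers are built on; the helper name and the numbers are illustrative, not taken from the kernel.

#include <stdio.h>
#include <stdbool.h>

typedef unsigned long fake_jiffies_t;

/*
 * True iff the period [start, start + dur] lies entirely in the past.
 * The signed subtraction keeps the test correct across counter wrap.
 */
static bool period_elapsed(fake_jiffies_t now, fake_jiffies_t start,
			   fake_jiffies_t dur)
{
	return (long)(now - (start + dur)) > 0;
}

int main(void)
{
	fake_jiffies_t start = (fake_jiffies_t)-50;	/* close to wrap-around */
	fake_jiffies_t dur = 100;

	printf("60 ticks in:  elapsed = %d\n",
	       period_elapsed(start + 60, start, dur));	/* 0: keep raising */
	printf("150 ticks in: elapsed = %d\n",
	       period_elapsed(start + 150, start, dur));	/* 1: end raising */
	return 0;
}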
*/ - if (bfq_bfqq_in_large_burst(bfqq) || - bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time)) { - bfqq->last_wr_start_finish = jiffies; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); + if (bfq_bfqq_in_large_burst(bfqq)) bfq_bfqq_end_wr(bfqq); + else if (time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || + time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) + bfq_bfqq_end_wr(bfqq); + else { + /* switch back to interactive wr */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = + bfqq->wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies( + bfqq->last_wr_start_finish)); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "back to interactive wr"); + } } } /* Update weight both if it must be raised and if it must be lowered */ @@ -2815,13 +3640,29 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, */ if (!bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); + BUG_ON(bfqq->entity.budget < bfqq->entity.service); goto expire; } + BUG_ON(bfqq->entity.budget < bfqq->entity.service); /* Finally, insert request into driver dispatch list. */ bfq_bfqq_served(bfqq, service_to_charge); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + bfq_dispatch_insert(bfqd->queue, rq); + /* + * If weight raising has to terminate for bfqq, then next + * function causes an immediate update of bfqq's weight, + * without waiting for next activation. As a consequence, on + * expiration, bfqq will be timestamped as if has never been + * weight-raised during this service slot, even if it has + * received part or even most of the service as a + * weight-raised queue. This inflates bfqq's timestamps, which + * is beneficial, as bfqq is then more willing to leave the + * device immediately to possible other weight-raised queues. + */ bfq_update_wr_data(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, @@ -2837,9 +3678,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, bfqd->in_service_bic = RQ_BIC(rq); } - if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && - dispatched >= bfqd->bfq_max_budget_async_rq) || - bfq_class_idle(bfqq))) + if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) goto expire; return dispatched; @@ -2885,8 +3724,8 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd) st = bfq_entity_service_tree(&bfqq->entity); dispatched += __bfq_forced_dispatch_bfqq(bfqq); - bfqq->max_budget = bfq_max_budget(bfqd); + bfqq->max_budget = bfq_max_budget(bfqd); bfq_forget_idle(st); } @@ -2899,37 +3738,37 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) { struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq; - int max_dispatch; bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + if (bfqd->busy_queues == 0) return 0; if (unlikely(force)) return bfq_forced_dispatch(bfqd); + /* + * Force device to serve one request at a time if + * strict_guarantees is true. Forcing this service scheme is + * currently the ONLY way to guarantee that the request + * service order enforced by the scheduler is respected by a + * queueing device. Otherwise the device is free even to make + * some unlucky request wait for as long as the device + * wishes. 
+ * + * Of course, serving one request at at time may cause loss of + * throughput. + */ + if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + return 0; + bfqq = bfq_select_queue(bfqd); if (!bfqq) return 0; - if (bfq_class_idle(bfqq)) - max_dispatch = 1; - - if (!bfq_bfqq_sync(bfqq)) - max_dispatch = bfqd->bfq_max_budget_async_rq; - - if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { - if (bfqd->busy_queues > 1) - return 0; - if (bfqq->dispatched >= 4 * max_dispatch) - return 0; - } - - if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) - return 0; + BUG_ON(bfqq->entity.budget < bfqq->entity.service); - bfq_clear_bfqq_wait_request(bfqq); - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + BUG_ON(bfq_bfqq_wait_request(bfqq)); if (!bfq_dispatch_request(bfqd, bfqq)) return 0; @@ -2937,6 +3776,8 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", bfq_bfqq_sync(bfqq) ? "sync" : "async"); + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); return 1; } @@ -2948,23 +3789,22 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) */ static void bfq_put_queue(struct bfq_queue *bfqq) { - struct bfq_data *bfqd = bfqq->bfqd; #ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_group *bfqg = bfqq_group(bfqq); #endif - BUG_ON(atomic_read(&bfqq->ref) <= 0); + BUG_ON(bfqq->ref <= 0); - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, - atomic_read(&bfqq->ref)); - if (!atomic_dec_and_test(&bfqq->ref)) + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); + bfqq->ref--; + if (bfqq->ref) return; BUG_ON(rb_first(&bfqq->sort_list)); BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); BUG_ON(bfqq->entity.tree); BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqd->in_service_queue == bfqq); + BUG_ON(bfqq->bfqd->in_service_queue == bfqq); if (bfq_bfqq_sync(bfqq)) /* @@ -2977,7 +3817,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) */ hlist_del_init(&bfqq->burst_list_node); - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); kmem_cache_free(bfq_pool, bfqq); #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -3011,8 +3851,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_schedule_dispatch(bfqd); } - bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, - atomic_read(&bfqq->ref)); + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); bfq_put_cooperator(bfqq); @@ -3021,28 +3860,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) static void bfq_init_icq(struct io_cq *icq) { - struct bfq_io_cq *bic = icq_to_bic(icq); - - bic->ttime.last_end_request = jiffies; - /* - * A newly created bic indicates that the process has just - * started doing I/O, and is probably mapping into memory its - * executable and libraries: it definitely needs weight raising. - * There is however the possibility that the process performs, - * for a while, I/O close to some other process. EQM intercepts - * this behavior and may merge the queue corresponding to the - * process with some other queue, BEFORE the weight of the queue - * is raised. Merged queues are not weight-raised (they are assumed - * to belong to processes that benefit only from high throughput). - * If the merge is basically the consequence of an accident, then - * the queue will be split soon and will get back its old weight. 
- * It is then important to write down somewhere that this queue - * does need weight raising, even if it did not make it to get its - * weight raised before being merged. To this purpose, we overload - * the field raising_time_left and assign 1 to it, to mark the queue - * as needing weight raising. - */ - bic->wr_time_left = 1; + icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); } static void bfq_exit_icq(struct io_cq *icq) @@ -3050,21 +3868,21 @@ static void bfq_exit_icq(struct io_cq *icq) struct bfq_io_cq *bic = icq_to_bic(icq); struct bfq_data *bfqd = bic_to_bfqd(bic); - if (bic->bfqq[BLK_RW_ASYNC]) { - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); - bic->bfqq[BLK_RW_ASYNC] = NULL; + if (bic_to_bfqq(bic, false)) { + bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); + bic_set_bfqq(bic, NULL, false); } - if (bic->bfqq[BLK_RW_SYNC]) { + if (bic_to_bfqq(bic, true)) { /* * If the bic is using a shared queue, put the reference * taken on the io_context when the bic started using a * shared bfq_queue. */ - if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) + if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) put_io_context(icq->ioc); - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); - bic->bfqq[BLK_RW_SYNC] = NULL; + bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); + bic_set_bfqq(bic, NULL, true); } } @@ -3072,8 +3890,8 @@ static void bfq_exit_icq(struct io_cq *icq) * Update the entity prio values; note that the new values will not * be used until the next (re)activation. */ -static void -bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, + struct bfq_io_cq *bic) { struct task_struct *tsk = current; int ioprio_class; @@ -3105,7 +3923,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) break; } - if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio >= IOPRIO_BE_NR) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); BUG(); @@ -3113,45 +3931,40 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "set_next_ioprio_data: bic_class %d prio %d class %d", + ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); } static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) { - struct bfq_data *bfqd; - struct bfq_queue *bfqq, *new_bfqq; + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_queue *bfqq; unsigned long uninitialized_var(flags); int ioprio = bic->icq.ioc->ioprio; - bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), - &flags); /* * This condition may trigger on a newly created bic, be sure to * drop the lock before returning. 
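bfq_set_next_ioprio_data() above validates the new ioprio level and then derives the entity weight from it through bfq_ioprio_to_weight(). The sketch below shows a mapping of that general shape, with a lower level (higher priority) giving a proportionally larger weight; the conversion factor of 10 is an assumption made for illustration, not the constant actually used by the patch.

#include <stdio.h>

#define IOPRIO_BE_NR	8	/* ioprio levels 0..7 within a class */

/* illustrative mapping: weight decreases linearly with the ioprio level */
static unsigned int ioprio_to_weight_sketch(int ioprio)
{
	/* the patch BUG()s on levels >= IOPRIO_BE_NR before getting here */
	return (IOPRIO_BE_NR - ioprio) * 10;
}

int main(void)
{
	int prio;

	for (prio = 0; prio < IOPRIO_BE_NR; prio++)
		printf("ioprio %d -> weight %u\n",
		       prio, ioprio_to_weight_sketch(prio));
	return 0;
}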
*/ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) - goto out; + return; bic->ioprio = ioprio; - bfqq = bic->bfqq[BLK_RW_ASYNC]; + bfqq = bic_to_bfqq(bic, false); if (bfqq) { - new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, - GFP_ATOMIC); - if (new_bfqq) { - bic->bfqq[BLK_RW_ASYNC] = new_bfqq; - bfq_log_bfqq(bfqd, bfqq, - "check_ioprio_change: bfqq %p %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); + bic_set_bfqq(bic, bfqq, false); + bfq_log_bfqq(bfqd, bfqq, + "check_ioprio_change: bfqq %p %d", + bfqq, bfqq->ref); } - bfqq = bic->bfqq[BLK_RW_SYNC]; + bfqq = bic_to_bfqq(bic, true); if (bfqq) bfq_set_next_ioprio_data(bfqq, bic); - -out: - bfq_put_bfqd_unlock(bfqd, &flags); } static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -3160,8 +3973,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - atomic_set(&bfqq->ref, 0); + bfqq->ref = 0; bfqq->bfqd = bfqd; if (bic) @@ -3171,6 +3985,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfq_class_idle(bfqq)) bfq_mark_bfqq_idle_window(bfqq); bfq_mark_bfqq_sync(bfqq); + bfq_mark_bfqq_just_created(bfqq); } else bfq_clear_bfqq_sync(bfqq); bfq_mark_bfqq_IO_bound(bfqq); @@ -3180,72 +3995,19 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->pid = pid; bfqq->wr_coeff = 1; - bfqq->last_wr_start_finish = 0; + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); + bfqq->budget_timeout = bfq_smallest_from_now(); + bfqq->split_time = bfq_smallest_from_now(); + /* * Set to the value for which bfqq will not be deemed as * soft rt when it becomes backlogged. */ - bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); -} - -static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, - gfp_t gfp_mask) -{ - struct bfq_group *bfqg; - struct bfq_queue *bfqq, *new_bfqq = NULL; - struct blkcg *blkcg; - -retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - bfqg = bfq_find_alloc_group(bfqd, blkcg); - /* bic always exists here */ - bfqq = bic_to_bfqq(bic, is_sync); - - /* - * Always try a new alloc if we fall back to the OOM bfqq - * originally, since it should just be a temporary situation. 
- */ - if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = NULL; - if (new_bfqq) { - bfqq = new_bfqq; - new_bfqq = NULL; - } else if (gfpflags_allow_blocking(gfp_mask)) { - rcu_read_unlock(); - spin_unlock_irq(bfqd->queue->queue_lock); - new_bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - spin_lock_irq(bfqd->queue->queue_lock); - if (new_bfqq) - goto retry; - } else { - bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - } - - if (bfqq) { - bfq_init_bfqq(bfqd, bfqq, bic, current->pid, - is_sync); - bfq_init_entity(&bfqq->entity, bfqg); - bfq_log_bfqq(bfqd, bfqq, "allocated"); - } else { - bfqq = &bfqd->oom_bfqq; - bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); - } - } - - if (new_bfqq) - kmem_cache_free(bfq_pool, new_bfqq); - - rcu_read_unlock(); + bfqq->soft_rt_next_start = bfq_greatest_from_now(); - return bfqq; + /* first request is almost certainly seeky */ + bfqq->seek_history = 1; } static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, @@ -3268,90 +4030,84 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, } static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, gfp_t gfp_mask) + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic) { const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); struct bfq_queue **async_bfqq = NULL; - struct bfq_queue *bfqq = NULL; + struct bfq_queue *bfqq; + struct bfq_group *bfqg; - if (!is_sync) { - struct blkcg *blkcg; - struct bfq_group *bfqg; + rcu_read_lock(); + + bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); + if (!bfqg) { + bfqq = &bfqd->oom_bfqq; + goto out; + } - rcu_read_lock(); - blkcg = bio_blkcg(bio); - rcu_read_unlock(); - bfqg = bfq_find_alloc_group(bfqd, blkcg); + if (!is_sync) { async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); bfqq = *async_bfqq; + if (bfqq) + goto out; } - if (!bfqq) - bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); + bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO, + bfqd->queue->node); + + if (bfqq) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + goto out; + } /* * Pin the queue now that it's allocated, scheduler exit will * prune it. 
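The allocation policy that bfq_get_queue() follows above, namely try a non-blocking GFP_NOWAIT allocation and fall back to the statically embedded oom_bfqq so that the caller never sees a NULL queue, can be captured by the self-contained user-space analogue below. All names are illustrative.

#include <stdio.h>
#include <stdlib.h>

struct toy_queue {
	int ref;
	int is_oom;
};

/* pre-allocated fallback object, analogous to bfqd->oom_bfqq */
static struct toy_queue oom_queue = { .is_oom = 1 };

static struct toy_queue *get_queue(int simulate_alloc_failure)
{
	struct toy_queue *q = NULL;

	if (!simulate_alloc_failure)
		q = calloc(1, sizeof(*q));	/* GFP_NOWAIT analogue */
	if (!q)
		q = &oom_queue;			/* never fail the caller */
	q->ref++;				/* caller's reference */
	return q;
}

int main(void)
{
	struct toy_queue *a = get_queue(0);
	struct toy_queue *b = get_queue(1);

	printf("a: oom=%d ref=%d\n", a->is_oom, a->ref);
	printf("b: oom=%d ref=%d\n", b->is_oom, b->ref);
	if (!a->is_oom)
		free(a);
	return 0;
}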
*/ - if (!is_sync && !(*async_bfqq)) { - atomic_inc(&bfqq->ref); + if (async_bfqq) { + bfqq->ref++; bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); *async_bfqq = bfqq; } - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, - atomic_read(&bfqq->ref)); +out: + bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); + rcu_read_unlock(); return bfqq; } static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_io_cq *bic) { - unsigned long elapsed = jiffies - bic->ttime.last_end_request; - unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); + struct bfq_ttime *ttime = &bic->ttime; + u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; + + elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); - bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; - bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; - bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / - bic->ttime.ttime_samples; + ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; + ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); + ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, + ttime->ttime_samples); } -static void bfq_update_io_seektime(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *rq) +static void +bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) { - sector_t sdist; - u64 total; - - if (bfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - bfqq->last_request_pos; - else - sdist = bfqq->last_request_pos - blk_rq_pos(rq); - - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc. - */ - if (bfqq->seek_samples == 0) /* first request, not really a seek */ - sdist = 0; - else if (bfqq->seek_samples <= 60) /* second & third seek */ - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); - else - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); - - bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; - bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; - total = bfqq->seek_total + (bfqq->seek_samples/2); - do_div(total, bfqq->seek_samples); - bfqq->seek_mean = (sector_t)total; - - bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, - (u64)bfqq->seek_mean); + bfqq->seek_history <<= 1; + bfqq->seek_history |= + get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR; } /* @@ -3369,7 +4125,8 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, return; /* Idle window just restored, statistics are meaningless. 
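The two per-queue statistics updated just above, the exponentially weighted think time of bfq_update_io_thinktime() and the one-bit-per-request seek_history of bfq_update_io_seektime(), are easy to exercise in isolation. The sketch below applies the same 7/8 low-pass filter shown in the hunk; SEEK_THR is a placeholder threshold, not the value of BFQQ_SEEK_THR.

#include <stdio.h>
#include <stdint.h>

#define SEEK_THR	8192ULL		/* sectors; illustrative threshold */

struct toy_ttime {
	uint64_t samples;
	uint64_t total;
	uint64_t mean;
};

static void update_thinktime(struct toy_ttime *t, uint64_t elapsed_ns)
{
	/* same filter as in the patched bfq_update_io_thinktime() */
	t->samples = (7 * t->samples + 256) / 8;
	t->total   = (7 * t->total + 256 * elapsed_ns) / 8;
	t->mean    = (t->total + 128) / t->samples;
}

static void update_seek_history(uint32_t *history, uint64_t last_pos,
				uint64_t new_pos)
{
	uint64_t sdist = new_pos > last_pos ? new_pos - last_pos
					    : last_pos - new_pos;

	*history <<= 1;
	*history |= sdist > SEEK_THR;	/* 1 = this request was a seek */
}

int main(void)
{
	struct toy_ttime t = { 0 };
	uint32_t history = 1;	/* first request treated as seeky, as above */

	update_thinktime(&t, 2000000);		/* 2 ms of think time */
	update_seek_history(&history, 1000, 1000000);

	printf("mean think time %llu ns, seek_history 0x%x\n",
	       (unsigned long long)t.mean, (unsigned)history);
	return 0;
}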
*/ - if (bfq_bfqq_just_split(bfqq)) + if (time_is_after_eq_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) return; enable_idle = bfq_bfqq_idle_window(bfqq); @@ -3409,22 +4166,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_update_io_thinktime(bfqd, bic); bfq_update_io_seektime(bfqd, bfqq, rq); - if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { - bfq_clear_bfqq_constantly_seeky(bfqq); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || !BFQQ_SEEKY(bfqq)) bfq_update_idle_window(bfqd, bfqq, bic); - bfq_clear_bfqq_just_split(bfqq); bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", - bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), - (unsigned long long) bfqq->seek_mean); + "rq_enqueued: idle_window=%d (seeky %d)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); @@ -3438,14 +4186,15 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, * is small and the queue is not to be expired, then * just exit. * - * In this way, if the disk is being idled to wait for - * a new request from the in-service queue, we avoid - * unplugging the device and committing the disk to serve - * just a small request. On the contrary, we wait for - * the block layer to decide when to unplug the device: - * hopefully, new requests will be merged to this one - * quickly, then the device will be unplugged and - * larger requests will be dispatched. + * In this way, if the device is being idled to wait + * for a new request from the in-service queue, we + * avoid unplugging the device and committing the + * device to serve just a small request. On the + * contrary, we wait for the block layer to decide + * when to unplug the device: hopefully, new requests + * will be merged to this one quickly, then the device + * will be unplugged and larger requests will be + * dispatched. */ if (small_req && !budget_timeout) return; @@ -3457,10 +4206,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, * timer. */ bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); -#ifdef CONFIG_BFQ_GROUP_IOSCHED + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); bfqg_stats_update_idle_time(bfqq_group(bfqq)); -#endif /* * The queue is not empty, because a new request just @@ -3504,28 +4251,21 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) */ new_bfqq->allocated[rq_data_dir(rq)]++; bfqq->allocated[rq_data_dir(rq)]--; - atomic_inc(&new_bfqq->ref); + new_bfqq->ref++; + bfq_clear_bfqq_just_created(bfqq); bfq_put_queue(bfqq); if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); rq->elv.priv[1] = new_bfqq; bfqq = new_bfqq; - } else - bfq_bfqq_increase_failed_cooperations(bfqq); + } } bfq_add_request(rq); - /* - * Here a newly-created bfq_queue has already started a weight-raising - * period: clear raising_time_left to prevent bfq_bfqq_save_state() - * from assigning it a full weight-raising period. See the detailed - * comments about this field in bfq_init_icq(). 
- */ - if (bfqq->bic) - bfqq->bic->wr_time_left = 0; - rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + rq->fifo_time = ktime_get_ns() + + jiffies_to_nsecs(bfqd->bfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &bfqq->fifo); bfq_rq_enqueued(bfqd, bfqq, rq); @@ -3533,8 +4273,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) static void bfq_update_hw_tag(struct bfq_data *bfqd) { - bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, - bfqd->rq_in_driver); + bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, + bfqd->rq_in_driver); if (bfqd->hw_tag == 1) return; @@ -3560,48 +4300,85 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd = bfqq->bfqd; - bool sync = bfq_bfqq_sync(bfqq); + u64 now_ns; + u32 delta_us; - bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", - blk_rq_sectors(rq), sync); + bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", + blk_rq_sectors(rq)); + assert_spin_locked(bfqd->queue->queue_lock); bfq_update_hw_tag(bfqd); BUG_ON(!bfqd->rq_in_driver); BUG_ON(!bfqq->dispatched); bfqd->rq_in_driver--; bfqq->dispatched--; -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_completion(bfqq_group(bfqq), rq_start_time_ns(rq), - rq_io_start_time_ns(rq), rq->cmd_flags); -#endif + rq_io_start_time_ns(rq), req_op(rq), + rq->cmd_flags); if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + /* + * Set budget_timeout (which we overload to store the + * time at which the queue remains with no backlog and + * no outstanding request; used by the weight-raising + * mechanism). + */ + bfqq->budget_timeout = jiffies; + bfq_weights_tree_remove(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->busy_in_flight_queues); - bfqd->busy_in_flight_queues--; - if (bfq_bfqq_constantly_seeky(bfqq)) { - BUG_ON(!bfqd-> - const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } } - if (sync) { - bfqd->sync_flight--; - RQ_BIC(rq)->ttime.last_end_request = jiffies; - } + now_ns = ktime_get_ns(); + + RQ_BIC(rq)->ttime.last_end_request = now_ns; + + /* + * Using us instead of ns, to get a reasonable precision in + * computing rate in next check. + */ + delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); + + bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", + delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, + (USEC_PER_SEC* + (u64)((bfqd->last_rq_max_size<>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); + + /* + * If the request took rather long to complete, and, according + * to the maximum request size recorded, this completion latency + * implies that the request was certainly served at a very low + * rate (less than 1M sectors/sec), then the whole observation + * interval that lasts up to this time instant cannot be a + * valid time interval for computing a new peak rate. 
Invoke + * bfq_update_rate_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - reset to zero samples, which will trigger a proper + * re-initialization of the observation interval on next + * dispatch + */ + if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && + (bfqd->last_rq_max_size<last_completion = now_ns; /* - * If we are waiting to discover whether the request pattern of the - * task associated with the queue is actually isochronous, and - * both requisites for this condition to hold are satisfied, then - * compute soft_rt_next_start (see the comments to the function - * bfq_bfqq_softrt_next_start()). + * If we are waiting to discover whether the request pattern + * of the task associated with the queue is actually + * isochronous, and both requisites for this condition to hold + * are now satisfied, then compute soft_rt_next_start (see the + * comments on the function bfq_bfqq_softrt_next_start()). We + * schedule this delayed check when bfqq expires, if it still + * has in-flight requests. */ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && RB_EMPTY_ROOT(&bfqq->sort_list)) @@ -3613,10 +4390,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) * or if we want to idle in case it has no pending requests. */ if (bfqd->in_service_queue == bfqq) { - if (bfq_bfqq_budget_new(bfqq)) - bfq_set_budget_timeout(bfqd); - - if (bfq_bfqq_must_idle(bfqq)) { + if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { bfq_arm_slice_timer(bfqd); goto out; } else if (bfq_may_expire_for_budg_timeout(bfqq)) @@ -3646,7 +4420,7 @@ static int __bfq_may_queue(struct bfq_queue *bfqq) return ELV_MQUEUE_MAY; } -static int bfq_may_queue(struct request_queue *q, int rw) +static int bfq_may_queue(struct request_queue *q, int op, int op_flags) { struct bfq_data *bfqd = q->elevator->elevator_data; struct task_struct *tsk = current; @@ -3663,7 +4437,7 @@ static int bfq_may_queue(struct request_queue *q, int rw) if (!bic) return ELV_MQUEUE_MAY; - bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); + bfqq = bic_to_bfqq(bic, rw_is_sync(op, op_flags)); if (bfqq) return __bfq_may_queue(bfqq); @@ -3687,14 +4461,14 @@ static void bfq_put_request(struct request *rq) rq->elv.priv[1] = NULL; bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); bfq_put_queue(bfqq); } } /* * Returns NULL if a new bfqq should be allocated, or the old bfqq if this - * was the last process referring to said bfqq. + * was the last process referring to that bfqq. 
*/ static struct bfq_queue * bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) @@ -3732,11 +4506,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, unsigned long flags; bool split = false; - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); - - bfq_check_ioprio_change(bic, bio); - spin_lock_irqsave(q->queue_lock, flags); + bfq_check_ioprio_change(bic, bio); if (!bic) goto queue_fail; @@ -3746,23 +4517,47 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, new_queue: bfqq = bic_to_bfqq(bic, is_sync); if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); + if (bfqq) + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { + bfq_log_bfqq(bfqd, bfqq, + "set_request: was_in_list %d " + "was_in_large_burst %d " + "large burst in progress %d", + bic->was_in_burst_list, + bic->saved_in_large_burst, + bfqd->large_burst); + if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) + bic->saved_in_large_burst) { + bfq_log_bfqq(bfqd, bfqq, + "set_request: marking in " + "large burst"); bfq_mark_bfqq_in_large_burst(bfqq); - else { + } else { + bfq_log_bfqq(bfqd, bfqq, + "set_request: clearing in " + "large burst"); bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } + bfqq->split_time = jiffies; } } else { /* If the queue was seeky for too long, break it apart. */ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + + /* Update bic before losing reference to bfqq */ + if (bfq_bfqq_in_large_burst(bfqq)) + bic->saved_in_large_burst = true; + bfqq = bfq_split_bfqq(bic, bfqq); split = true; if (!bfqq) @@ -3771,9 +4566,8 @@ new_queue: } bfqq->allocated[rw]++; - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, - atomic_read(&bfqq->ref)); + bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); rq->elv.priv[0] = bic; rq->elv.priv[1] = bfqq; @@ -3788,7 +4582,6 @@ new_queue: if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { bfqq->bic = bic; if (split) { - bfq_mark_bfqq_just_split(bfqq); /* * If the queue has just been split from a shared * queue, restore the idle window and the possible @@ -3798,6 +4591,9 @@ new_queue: } } + if (unlikely(bfq_bfqq_just_created(bfqq))) + bfq_handle_burst(bfqd, bfqq); + spin_unlock_irqrestore(q->queue_lock, flags); return 0; @@ -3824,9 +4620,10 @@ static void bfq_kick_queue(struct work_struct *work) * Handler of the expiration of the timer running if the in-service queue * is idling inside its time slice. 
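The handler declared just below is part of this patch's conversion of the idle-slice timer from a classic timer_list to an hrtimer. As a reference for that pattern, here is a minimal kernel-module sketch (monotonic clock, relative arming, one-shot handler returning HRTIMER_NORESTART); it is illustrative and not part of the patch, and the 8 ms value only mirrors a typical slice_idle.

#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer toy_idle_timer;

static enum hrtimer_restart toy_idle_timer_fn(struct hrtimer *timer)
{
	pr_info("idle slice expired\n");
	return HRTIMER_NORESTART;	/* one-shot; re-armed explicitly */
}

static int __init toy_idle_init(void)
{
	hrtimer_init(&toy_idle_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	toy_idle_timer.function = toy_idle_timer_fn;
	hrtimer_start(&toy_idle_timer, ms_to_ktime(8), HRTIMER_MODE_REL);
	return 0;
}

static void __exit toy_idle_exit(void)
{
	hrtimer_cancel(&toy_idle_timer);	/* waits for a running handler */
}

module_init(toy_idle_init);
module_exit(toy_idle_exit);
MODULE_LICENSE("GPL");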
*/ -static void bfq_idle_slice_timer(unsigned long data) +static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) { - struct bfq_data *bfqd = (struct bfq_data *)data; + struct bfq_data *bfqd = container_of(timer, struct bfq_data, + idle_slice_timer); struct bfq_queue *bfqq; unsigned long flags; enum bfqq_expiration reason; @@ -3844,6 +4641,8 @@ static void bfq_idle_slice_timer(unsigned long data) */ if (bfqq) { bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); + bfq_clear_bfqq_wait_request(bfqq); + if (bfq_bfqq_budget_timeout(bfqq)) /* * Also here the queue can be safely expired @@ -3869,14 +4668,16 @@ schedule_dispatch: bfq_schedule_dispatch(bfqd); spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); + return HRTIMER_NORESTART; } static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) { - del_timer_sync(&bfqd->idle_slice_timer); + hrtimer_cancel(&bfqd->idle_slice_timer); cancel_work_sync(&bfqd->unplug_work); } +#ifdef CONFIG_BFQ_GROUP_IOSCHED static void __bfq_put_async_bfqq(struct bfq_data *bfqd, struct bfq_queue **bfqq_ptr) { @@ -3885,9 +4686,9 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, bfq_log(bfqd, "put_async_bfqq: %p", bfqq); if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); + bfq_bfqq_move(bfqd, bfqq, root_group); bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); bfq_put_queue(bfqq); *bfqq_ptr = NULL; } @@ -3909,6 +4710,7 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); } +#endif static void bfq_exit_queue(struct elevator_queue *e) { @@ -3928,9 +4730,7 @@ static void bfq_exit_queue(struct elevator_queue *e) bfq_shutdown_timer_wq(bfqd); - synchronize_rcu(); - - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); #ifdef CONFIG_BFQ_GROUP_IOSCHED blkcg_deactivate_policy(q, &blkcg_policy_bfq); @@ -3978,11 +4778,14 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * will not attempt to free it. */ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); - atomic_inc(&bfqd->oom_bfqq.ref); + bfqd->oom_bfqq.ref++; bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; bfqd->oom_bfqq.entity.new_weight = bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); + + /* oom_bfqq does not participate to bursts */ + bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); /* * Trigger weight initialization, according to ioprio, at the * oom_bfqq's first activation. 
The oom_bfqq's ioprio and ioprio @@ -4001,13 +4804,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqd->active_numerous_groups = 0; -#endif - init_timer(&bfqd->idle_slice_timer); + hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); bfqd->idle_slice_timer.function = bfq_idle_slice_timer; - bfqd->idle_slice_timer.data = (unsigned long)bfqd; bfqd->queue_weights_tree = RB_ROOT; bfqd->group_weights_tree = RB_ROOT; @@ -4028,20 +4828,19 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_back_penalty = bfq_back_penalty; bfqd->bfq_slice_idle = bfq_slice_idle; bfqd->bfq_class_idle_last_service = 0; - bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; - bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; - bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + bfqd->bfq_timeout = bfq_timeout; - bfqd->bfq_coop_thresh = 2; - bfqd->bfq_failed_cooperations = 7000; bfqd->bfq_requests_within_timer = 120; - bfqd->bfq_large_burst_thresh = 11; - bfqd->bfq_burst_interval = msecs_to_jiffies(500); + bfqd->bfq_large_burst_thresh = 8; + bfqd->bfq_burst_interval = msecs_to_jiffies(180); bfqd->low_latency = true; - bfqd->bfq_wr_coeff = 20; + /* + * Trade-off between responsiveness and fairness. + */ + bfqd->bfq_wr_coeff = 30; bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); bfqd->bfq_wr_max_time = 0; bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); @@ -4053,16 +4852,15 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * video. */ bfqd->wr_busy_queues = 0; - bfqd->busy_in_flight_queues = 0; - bfqd->const_seeky_busy_in_flight_queues = 0; /* - * Begin by assuming, optimistically, that the device peak rate is - * equal to the highest reference rate. + * Begin by assuming, optimistically, that the device is a + * high-speed one, and that its peak rate is equal to 2/3 of + * the highest reference rate. 
*/ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * T_fast[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; bfqd->device_speed = BFQ_BFQD_FAST; return 0; @@ -4088,7 +4886,7 @@ static int __init bfq_slab_setup(void) static ssize_t bfq_var_show(unsigned int var, char *page) { - return sprintf(page, "%d\n", var); + return sprintf(page, "%u\n", var); } static ssize_t bfq_var_store(unsigned long *var, const char *page, @@ -4159,21 +4957,21 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct bfq_data *bfqd = e->elevator_data; \ - unsigned int __data = __VAR; \ - if (__CONV) \ + u64 __data = __VAR; \ + if (__CONV == 1) \ __data = jiffies_to_msecs(__data); \ + else if (__CONV == 2) \ + __data = div_u64(__data, NSEC_PER_MSEC); \ return bfq_var_show(__data, (page)); \ } -SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); -SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -SHOW_FUNCTION(bfq_max_budget_async_rq_show, - bfqd->bfq_max_budget_async_rq, 0); -SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); -SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); +SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); @@ -4183,6 +4981,17 @@ SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); #undef SHOW_FUNCTION +#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + u64 __data = __VAR; \ + __data = div_u64(__data, NSEC_PER_USEC); \ + return bfq_var_show(__data, (page)); \ +} +USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); +#undef USEC_SHOW_FUNCTION + #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t \ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ @@ -4194,24 +5003,22 @@ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ - if (__CONV) \ + if (__CONV == 1) \ *(__PTR) = msecs_to_jiffies(__data); \ + else if (__CONV == 2) \ + *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ else \ *(__PTR) = __data; \ return ret; \ } STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, - INT_MAX, 1); + INT_MAX, 2); STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, - INT_MAX, 1); + INT_MAX, 2); STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, 
INT_MAX, 0); -STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, - 1, INT_MAX, 0); -STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, - INT_MAX, 1); +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, @@ -4224,6 +5031,23 @@ STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, INT_MAX, 0); #undef STORE_FUNCTION +#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long __data; \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + *(__PTR) = (u64)__data * NSEC_PER_USEC; \ + return ret; \ +} +USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, + UINT_MAX); +#undef USEC_STORE_FUNCTION + /* do nothing for the moment */ static ssize_t bfq_weights_store(struct elevator_queue *e, const char *page, size_t count) @@ -4231,16 +5055,6 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, return count; } -static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) -{ - u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); - - if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) - return bfq_calc_max_budget(bfqd->peak_rate, timeout); - else - return bfq_default_max_budget; -} - static ssize_t bfq_max_budget_store(struct elevator_queue *e, const char *page, size_t count) { @@ -4249,7 +5063,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, int ret = bfq_var_store(&__data, (page), count); if (__data == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); else { if (__data > INT_MAX) __data = INT_MAX; @@ -4261,6 +5075,10 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, return ret; } +/* + * Leaving this name to preserve name compatibility with cfq + * parameters, but this timeout is used for both sync and async. 
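A note on the unit conventions introduced above: bfq_slice_idle and bfq_fifo_expire[] are now kept internally in nanoseconds, so the __CONV == 2 paths of the show/store macros translate to and from the millisecond values exposed in sysfs, while the new *_us attributes expose microseconds. The fragment below is only an illustrative sketch of these conversions, with locally defined stand-ins for the kernel's NSEC_PER_MSEC/NSEC_PER_USEC constants; the 8 ms value simply mirrors the floor that the strict_guarantees store further down enforces on slice_idle.

#include <stdint.h>
#include <stdio.h>

/* stand-ins for the kernel's time constants */
#define NSEC_PER_MSEC 1000000ULL
#define NSEC_PER_USEC 1000ULL

static uint64_t show_ms(uint64_t ns)  { return ns / NSEC_PER_MSEC; } /* __CONV == 2, show */
static uint64_t store_ms(uint64_t ms) { return ms * NSEC_PER_MSEC; } /* __CONV == 2, store */
static uint64_t show_us(uint64_t ns)  { return ns / NSEC_PER_USEC; } /* *_us show */
static uint64_t store_us(uint64_t us) { return us * NSEC_PER_USEC; } /* *_us store */

int main(void)
{
        uint64_t slice_idle = store_ms(8);      /* 8 ms kept internally as ns */

        printf("%llu ns -> %llu ms, %llu us\n",
               (unsigned long long)slice_idle,
               (unsigned long long)show_ms(slice_idle),
               (unsigned long long)show_us(slice_idle));
        printf("writing 8000 to slice_idle_us stores %llu ns\n",
               (unsigned long long)store_us(8000));
        return 0;
}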
+ */ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, const char *page, size_t count) { @@ -4273,9 +5091,27 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, else if (__data > INT_MAX) __data = INT_MAX; - bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); + bfqd->bfq_timeout = msecs_to_jiffies(__data); if (bfqd->bfq_user_max_budget == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); + + return ret; +} + +static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (!bfqd->strict_guarantees && __data == 1 + && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) + bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; + + bfqd->strict_guarantees = __data; return ret; } @@ -4305,10 +5141,10 @@ static struct elv_fs_entry bfq_attrs[] = { BFQ_ATTR(back_seek_max), BFQ_ATTR(back_seek_penalty), BFQ_ATTR(slice_idle), + BFQ_ATTR(slice_idle_us), BFQ_ATTR(max_budget), - BFQ_ATTR(max_budget_async_rq), BFQ_ATTR(timeout_sync), - BFQ_ATTR(timeout_async), + BFQ_ATTR(strict_guarantees), BFQ_ATTR(low_latency), BFQ_ATTR(wr_coeff), BFQ_ATTR(wr_max_time), @@ -4328,7 +5164,8 @@ static struct elevator_type iosched_bfq = { #ifdef CONFIG_BFQ_GROUP_IOSCHED .elevator_bio_merged_fn = bfq_bio_merged, #endif - .elevator_allow_merge_fn = bfq_allow_merge, + .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, + .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, .elevator_dispatch_fn = bfq_dispatch_requests, .elevator_add_req_fn = bfq_insert_request, .elevator_activate_req_fn = bfq_activate_request, @@ -4351,18 +5188,28 @@ static struct elevator_type iosched_bfq = { .elevator_owner = THIS_MODULE, }; +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfq_blkg_files, + .legacy_cftypes = bfq_blkcg_legacy_files, + + .cpd_alloc_fn = bfq_cpd_alloc, + .cpd_init_fn = bfq_cpd_init, + .cpd_bind_fn = bfq_cpd_init, + .cpd_free_fn = bfq_cpd_free, + + .pd_alloc_fn = bfq_pd_alloc, + .pd_init_fn = bfq_pd_init, + .pd_offline_fn = bfq_pd_offline, + .pd_free_fn = bfq_pd_free, + .pd_reset_stats_fn = bfq_pd_reset_stats, +}; +#endif + static int __init bfq_init(void) { int ret; - - /* - * Can be 0 on HZ < 1000 setups. - */ - if (bfq_slice_idle == 0) - bfq_slice_idle = 1; - - if (bfq_timeout_async == 0) - bfq_timeout_async = 1; + char msg[50] = "BFQ I/O-scheduler: v8r4"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); @@ -4375,27 +5222,46 @@ static int __init bfq_init(void) goto err_pol_unreg; /* - * Times to load large popular applications for the typical systems - * installed on the reference devices (see the comments before the - * definitions of the two arrays). + * Times to load large popular applications for the typical + * systems installed on the reference devices (see the + * comments before the definitions of the next two + * arrays). Actually, we use slightly slower values, as the + * estimated peak rate tends to be smaller than the actual + * peak rate. The reason for this last fact is that estimates + * are computed over much shorter time intervals than the long + * intervals typically used for benchmarking. Why? First, to + * adapt more quickly to variations. 
Second, because an I/O + * scheduler cannot rely on a peak-rate-evaluation workload to + * be run for a long time. */ - T_slow[0] = msecs_to_jiffies(2600); - T_slow[1] = msecs_to_jiffies(1000); - T_fast[0] = msecs_to_jiffies(5500); - T_fast[1] = msecs_to_jiffies(2000); + T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ + T_slow[1] = msecs_to_jiffies(1000); /* actually 1.5 sec */ + T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ + T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ /* - * Thresholds that determine the switch between speed classes (see - * the comments before the definition of the array). + * Thresholds that determine the switch between speed classes + * (see the comments before the definition of the array + * device_speed_thresh). These thresholds are biased towards + * transitions to the fast class. This is safer than the + * opposite bias. In fact, a wrong transition to the slow + * class results in short weight-raising periods, because the + * speed of the device then tends to be higher that the + * reference peak rate. On the opposite end, a wrong + * transition to the fast class tends to increase + * weight-raising periods, because of the opposite reason. */ - device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; - device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; + device_speed_thresh[0] = (4 * R_slow[0]) / 3; + device_speed_thresh[1] = (4 * R_slow[1]) / 3; ret = elv_register(&iosched_bfq); if (ret) goto err_pol_unreg; - pr_info("BFQ I/O-scheduler: v7r11"); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + strcat(msg, " (with cgroups support)"); +#endif + pr_info("%s", msg); return 0; diff --git a/block/bfq-sched.c b/block/bfq-sched.c index a5ed694..45d63d3 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -7,9 +7,13 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2010 Paolo Valente + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2016 Paolo Valente */ +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + #ifdef CONFIG_BFQ_GROUP_IOSCHED #define for_each_entity(entity) \ for (; entity ; entity = entity->parent) @@ -22,8 +26,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, int extract, struct bfq_data *bfqd); -static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static void bfq_update_budget(struct bfq_entity *next_in_service) { struct bfq_entity *bfqg_entity; @@ -48,6 +50,7 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) static int bfq_update_next_in_service(struct bfq_sched_data *sd) { struct bfq_entity *next_in_service; + struct bfq_queue *bfqq; if (sd->in_service_entity) /* will update/requeue at the end of service */ @@ -65,14 +68,29 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd) if (next_in_service) bfq_update_budget(next_in_service); + else + goto exit; + + bfqq = bfq_entity_to_bfqq(next_in_service); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_next_in_service: chosen this queue"); + else { + struct bfq_group *bfqg = + container_of(next_in_service, + struct bfq_group, entity); + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_next_in_service: chosen this entity"); + } +exit: return 1; } static void bfq_check_next_in_service(struct bfq_sched_data *sd, struct bfq_entity *entity) { - BUG_ON(sd->next_in_service != entity); + WARN_ON(sd->next_in_service != entity); } #else #define for_each_entity(entity) \ @@ -151,20 +169,36 @@ static u64 bfq_delta(unsigned long service, unsigned long weight) static 
void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned long long start, finish, delta; BUG_ON(entity->weight == 0); entity->finish = entity->start + bfq_delta(service, entity->weight); + start = ((entity->start>>10)*1000)>>12; + finish = ((entity->finish>>10)*1000)>>12; + delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; + if (bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: serv %lu, w %d", service, entity->weight); bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: start %llu, finish %llu, delta %llu", - entity->start, entity->finish, - bfq_delta(service, entity->weight)); + start, finish, delta); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "calc_finish group: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "calc_finish group: start %llu, finish %llu, delta %llu", + start, finish, delta); +#endif } } @@ -293,10 +327,26 @@ static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) static void bfq_update_active_node(struct rb_node *node) { struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->min_start = entity->start; bfq_update_min(entity, node->rb_right); bfq_update_min(entity, node->rb_left); + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_active_node: new min_start %llu", + ((entity->min_start>>10)*1000)>>12); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_active_node: new min_start %llu", + ((entity->min_start>>10)*1000)>>12); +#endif + } } /** @@ -386,8 +436,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, BUG_ON(!bfqg); BUG_ON(!bfqd); bfqg->active_entities++; - if (bfqg->active_entities == 2) - bfqd->active_numerous_groups++; } #endif } @@ -399,7 +447,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, static unsigned short bfq_ioprio_to_weight(int ioprio) { BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); - return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; + return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; } /** @@ -422,9 +470,9 @@ static void bfq_get_entity(struct bfq_entity *entity) struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); if (bfqq) { - atomic_inc(&bfqq->ref); + bfqq->ref++; bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); } } @@ -499,10 +547,6 @@ static void bfq_active_extract(struct bfq_service_tree *st, BUG_ON(!bfqd); BUG_ON(!bfqg->active_entities); bfqg->active_entities--; - if (bfqg->active_entities == 1) { - BUG_ON(!bfqd->active_numerous_groups); - bfqd->active_numerous_groups--; - } } #endif } @@ -552,7 +596,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, if (bfqq) { sd = entity->sched_data; bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); + bfqq, bfqq->ref); bfq_put_queue(bfqq); } } @@ -602,7 +646,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (entity->prio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned short prev_weight, new_weight; + unsigned int prev_weight, new_weight; struct bfq_data *bfqd = NULL; struct rb_root *root; #ifdef 
CONFIG_BFQ_GROUP_IOSCHED @@ -630,7 +674,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, entity->new_weight > BFQ_MAX_WEIGHT) { pr_crit("update_weight_prio: new_weight %d\n", entity->new_weight); - BUG(); + if (entity->new_weight < BFQ_MIN_WEIGHT) + entity->new_weight = BFQ_MIN_WEIGHT; + else + entity->new_weight = BFQ_MAX_WEIGHT; } entity->orig_weight = entity->new_weight; if (bfqq) @@ -661,6 +708,13 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, * associated with its new weight. */ if (prev_weight != new_weight) { + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "weight changed %d %d(%d %d)", + prev_weight, new_weight, + entity->orig_weight, + bfqq->wr_coeff); + root = bfqq ? &bfqd->queue_weights_tree : &bfqd->group_weights_tree; bfq_weights_tree_remove(bfqd, entity, root); @@ -707,7 +761,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) st = bfq_entity_service_tree(entity); entity->service += served; - BUG_ON(entity->service > entity->budget); + BUG_ON(st->wsum == 0); st->vtime += bfq_delta(served, st->wsum); @@ -716,31 +770,69 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) #ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); #endif - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); + st = bfq_entity_service_tree(&bfqq->entity); + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", + served, ((st->vtime>>10)*1000)>>12, st); } /** - * bfq_bfqq_charge_full_budget - set the service to the entity budget. + * bfq_bfqq_charge_time - charge an amount of service equivalent to the length + * of the time interval during which bfqq has been in + * service. + * @bfqd: the device * @bfqq: the queue that needs a service update. + * @time_ms: the amount of time during which the queue has received service * - * When it's not possible to be fair in the service domain, because - * a queue is not consuming its budget fast enough (the meaning of - * fast depends on the timeout parameter), we charge it a full - * budget. In this way we should obtain a sort of time-domain - * fairness among all the seeky/slow queues. + * If a queue does not consume its budget fast enough, then providing + * the queue with service fairness may impair throughput, more or less + * severely. For this reason, queues that consume their budget slowly + * are provided with time fairness instead of service fairness. This + * goal is achieved through the BFQ scheduling engine, even if such an + * engine works in the service, and not in the time domain. The trick + * is charging these queues with an inflated amount of service, equal + * to the amount of service that they would have received during their + * service slot if they had been fast, i.e., if their requests had + * been dispatched at a rate equal to the estimated peak rate. + * + * It is worth noting that time fairness can cause important + * distortions in terms of bandwidth distribution, on devices with + * internal queueing. The reason is that I/O requests dispatched + * during the service slot of a queue may be served after that service + * slot is finished, and may have a total processing time loosely + * correlated with the duration of the service slot. This is + * especially true for short service slots. 
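The next hunk implements this policy in bfq_bfqq_charge_time(). As a compact restatement (a sketch only, with plain C types, no adjustment of entity->budget and no call into the scheduling engine), the service charged to a slow queue is the share of the maximum budget proportional to the time the queue actually spent in service, and never less than the service it really received:

/*
 * Sketch of the charging rule described above: a queue that occupied
 * the device for time_ms out of a timeout_ms service slot is charged
 * the service it would have received at the estimated peak rate.
 */
static int inflated_service_charge(int max_budget, int service_received,
                                   unsigned long time_ms,
                                   unsigned long timeout_ms)
{
        long charge = service_received;

        if (time_ms > 0 && time_ms < timeout_ms)
                charge = ((long)max_budget * (long)time_ms) / (long)timeout_ms;

        /* never charge less than what was actually received */
        return charge > service_received ? (int)charge : service_received;
}

With made-up numbers, max_budget = 16384 sectors, timeout = 125 ms and 50 ms spent in service yield a charge of 6553 sectors, even if far fewer sectors were actually transferred.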
*/ -static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, + unsigned long time_ms) { struct bfq_entity *entity = &bfqq->entity; + int tot_serv_to_charge = entity->service; + unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); + + if (time_ms > 0 && time_ms < timeout_ms) + tot_serv_to_charge = + (bfqd->bfq_max_budget * time_ms) / timeout_ms; - bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); + if (tot_serv_to_charge < entity->service) + tot_serv_to_charge = entity->service; - bfq_bfqq_served(bfqq, entity->budget - entity->service); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "charge_time: %lu/%u ms, %d/%d/%d sectors", + time_ms, timeout_ms, entity->service, + tot_serv_to_charge, entity->budget); + + /* Increase budget to avoid inconsistencies */ + if (tot_serv_to_charge > entity->budget) + entity->budget = tot_serv_to_charge; + + bfq_bfqq_served(bfqq, + max_t(int, 0, tot_serv_to_charge - entity->service)); } /** * __bfq_activate_entity - activate an entity. * @entity: the entity being activated. + * @non_blocking_wait_rq: true if this entity was waiting for a request * * Called whenever an entity is activated, i.e., it is not active and one * of its children receives a new request, or has to be reactivated due to @@ -748,11 +840,16 @@ static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) * service received if @entity is active) of the queue to calculate its * timestamps. */ -static void __bfq_activate_entity(struct bfq_entity *entity) +static void __bfq_activate_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq) { struct bfq_sched_data *sd = entity->sched_data; struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + bool backshifted = false; + BUG_ON(!sd); + BUG_ON(!st); if (entity == sd->in_service_entity) { BUG_ON(entity->tree); /* @@ -770,45 +867,133 @@ static void __bfq_activate_entity(struct bfq_entity *entity) * old start time. */ bfq_active_extract(st, entity); - } else if (entity->tree == &st->idle) { - /* - * Must be on the idle tree, bfq_idle_extract() will - * check for that. - */ - bfq_idle_extract(st, entity); - entity->start = bfq_gt(st->vtime, entity->finish) ? - st->vtime : entity->finish; } else { - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue - * would have been on the idle tree. - */ - entity->start = st->vtime; - st->wsum += entity->weight; - bfq_get_entity(entity); + unsigned long long min_vstart; + + /* See comments on bfq_fqq_update_budg_for_activation */ + if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { + backshifted = true; + min_vstart = entity->finish; + } else + min_vstart = st->vtime; + + if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(min_vstart, entity->finish) ? + min_vstart : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. 
+ */ + entity->start = min_vstart; + st->wsum += entity->weight; + bfq_get_entity(entity); - BUG_ON(entity->on_st); - entity->on_st = 1; + BUG_ON(entity->on_st); + entity->on_st = 1; + } } st = __bfq_entity_update_weight_prio(st, entity); bfq_calc_finish(entity, entity->budget); + + /* + * If some queues enjoy backshifting for a while, then their + * (virtual) finish timestamps may happen to become lower and + * lower than the system virtual time. In particular, if + * these queues often happen to be idle for short time + * periods, and during such time periods other queues with + * higher timestamps happen to be busy, then the backshifted + * timestamps of the former queues can become much lower than + * the system virtual time. In fact, to serve the queues with + * higher timestamps while the ones with lower timestamps are + * idle, the system virtual time may be pushed-up to much + * higher values than the finish timestamps of the idle + * queues. As a consequence, the finish timestamps of all new + * or newly activated queues may end up being much larger than + * those of lucky queues with backshifted timestamps. The + * latter queues may then monopolize the device for a lot of + * time. This would simply break service guarantees. + * + * To reduce this problem, push up a little bit the + * backshifted timestamps of the queue associated with this + * entity (only a queue can happen to have the backshifted + * flag set): just enough to let the finish timestamp of the + * queue be equal to the current value of the system virtual + * time. This may introduce a little unfairness among queues + * with backshifted timestamps, but it does not break + * worst-case fairness guarantees. + * + * As a special case, if bfqq is weight-raised, push up + * timestamps much less, to keep very low the probability that + * this push up causes the backshifted finish timestamps of + * weight-raised queues to become higher than the backshifted + * finish timestamps of non weight-raised queues. + */ + if (backshifted && bfq_gt(st->vtime, entity->finish)) { + unsigned long delta = st->vtime - entity->finish; + + if (bfqq) + delta /= bfqq->wr_coeff; + + entity->start += delta; + entity->finish += delta; + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__activate_entity: new queue finish %llu", + ((entity->finish>>10)*1000)>>12); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__activate_entity: new group finish %llu", + ((entity->finish>>10)*1000)>>12); +#endif + } + } + bfq_active_insert(st, entity); + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__activate_entity: queue %seligible in st %p", + entity->start <= st->vtime ? "" : "non ", st); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__activate_entity: group %seligible in st %p", + entity->start <= st->vtime ? "" : "non ", st); +#endif + } } /** * bfq_activate_entity - activate an entity and its ancestors if necessary. * @entity: the entity to activate. + * @non_blocking_wait_rq: true if this entity was waiting for a request * * Activate @entity and all the entities on the path from it to the root. 
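To make the effect of the division by wr_coeff above concrete, here is a toy numerical example (made-up timestamps; the wr_coeff value of 30 is the one set in bfq_init_queue earlier in this patch):

#include <stdio.h>

int main(void)
{
        unsigned long long vtime = 1000, finish = 400; /* made-up units */
        unsigned long long delta = vtime - finish;

        /* non-weight-raised queue: finish is pushed all the way to vtime */
        printf("wr_coeff  1: finish %llu -> %llu\n", finish, finish + delta);
        /* weight-raised queue: pushed up only marginally */
        printf("wr_coeff 30: finish %llu -> %llu\n", finish, finish + delta / 30);
        return 0;
}

The non-raised queue loses its whole backshift (400 -> 1000), while the weight-raised one keeps almost all of it (400 -> 420), which is exactly the asymmetry the comment above argues for.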
*/ -static void bfq_activate_entity(struct bfq_entity *entity) +static void bfq_activate_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq) { struct bfq_sched_data *sd; for_each_entity(entity) { - __bfq_activate_entity(entity); + BUG_ON(!entity); + __bfq_activate_entity(entity, non_blocking_wait_rq); sd = entity->sched_data; if (!bfq_update_next_in_service(sd)) @@ -889,23 +1074,24 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) if (!__bfq_deactivate_entity(entity, requeue)) /* - * The parent entity is still backlogged, and - * we don't need to update it as it is still - * in service. + * next_in_service has not been changed, so + * no upwards update is needed */ break; if (sd->next_in_service) /* - * The parent entity is still backlogged and - * the budgets on the path towards the root - * need to be updated. + * The parent entity is still backlogged, + * because next_in_service is not NULL, and + * next_in_service has been updated (see + * comment on the body of the above if): + * upwards update of the schedule is needed. */ goto update; /* - * If we reach there the parent is no more backlogged and - * we want to propagate the dequeue upwards. + * If we get here, then the parent is no more backlogged and + * we want to propagate the deactivation upwards. */ requeue = 1; } @@ -915,9 +1101,23 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) update: entity = parent; for_each_entity(entity) { - __bfq_activate_entity(entity); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + __bfq_activate_entity(entity, false); sd = entity->sched_data; + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "invoking udpdate_next for this queue"); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, + struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "invoking udpdate_next for this entity"); + } +#endif if (!bfq_update_next_in_service(sd)) break; } @@ -943,7 +1143,23 @@ static void bfq_update_vtime(struct bfq_service_tree *st) entry = rb_entry(node, struct bfq_entity, rb_node); if (bfq_gt(entry->min_start, st->vtime)) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entry); st->vtime = entry->min_start; + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_vtime: new vtime %llu %p", + ((st->vtime>>10)*1000)>>12, st); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entry, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_vtime: new vtime %llu %p", + ((st->vtime>>10)*1000)>>12, st); + } +#endif bfq_forget_idle(st); } } @@ -996,10 +1212,11 @@ left: * Update the virtual time in @st and return the first eligible entity * it contains. 
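As a side note on the rule applied here (a simplified sketch, not kernel code, and ignoring the bfq_gt() comparison helper used above): an entity may be picked only once the tree's virtual time has reached its start timestamp, and bfq_update_vtime() jumps the virtual time forward to the active subtree's minimum start time so that at least one entity is eligible whenever the tree is non-empty.

#include <stdbool.h>
#include <stdint.h>

/* toy view of one entity's timestamp and the tree's clock */
struct toy_entity {
        uint64_t start;
};

static bool toy_eligible(const struct toy_entity *e, uint64_t vtime)
{
        return e->start <= vtime;
}

/* catch the virtual time up so the earliest entity becomes eligible */
static uint64_t toy_update_vtime(uint64_t vtime, uint64_t min_start)
{
        return min_start > vtime ? min_start : vtime;
}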
*/ -static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, - bool force) +static struct bfq_entity * +__bfq_lookup_next_entity(struct bfq_service_tree *st, bool force) { struct bfq_entity *entity, *new_next_in_service = NULL; + struct bfq_queue *bfqq; if (RB_EMPTY_ROOT(&st->active)) return NULL; @@ -1008,6 +1225,24 @@ static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, entity = bfq_first_active_entity(st); BUG_ON(bfq_gt(entity->start, st->vtime)); + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__lookup_next: start %llu vtime %llu st %p", + ((entity->start>>10)*1000)>>12, + ((st->vtime>>10)*1000)>>12, st); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__lookup_next: start %llu vtime %llu st %p", + ((entity->start>>10)*1000)>>12, + ((st->vtime>>10)*1000)>>12, st); + } +#endif + /* * If the chosen entity does not match with the sched_data's * next_in_service and we are forcedly serving the IDLE priority @@ -1043,11 +1278,36 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, BUG_ON(sd->in_service_entity); + /* + * Choose from idle class, if needed to guarantee a minimum + * bandwidth to this class. This should also mitigate + * priority-inversion problems in case a low priority task is + * holding file system resources. + */ if (bfqd && - jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { + jiffies - bfqd->bfq_class_idle_last_service > + BFQ_CL_IDLE_TIMEOUT) { entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); if (entity) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "idle chosen from st %p %d", + st + BFQ_IOPRIO_CLASSES - 1, + BFQ_IOPRIO_CLASSES - 1); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "idle chosen from st %p %d", + st + BFQ_IOPRIO_CLASSES - 1, + BFQ_IOPRIO_CLASSES - 1); + } +#endif i = BFQ_IOPRIO_CLASSES - 1; bfqd->bfq_class_idle_last_service = jiffies; sd->next_in_service = entity; @@ -1056,6 +1316,25 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, for (; i < BFQ_IOPRIO_CLASSES; i++) { entity = __bfq_lookup_next_entity(st + i, false); if (entity) { + if (bfqd != NULL) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "chosen from st %p %d", + st + i, i); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "chosen from st %p %d", + st + i, i); + } +#endif + } + if (extract) { bfq_check_next_in_service(sd, entity); bfq_active_extract(st + i, entity); @@ -1069,6 +1348,13 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, return entity; } +static bool next_queue_may_preempt(struct bfq_data *bfqd) +{ + struct bfq_sched_data *sd = &bfqd->root_group->sched_data; + + return sd->next_in_service != sd->in_service_entity; +} + /* * Get next queue for service. 
*/ @@ -1085,7 +1371,36 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) sd = &bfqd->root_group->sched_data; for (; sd ; sd = entity->my_sched_data) { +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (entity) { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "get_next_queue: lookup in this group"); + } else + bfq_log_bfqg(bfqd, bfqd->root_group, + "get_next_queue: lookup in root group"); +#endif + entity = bfq_lookup_next_entity(sd, 1, bfqd); + + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "get_next_queue: this queue, finish %llu", + (((entity->finish>>10)*1000)>>10)>>2); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "get_next_queue: this entity, finish %llu", + (((entity->finish>>10)*1000)>>10)>>2); + } +#endif + BUG_ON(!entity); entity->service = 0; } @@ -1103,8 +1418,9 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) bfqd->in_service_bic = NULL; } + bfq_clear_bfqq_wait_request(bfqd->in_service_queue); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); bfqd->in_service_queue = NULL; - del_timer(&bfqd->idle_slice_timer); } static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -1112,9 +1428,7 @@ static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_entity *entity = &bfqq->entity; - if (bfqq == bfqd->in_service_queue) - __bfq_bfqd_reset_in_service(bfqd); - + BUG_ON(bfqq == bfqd->in_service_queue); bfq_deactivate_entity(entity, requeue); } @@ -1122,12 +1436,11 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; - bfq_activate_entity(entity); + bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq)); + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); } -#ifdef CONFIG_BFQ_GROUP_IOSCHED static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -#endif /* * Called when the bfqq no longer has requests pending, remove it from @@ -1138,6 +1451,7 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, { BUG_ON(!bfq_bfqq_busy(bfqq)); BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + BUG_ON(bfqq == bfqd->in_service_queue); bfq_log_bfqq(bfqd, bfqq, "del from busy"); @@ -1146,27 +1460,20 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, BUG_ON(bfqd->busy_queues == 0); bfqd->busy_queues--; - if (!bfqq->dispatched) { + if (!bfqq->dispatched) bfq_weights_tree_remove(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - BUG_ON(!bfqd->busy_in_flight_queues); - bfqd->busy_in_flight_queues--; - if (bfq_bfqq_constantly_seeky(bfqq)) { - BUG_ON(!bfqd-> - const_seeky_busy_in_flight_queues); - bfqd->const_seeky_busy_in_flight_queues--; - } - } - } + if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; -#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqg_stats_update_dequeue(bfqq_group(bfqq)); -#endif + + BUG_ON(bfqq->entity.budget < 0); bfq_deactivate_bfqq(bfqd, bfqq, requeue); + + BUG_ON(bfqq->entity.budget < 0); } /* @@ -1184,16 +1491,11 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_mark_bfqq_busy(bfqq); bfqd->busy_queues++; - if (!bfqq->dispatched) { + if (!bfqq->dispatched) if (bfqq->wr_coeff == 1) bfq_weights_tree_add(bfqd, &bfqq->entity, &bfqd->queue_weights_tree); - if (!blk_queue_nonrot(bfqd->queue)) { - bfqd->busy_in_flight_queues++; - if 
(bfq_bfqq_constantly_seeky(bfqq)) - bfqd->const_seeky_busy_in_flight_queues++; - } - } + if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; } diff --git a/block/bfq.h b/block/bfq.h index fcce855..ea1e7d8 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. + * BFQ-v8r4 for 4.8.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe @@ -7,7 +7,9 @@ * Copyright (C) 2008 Fabio Checconi * Paolo Valente * - * Copyright (C) 2010 Paolo Valente + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2016 Paolo Valente */ #ifndef _BFQ_H @@ -28,20 +30,21 @@ #define BFQ_DEFAULT_QUEUE_IOPRIO 4 -#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_WEIGHT_LEGACY_DFL 100 #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE +/* + * Soft real-time applications are extremely more latency sensitive + * than interactive ones. Over-raise the weight of the former to + * privilege them against the latter. + */ +#define BFQ_SOFTRT_WEIGHT_FACTOR 100 + struct bfq_entity; /** * struct bfq_service_tree - per ioprio_class service tree. - * @active: tree for active entities (i.e., those backlogged). - * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). - * @first_idle: idle entity with minimum F_i. - * @last_idle: idle entity with maximum F_i. - * @vtime: scheduler virtual time. - * @wsum: scheduler weight sum; active and idle entities contribute to it. * * Each service tree represents a B-WF2Q+ scheduler on its own. Each * ioprio_class has its own independent scheduler, and so its own @@ -49,27 +52,28 @@ struct bfq_entity; * of the containing bfqd. */ struct bfq_service_tree { + /* tree for active entities (i.e., those backlogged) */ struct rb_root active; + /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ struct rb_root idle; - struct bfq_entity *first_idle; - struct bfq_entity *last_idle; + struct bfq_entity *first_idle; /* idle entity with minimum F_i */ + struct bfq_entity *last_idle; /* idle entity with maximum F_i */ - u64 vtime; + u64 vtime; /* scheduler virtual time */ + /* scheduler weight sum; active and idle entities contribute to it */ unsigned long wsum; }; /** * struct bfq_sched_data - multi-class scheduler. - * @in_service_entity: entity in service. - * @next_in_service: head-of-the-line entity in the scheduler. - * @service_tree: array of service trees, one per ioprio_class. * * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as - * an intermediate queue on a hierarchical setup. - * @next_in_service points to the active entity of the sched_data - * service trees that will be scheduled next. + * ioprio_classes, and can be used either as a toplevel queue or as an + * intermediate queue on a hierarchical setup. @next_in_service + * points to the active entity of the sched_data service trees that + * will be scheduled next. It is used to reduce the number of steps + * needed for each hierarchical-schedule update. * * The supported ioprio_classes are the same as in CFQ, in descending * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. @@ -79,48 +83,29 @@ struct bfq_service_tree { * All the fields are protected by the queue lock of the containing bfqd. 
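To tie the vtime and wsum fields above to the timestamp arithmetic earlier in this patch (bfq_calc_finish() and bfq_bfqq_served()): leaving out the fixed-point scaling performed by bfq_delta(), a queue's finish time grows by service/weight and the tree's virtual time by served/wsum. A toy example with made-up weights and budget:

#include <stdio.h>

int main(void)
{
        /* two active queues, weights 10 and 40 (made-up), wsum = 50 */
        unsigned long wA = 10, wB = 40, wsum = 50;
        unsigned long budget = 8192;            /* sectors, arbitrary */
        unsigned long long startA = 0, startB = 0;

        /* bfq_calc_finish(), without the shift: F_i = S_i + budget / w_i */
        unsigned long long finishA = startA + budget / wA;
        unsigned long long finishB = startB + budget / wB;

        /* bfq_bfqq_served(): the virtual time advances by served / wsum
         * once B, which has the smaller finish time, gets served */
        unsigned long long vtime = 0 + budget / wsum;

        printf("F_A=%llu F_B=%llu -> B served first, vtime=%llu\n",
               finishA, finishB, vtime);
        return 0;
}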
*/ struct bfq_sched_data { - struct bfq_entity *in_service_entity; + struct bfq_entity *in_service_entity; /* entity in service */ + /* head-of-the-line entity in the scheduler (see comments above) */ struct bfq_entity *next_in_service; + /* array of service trees, one per ioprio_class */ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; }; /** * struct bfq_weight_counter - counter of the number of all active entities * with a given weight. - * @weight: weight of the entities that this counter refers to. - * @num_active: number of active entities with this weight. - * @weights_node: weights tree member (see bfq_data's @queue_weights_tree - * and @group_weights_tree). */ struct bfq_weight_counter { - short int weight; - unsigned int num_active; + unsigned int weight; /* weight of the entities this counter refers to */ + unsigned int num_active; /* nr of active entities with this weight */ + /* + * Weights tree member (see bfq_data's @queue_weights_tree and + * @group_weights_tree) + */ struct rb_node weights_node; }; /** * struct bfq_entity - schedulable entity. - * @rb_node: service_tree member. - * @weight_counter: pointer to the weight counter associated with this entity. - * @on_st: flag, true if the entity is on a tree (either the active or - * the idle one of its service_tree). - * @finish: B-WF2Q+ finish timestamp (aka F_i). - * @start: B-WF2Q+ start timestamp (aka S_i). - * @tree: tree the entity is enqueued into; %NULL if not on a tree. - * @min_start: minimum start time of the (active) subtree rooted at - * this entity; used for O(log N) lookups into active trees. - * @service: service received during the last round of service. - * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. - * @weight: weight of the queue - * @parent: parent entity, for hierarchical scheduling. - * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the - * associated scheduler queue, %NULL on leaf nodes. - * @sched_data: the scheduler queue this entity belongs to. - * @ioprio: the ioprio in use. - * @new_weight: when a weight change is requested, the new weight value. - * @orig_weight: original weight, used to implement weight boosting - * @prio_changed: flag, true when the user requested a weight, ioprio or - * ioprio_class change. * * A bfq_entity is used to represent either a bfq_queue (leaf node in the * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each @@ -147,27 +132,52 @@ struct bfq_weight_counter { * containing bfqd. */ struct bfq_entity { - struct rb_node rb_node; + struct rb_node rb_node; /* service_tree member */ + /* pointer to the weight counter associated with this entity */ struct bfq_weight_counter *weight_counter; + /* + * flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). 
+ */ int on_st; - u64 finish; - u64 start; + u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ + u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ + /* tree the entity is enqueued into; %NULL if not on a tree */ struct rb_root *tree; + /* + * minimum start time of the (active) subtree rooted at this + * entity; used for O(log N) lookups into active trees + */ u64 min_start; - int service, budget; - unsigned short weight, new_weight; - unsigned short orig_weight; + /* amount of service received during the last service slot */ + int service; + + /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ + int budget; + + unsigned int weight; /* weight of the queue */ + unsigned int new_weight; /* next weight if a change is in progress */ + + /* original weight, used to implement weight boosting */ + unsigned int orig_weight; + /* parent entity, for hierarchical scheduling */ struct bfq_entity *parent; + /* + * For non-leaf nodes in the hierarchy, the associated + * scheduler queue, %NULL on leaf nodes. + */ struct bfq_sched_data *my_sched_data; + /* the scheduler queue this entity belongs to */ struct bfq_sched_data *sched_data; + /* flag, set to request a weight, ioprio or ioprio_class change */ int prio_changed; }; @@ -175,56 +185,6 @@ struct bfq_group; /** * struct bfq_queue - leaf schedulable entity. - * @ref: reference counter. - * @bfqd: parent bfq_data. - * @new_ioprio: when an ioprio change is requested, the new ioprio value. - * @ioprio_class: the ioprio_class in use. - * @new_ioprio_class: when an ioprio_class change is requested, the new - * ioprio_class value. - * @new_bfqq: shared bfq_queue if queue is cooperating with - * one or more other queues. - * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). - * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). - * @sort_list: sorted list of pending requests. - * @next_rq: if fifo isn't expired, next request to serve. - * @queued: nr of requests queued in @sort_list. - * @allocated: currently allocated requests. - * @meta_pending: pending metadata requests. - * @fifo: fifo list of requests in sort_list. - * @entity: entity representing this queue in the scheduler. - * @max_budget: maximum budget allowed from the feedback mechanism. - * @budget_timeout: budget expiration (in jiffies). - * @dispatched: number of requests on the dispatch list or inside driver. - * @flags: status flags. - * @bfqq_list: node for active/idle bfqq list inside our bfqd. - * @burst_list_node: node for the device's burst list. - * @seek_samples: number of seeks sampled - * @seek_total: sum of the distances of the seeks sampled - * @seek_mean: mean seek distance - * @last_request_pos: position of the last request enqueued - * @requests_within_timer: number of consecutive pairs of request completion - * and arrival, such that the queue becomes idle - * after the completion, but the next request arrives - * within an idle time slice; used only if the queue's - * IO_bound has been cleared. - * @pid: pid of the process owning the queue, used for logging purposes. 
- * @last_wr_start_finish: start time of the current weight-raising period if - * the @bfq-queue is being weight-raised, otherwise - * finish time of the last weight-raising period - * @wr_cur_max_time: current max raising time for this queue - * @soft_rt_next_start: minimum time instant such that, only if a new - * request is enqueued after this time instant in an - * idle @bfq_queue with no outstanding requests, then - * the task associated with the queue it is deemed as - * soft real-time (see the comments to the function - * bfq_bfqq_softrt_next_start()) - * @last_idle_bklogged: time of the last transition of the @bfq_queue from - * idle to backlogged - * @service_from_backlogged: cumulative service received from the @bfq_queue - * since the last transition from idle to - * backlogged - * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the - * queue is shared * * A bfq_queue is a leaf request queue; it can be associated with an * io_context or more, if it is async or shared between cooperating @@ -235,117 +195,174 @@ struct bfq_group; * All the fields are protected by the queue lock of the containing bfqd. */ struct bfq_queue { - atomic_t ref; + /* reference counter */ + int ref; + /* parent bfq_data */ struct bfq_data *bfqd; - unsigned short ioprio, new_ioprio; - unsigned short ioprio_class, new_ioprio_class; + /* current ioprio and ioprio class */ + unsigned short ioprio, ioprio_class; + /* next ioprio and ioprio class if a change is in progress */ + unsigned short new_ioprio, new_ioprio_class; - /* fields for cooperating queues handling */ + /* + * Shared bfq_queue if queue is cooperating with one or more + * other queues. + */ struct bfq_queue *new_bfqq; + /* request-position tree member (see bfq_group's @rq_pos_tree) */ struct rb_node pos_node; + /* request-position tree root (see bfq_group's @rq_pos_tree) */ struct rb_root *pos_root; + /* sorted list of pending requests */ struct rb_root sort_list; + /* if fifo isn't expired, next request to serve */ struct request *next_rq; + /* number of sync and async requests queued */ int queued[2]; + /* number of sync and async requests currently allocated */ int allocated[2]; + /* number of pending metadata requests */ int meta_pending; + /* fifo list of requests in sort_list */ struct list_head fifo; + /* entity representing this queue in the scheduler */ struct bfq_entity entity; + /* maximum budget allowed from the feedback mechanism */ int max_budget; + /* budget expiration (in jiffies) */ unsigned long budget_timeout; + /* number of requests on the dispatch list or inside driver */ int dispatched; - unsigned int flags; + unsigned int flags; /* status flags.*/ + /* node for active/idle bfqq list inside parent bfqd */ struct list_head bfqq_list; + /* bit vector: a 1 for each seeky requests in history */ + u32 seek_history; + + /* node for the device's burst list */ struct hlist_node burst_list_node; - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; + /* position of the last request enqueued */ sector_t last_request_pos; + /* Number of consecutive pairs of request completion and + * arrival, such that the queue becomes idle after the + * completion, but the next request arrives within an idle + * time slice; used only if the queue's IO_bound flag has been + * cleared. + */ unsigned int requests_within_timer; + /* pid of the process owning the queue, used for logging purposes */ pid_t pid; + + /* + * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL + * if the queue is shared. 
+ */ struct bfq_io_cq *bic; - /* weight-raising fields */ + /* current maximum weight-raising time for this queue */ unsigned long wr_cur_max_time; + /* + * Minimum time instant such that, only if a new request is + * enqueued after this time instant in an idle @bfq_queue with + * no outstanding requests, then the task associated with the + * queue it is deemed as soft real-time (see the comments on + * the function bfq_bfqq_softrt_next_start()) + */ unsigned long soft_rt_next_start; + /* + * Start time of the current weight-raising period if + * the @bfq-queue is being weight-raised, otherwise + * finish time of the last weight-raising period. + */ unsigned long last_wr_start_finish; + /* factor by which the weight of this queue is multiplied */ unsigned int wr_coeff; + /* + * Time of the last transition of the @bfq_queue from idle to + * backlogged. + */ unsigned long last_idle_bklogged; + /* + * Cumulative service received from the @bfq_queue since the + * last transition from idle to backlogged. + */ unsigned long service_from_backlogged; + /* + * Value of wr start time when switching to soft rt + */ + unsigned long wr_start_at_switch_to_srt; + + unsigned long split_time; /* time of last split */ }; /** * struct bfq_ttime - per process thinktime stats. - * @ttime_total: total process thinktime - * @ttime_samples: number of thinktime samples - * @ttime_mean: average process thinktime */ struct bfq_ttime { - unsigned long last_end_request; + u64 last_end_request; /* completion time of last request */ + + u64 ttime_total; /* total process thinktime */ + unsigned long ttime_samples; /* number of thinktime samples */ + u64 ttime_mean; /* average process thinktime */ - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; }; /** * struct bfq_io_cq - per (request_queue, io_context) structure. - * @icq: associated io_cq structure - * @bfqq: array of two process queues, the sync and the async - * @ttime: associated @bfq_ttime struct - * @ioprio: per (request_queue, blkcg) ioprio. - * @blkcg_id: id of the blkcg the related io_cq belongs to. 
- * @wr_time_left: snapshot of the time left before weight raising ends - * for the sync queue associated to this process; this - * snapshot is taken to remember this value while the weight - * raising is suspended because the queue is merged with a - * shared queue, and is used to set @raising_cur_max_time - * when the queue is split from the shared queue and its - * weight is raised again - * @saved_idle_window: same purpose as the previous field for the idle - * window - * @saved_IO_bound: same purpose as the previous two fields for the I/O - * bound classification of a queue - * @saved_in_large_burst: same purpose as the previous fields for the - * value of the field keeping the queue's belonging - * to a large burst - * @was_in_burst_list: true if the queue belonged to a burst list - * before its merge with another cooperating queue - * @cooperations: counter of consecutive successful queue merges underwent - * by any of the process' @bfq_queues - * @failed_cooperations: counter of consecutive failed queue merges of any - * of the process' @bfq_queues */ struct bfq_io_cq { + /* associated io_cq structure */ struct io_cq icq; /* must be the first member */ + /* array of two process queues, the sync and the async */ struct bfq_queue *bfqq[2]; + /* associated @bfq_ttime struct */ struct bfq_ttime ttime; + /* per (request_queue, blkcg) ioprio */ int ioprio; - #ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_id; /* the current blkcg ID */ + uint64_t blkcg_serial_nr; /* the current blkcg serial */ #endif - unsigned int wr_time_left; + /* + * Snapshot of the idle window before merging; taken to + * remember this value while the queue is merged, so as to be + * able to restore it in case of split. + */ bool saved_idle_window; + /* + * Same purpose as the previous two fields for the I/O bound + * classification of a queue. + */ bool saved_IO_bound; + /* + * Same purpose as the previous fields for the value of the + * field keeping the queue's belonging to a large burst + */ bool saved_in_large_burst; + /* + * True if the queue belonged to a burst list before its merge + * with another cooperating queue. + */ bool was_in_burst_list; - unsigned int cooperations; - unsigned int failed_cooperations; + /* + * Similar to previous fields: save wr information. + */ + unsigned long saved_wr_coeff; + unsigned long saved_last_wr_start_finish; + unsigned long saved_wr_start_at_switch_to_srt; }; enum bfq_device_speed { @@ -354,224 +371,234 @@ enum bfq_device_speed { }; /** - * struct bfq_data - per device data structure. - * @queue: request queue for the managed device. - * @root_group: root bfq_group for the device. - * @active_numerous_groups: number of bfq_groups containing more than one - * active @bfq_entity. - * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by - * weight. Used to keep track of whether all @bfq_queues - * have the same weight. The tree contains one counter - * for each distinct weight associated to some active - * and not weight-raised @bfq_queue (see the comments to - * the functions bfq_weights_tree_[add|remove] for - * further details). - * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted - * by weight. Used to keep track of whether all - * @bfq_groups have the same weight. The tree contains - * one counter for each distinct weight associated to - * some active @bfq_group (see the comments to the - * functions bfq_weights_tree_[add|remove] for further - * details). 
- * @busy_queues: number of bfq_queues containing requests (including the - * queue in service, even if it is idling). - * @busy_in_flight_queues: number of @bfq_queues containing pending or - * in-flight requests, plus the @bfq_queue in - * service, even if idle but waiting for the - * possible arrival of its next sync request. This - * field is updated only if the device is rotational, - * but used only if the device is also NCQ-capable. - * The reason why the field is updated also for non- - * NCQ-capable rotational devices is related to the - * fact that the value of @hw_tag may be set also - * later than when busy_in_flight_queues may need to - * be incremented for the first time(s). Taking also - * this possibility into account, to avoid unbalanced - * increments/decrements, would imply more overhead - * than just updating busy_in_flight_queues - * regardless of the value of @hw_tag. - * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues - * (that is, seeky queues that expired - * for budget timeout at least once) - * containing pending or in-flight - * requests, including the in-service - * @bfq_queue if constantly seeky. This - * field is updated only if the device - * is rotational, but used only if the - * device is also NCQ-capable (see the - * comments to @busy_in_flight_queues). - * @wr_busy_queues: number of weight-raised busy @bfq_queues. - * @queued: number of queued requests. - * @rq_in_driver: number of requests dispatched and waiting for completion. - * @sync_flight: number of sync requests in the driver. - * @max_rq_in_driver: max number of reqs in driver in the last - * @hw_tag_samples completed requests. - * @hw_tag_samples: nr of samples used to calculate hw_tag. - * @hw_tag: flag set to one if the driver is showing a queueing behavior. - * @budgets_assigned: number of budgets assigned. - * @idle_slice_timer: timer set when idling for the next sequential request - * from the queue in service. - * @unplug_work: delayed work to restart dispatching on the request queue. - * @in_service_queue: bfq_queue in service. - * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. - * @last_position: on-disk position of the last served request. - * @last_budget_start: beginning of the last budget. - * @last_idling_start: beginning of the last idle slice. - * @peak_rate: peak transfer rate observed for a budget. - * @peak_rate_samples: number of samples used to calculate @peak_rate. - * @bfq_max_budget: maximum budget allotted to a bfq_queue before - * rescheduling. - * @active_list: list of all the bfq_queues active on the device. - * @idle_list: list of all the bfq_queues idle on the device. - * @bfq_fifo_expire: timeout for async/sync requests; when it expires - * requests are served in fifo order. - * @bfq_back_penalty: weight of backward seeks wrt forward ones. - * @bfq_back_max: maximum allowed backward seek. - * @bfq_slice_idle: maximum idling time. - * @bfq_user_max_budget: user-configured max budget value - * (0 for auto-tuning). - * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to - * async queues. 
- * @bfq_timeout: timeout for bfq_queues to consume their budget; used to - * to prevent seeky queues to impose long latencies to well - * behaved ones (this also implies that seeky queues cannot - * receive guarantees in the service domain; after a timeout - * they are charged for the whole allocated budget, to try - * to preserve a behavior reasonably fair among them, but - * without service-domain guarantees). - * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is - * no more granted any weight-raising. - * @bfq_failed_cooperations: number of consecutive failed cooperation - * chances after which weight-raising is restored - * to a queue subject to more than bfq_coop_thresh - * queue merges. - * @bfq_requests_within_timer: number of consecutive requests that must be - * issued within the idle time slice to set - * again idling to a queue which was marked as - * non-I/O-bound (see the definition of the - * IO_bound flag for further details). - * @last_ins_in_burst: last time at which a queue entered the current - * burst of queues being activated shortly after - * each other; for more details about this and the - * following parameters related to a burst of - * activations, see the comments to the function - * @bfq_handle_burst. - * @bfq_burst_interval: reference time interval used to decide whether a - * queue has been activated shortly after - * @last_ins_in_burst. - * @burst_size: number of queues in the current burst of queue activations. - * @bfq_large_burst_thresh: maximum burst size above which the current - * queue-activation burst is deemed as 'large'. - * @large_burst: true if a large queue-activation burst is in progress. - * @burst_list: head of the burst list (as for the above fields, more details - * in the comments to the function bfq_handle_burst). - * @low_latency: if set to true, low-latency heuristics are enabled. - * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised - * queue is multiplied. - * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). - * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. - * @bfq_wr_min_idle_time: minimum idle period after which weight-raising - * may be reactivated for a queue (in jiffies). - * @bfq_wr_min_inter_arr_async: minimum period between request arrivals - * after which weight-raising may be - * reactivated for an already busy queue - * (in jiffies). - * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, - * sectors per seconds. - * @RT_prod: cached value of the product R*T used for computing the maximum - * duration of the weight raising automatically. - * @device_speed: device-speed class for the low-latency heuristic. - * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. + * struct bfq_data - per-device data structure. * * All the fields are protected by the @queue lock. */ struct bfq_data { + /* request queue for the device */ struct request_queue *queue; + /* root bfq_group for the device */ struct bfq_group *root_group; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - int active_numerous_groups; -#endif - + /* + * rbtree of weight counters of @bfq_queues, sorted by + * weight. Used to keep track of whether all @bfq_queues have + * the same weight. The tree contains one counter for each + * distinct weight associated to some active and not + * weight-raised @bfq_queue (see the comments to the functions + * bfq_weights_tree_[add|remove] for further details). 
+ */ struct rb_root queue_weights_tree; + /* + * rbtree of non-queue @bfq_entity weight counters, sorted by + * weight. Used to keep track of whether all @bfq_groups have + * the same weight. The tree contains one counter for each + * distinct weight associated to some active @bfq_group (see + * the comments to the functions bfq_weights_tree_[add|remove] + * for further details). + */ struct rb_root group_weights_tree; + /* + * Number of bfq_queues containing requests (including the + * queue in service, even if it is idling). + */ int busy_queues; - int busy_in_flight_queues; - int const_seeky_busy_in_flight_queues; + /* number of weight-raised busy @bfq_queues */ int wr_busy_queues; + /* number of queued requests */ int queued; + /* number of requests dispatched and waiting for completion */ int rq_in_driver; - int sync_flight; + /* + * Maximum number of requests in driver in the last + * @hw_tag_samples completed requests. + */ int max_rq_in_driver; + /* number of samples used to calculate hw_tag */ int hw_tag_samples; + /* flag set to one if the driver is showing a queueing behavior */ int hw_tag; + /* number of budgets assigned */ int budgets_assigned; - struct timer_list idle_slice_timer; + /* + * Timer set when idling (waiting) for the next request from + * the queue in service. + */ + struct hrtimer idle_slice_timer; + /* delayed work to restart dispatching on the request queue */ struct work_struct unplug_work; + /* bfq_queue in service */ struct bfq_queue *in_service_queue; + /* bfq_io_cq (bic) associated with the @in_service_queue */ struct bfq_io_cq *in_service_bic; + /* on-disk position of the last served request */ sector_t last_position; + /* time of last request completion (ns) */ + u64 last_completion; + + /* time of first rq dispatch in current observation interval (ns) */ + u64 first_dispatch; + /* time of last rq dispatch in current observation interval (ns) */ + u64 last_dispatch; + + /* beginning of the last budget */ ktime_t last_budget_start; + /* beginning of the last idle slice */ ktime_t last_idling_start; + + /* number of samples in current observation interval */ int peak_rate_samples; - u64 peak_rate; + /* num of samples of seq dispatches in current observation interval */ + u32 sequential_samples; + /* total num of sectors transferred in current observation interval */ + u64 tot_sectors_dispatched; + /* max rq size seen during current observation interval (sectors) */ + u32 last_rq_max_size; + /* time elapsed from first dispatch in current observ. interval (us) */ + u64 delta_from_first; + /* current estimate of device peak rate */ + u32 peak_rate; + + /* maximum budget allotted to a bfq_queue before rescheduling */ int bfq_max_budget; + /* list of all the bfq_queues active on the device */ struct list_head active_list; + /* list of all the bfq_queues idle on the device */ struct list_head idle_list; - unsigned int bfq_fifo_expire[2]; + /* + * Timeout for async/sync requests; when it fires, requests + * are served in fifo order. 
+ */ + u64 bfq_fifo_expire[2]; + /* weight of backward seeks wrt forward ones */ unsigned int bfq_back_penalty; + /* maximum allowed backward seek */ unsigned int bfq_back_max; - unsigned int bfq_slice_idle; + /* maximum idling time */ + u32 bfq_slice_idle; + /* last time CLASS_IDLE was served */ u64 bfq_class_idle_last_service; + /* user-configured max budget value (0 for auto-tuning) */ int bfq_user_max_budget; - int bfq_max_budget_async_rq; - unsigned int bfq_timeout[2]; - - unsigned int bfq_coop_thresh; - unsigned int bfq_failed_cooperations; + /* + * Timeout for bfq_queues to consume their budget; used to + * prevent seeky queues from imposing long latencies to + * sequential or quasi-sequential ones (this also implies that + * seeky queues cannot receive guarantees in the service + * domain; after a timeout they are charged for the time they + * have been in service, to preserve fairness among them, but + * without service-domain guarantees). + */ + unsigned int bfq_timeout; + + /* + * Number of consecutive requests that must be issued within + * the idle time slice to set again idling to a queue which + * was marked as non-I/O-bound (see the definition of the + * IO_bound flag for further details). + */ unsigned int bfq_requests_within_timer; + /* + * Force device idling whenever needed to provide accurate + * service guarantees, without caring about throughput + * issues. CAVEAT: this may even increase latencies, in case + * of useless idling for processes that did stop doing I/O. + */ + bool strict_guarantees; + + /* + * Last time at which a queue entered the current burst of + * queues being activated shortly after each other; for more + * details about this and the following parameters related to + * a burst of activations, see the comments on the function + * bfq_handle_burst. + */ unsigned long last_ins_in_burst; + /* + * Reference time interval used to decide whether a queue has + * been activated shortly after @last_ins_in_burst. + */ unsigned long bfq_burst_interval; + /* number of queues in the current burst of queue activations */ int burst_size; + + /* common parent entity for the queues in the burst */ + struct bfq_entity *burst_parent_entity; + /* Maximum burst size above which the current queue-activation + * burst is deemed as 'large'. + */ unsigned long bfq_large_burst_thresh; + /* true if a large queue-activation burst is in progress */ bool large_burst; + /* + * Head of the burst list (as for the above fields, more + * details in the comments on the function bfq_handle_burst). + */ struct hlist_head burst_list; + /* if set to true, low-latency heuristics are enabled */ bool low_latency; - - /* parameters of the low_latency heuristics */ + /* + * Maximum factor by which the weight of a weight-raised queue + * is multiplied. + */ unsigned int bfq_wr_coeff; + /* maximum duration of a weight-raising period (jiffies) */ unsigned int bfq_wr_max_time; + + /* Maximum weight-raising duration for soft real-time processes */ unsigned int bfq_wr_rt_max_time; + /* + * Minimum idle period after which weight-raising may be + * reactivated for a queue (in jiffies). + */ unsigned int bfq_wr_min_idle_time; + /* + * Minimum period between request arrivals after which + * weight-raising may be reactivated for an already busy async + * queue (in jiffies). 
+ */ unsigned long bfq_wr_min_inter_arr_async; + + /* Max service-rate for a soft real-time queue, in sectors/sec */ unsigned int bfq_wr_max_softrt_rate; + /* + * Cached value of the product R*T, used for computing the + * maximum duration of weight raising automatically. + */ u64 RT_prod; + /* device-speed class for the low-latency heuristic */ enum bfq_device_speed device_speed; + /* fallback dummy bfqq for extreme OOM conditions */ struct bfq_queue oom_bfqq; }; enum bfqq_state_flags { - BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ + BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ + BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* + * waiting for a request + * without idling the device + */ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ BFQ_BFQQ_FLAG_IO_bound, /* * bfqq has timed-out at least once * having consumed at most 2/10 of @@ -581,17 +608,12 @@ enum bfqq_state_flags { * bfqq activated in a large burst, * see comments to bfq_handle_burst. */ - BFQ_BFQQ_FLAG_constantly_seeky, /* - * bfqq has proved to be slow and - * seeky until budget timeout - */ BFQ_BFQQ_FLAG_softrt_update, /* * may need softrt-next-start * update */ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ - BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ + BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ }; #define BFQ_BFQQ_FNS(name) \ @@ -608,25 +630,53 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ } +BFQ_BFQQ_FNS(just_created); BFQ_BFQQ_FNS(busy); BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(must_alloc); BFQ_BFQQ_FNS(fifo_expire); BFQ_BFQQ_FNS(idle_window); BFQ_BFQQ_FNS(sync); -BFQ_BFQQ_FNS(budget_new); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); -BFQ_BFQQ_FNS(constantly_seeky); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); -BFQ_BFQQ_FNS(just_split); BFQ_BFQQ_FNS(softrt_update); #undef BFQ_BFQQ_FNS /* Logging facilities. */ -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + __pbuf, ##args); \ +} while (0) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ +} while (0) + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do {} while (0) + +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log(bfqd, fmt, args...) \ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) @@ -640,15 +690,12 @@ enum bfqq_expiration { BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ + BFQ_BFQQ_PREEMPTED /* preemption in progress */ }; -#ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfqg_stats { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; +#ifdef CONFIG_BFQ_GROUP_IOSCHED /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -657,12 +704,8 @@ struct bfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; - /* time not charged to this cgroup */ - struct blkg_stat unaccounted_time; /* sum of number of ios queued across all samples */ struct blkg_stat avg_queue_size_sum; /* count of samples taken for average */ @@ -680,8 +723,10 @@ struct bfqg_stats { uint64_t start_idle_time; uint64_t start_empty_time; uint16_t flags; +#endif }; +#ifdef CONFIG_BFQ_GROUP_IOSCHED /* * struct bfq_group_data - per-blkcg storage for the blkio subsystem. * @@ -692,7 +737,7 @@ struct bfq_group_data { /* must be the first member */ struct blkcg_policy_data pd; - unsigned short weight; + unsigned int weight; }; /** @@ -712,7 +757,7 @@ struct bfq_group_data { * unused for the root group. Used to know whether there * are groups with more than one active @bfq_entity * (see the comments to the function - * bfq_bfqq_must_not_expire()). + * bfq_bfqq_may_idle()). * @rq_pos_tree: rbtree sorted by next_request position, used when * determining if two or more queues have interleaving * requests (see bfq_find_close_cooperator()). @@ -745,7 +790,6 @@ struct bfq_group { struct rb_root rq_pos_tree; struct bfqg_stats stats; - struct bfqg_stats dead_stats; /* stats pushed from dead children */ }; #else @@ -767,11 +811,25 @@ bfq_entity_service_tree(struct bfq_entity *entity) struct bfq_sched_data *sched_data = entity->sched_data; struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : - BFQ_DEFAULT_GRP_CLASS; + BFQ_DEFAULT_GRP_CLASS - 1; BUG_ON(idx >= BFQ_IOPRIO_CLASSES); BUG_ON(sched_data == NULL); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "entity_service_tree %p %d", + sched_data->service_tree + idx, idx); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "entity_service_tree %p %d", + sched_data->service_tree + idx, idx); + } +#endif return sched_data->service_tree + idx; } @@ -791,47 +849,6 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) return bic->icq.q->elevator->elevator_data; } -/** - * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. - * @ptr: a pointer to a bfqd. - * @flags: storage for the flags to be saved. - * - * This function allows bfqg->bfqd to be protected by the - * queue lock of the bfqd they reference; the pointer is dereferenced - * under RCU, so the storage for bfqd is assured to be safe as long - * as the RCU read side critical section does not end. 
After the - * bfqd->queue->queue_lock is taken the pointer is rechecked, to be - * sure that no other writer accessed it. If we raced with a writer, - * the function returns NULL, with the queue unlocked, otherwise it - * returns the dereferenced pointer, with the queue locked. - */ -static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) -{ - struct bfq_data *bfqd; - - rcu_read_lock(); - bfqd = rcu_dereference(*(struct bfq_data **)ptr); - - if (bfqd != NULL) { - spin_lock_irqsave(bfqd->queue->queue_lock, *flags); - if (ptr == NULL) - printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); - else if (*ptr == bfqd) - goto out; - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); - } - - bfqd = NULL; -out: - rcu_read_unlock(); - return bfqd; -} - -static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) -{ - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -} - #ifdef CONFIG_BFQ_GROUP_IOSCHED static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) @@ -857,11 +874,13 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); static void bfq_put_queue(struct bfq_queue *bfqq); static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, int is_sync, - struct bfq_io_cq *bic, gfp_t gfp_mask); + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic); static void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +#ifdef CONFIG_BFQ_GROUP_IOSCHED static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +#endif static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); #endif /* _BFQ_H */ -- 2.7.4
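Going back to the observation-interval fields added to struct bfq_data in the hunks above (first_dispatch, last_dispatch, tot_sectors_dispatched, sequential_samples, delta_from_first, peak_rate): together they let the scheduler estimate the device peak rate as sectors transferred over elapsed time. The sketch below is only meant to show how those fields relate; the fixed-point shift and the "mostly sequential" threshold are assumptions of the example, and the real update logic is in bfq-iosched.c.

#include <linux/math64.h>

#define SKETCH_RATE_SHIFT 16 /* assumed fixed-point precision */

/* illustrative only: derive a rate sample from one observation interval */
static void sketch_update_peak_rate(struct bfq_data *bfqd)
{
	u64 rate;

	if (!bfqd->delta_from_first || !bfqd->peak_rate_samples)
		return;

	/* observed rate: (sectors << SKETCH_RATE_SHIFT) per microsecond */
	rate = div64_u64(bfqd->tot_sectors_dispatched << SKETCH_RATE_SHIFT,
			 bfqd->delta_from_first);

	/* trust the interval only if most dispatches were sequential */
	if (4 * bfqd->sequential_samples >= 3 * (u32)bfqd->peak_rate_samples &&
	    rate > bfqd->peak_rate)
		bfqd->peak_rate = rate;
}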
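For readers unfamiliar with the BFQ_BFQQ_FNS() generator shown above: expanding it for one flag, say BFQ_BFQQ_FNS(busy), produces (for the part visible in the quoted context lines) the test accessor below; the same macro also emits the matching mark/clear helpers, which fall outside the quoted hunk.

/* what the visible part of BFQ_BFQQ_FNS(busy) expands to */
static int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}

Call sites can then test a flag with bfq_bfqq_busy(bfqq) instead of open-coding the bit manipulation.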
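The logging macros above prefix trace messages with the queue's pid, a sync/async marker and, when CONFIG_BFQ_GROUP_IOSCHED is set, the blkcg path of the queue's group. A typical call site looks like the sketch below; the message text is invented for the example, and the group variant expects the queue lock to be held, as its assert shows.

/* illustrative call site for the trace helpers defined above */
static void sketch_trace_dispatch(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "dispatched, rq_in_driver %d",
		     bfqd->rq_in_driver);
	bfq_log(bfqd, "busy_queues %d", bfqd->busy_queues);
}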