diff -Naurp linux-2.6.4/arch/i386/kernel/smpboot.c linux-2.6.4-ck1/arch/i386/kernel/smpboot.c
--- linux-2.6.4/arch/i386/kernel/smpboot.c	2004-03-11 22:25:06.951671222 +1100
+++ linux-2.6.4-ck1/arch/i386/kernel/smpboot.c	2004-03-11 22:25:48.405213977 +1100
@@ -1159,8 +1159,12 @@ __init void arch_init_sched_domains(void
 		int j;
 		first_cpu = last_cpu = NULL;
 
-		if (i != first_cpu(cpu_domain->span))
+		if (i != first_cpu(cpu_domain->span)) {
+			cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER;
+			cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
+				SD_FLAG_SHARE_CPUPOWER;
 			continue;
+		}	/* flag set on both cpu i and the span's first cpu */
 
 		for_each_cpu_mask(j, cpu_domain->span) {
 			struct sched_group *cpu = &sched_group_cpus[j];
@@ -1279,8 +1283,12 @@ __init void arch_init_sched_domains(void
 		int j;
 		first_cpu = last_cpu = NULL;
 
-		if (i != first_cpu(cpu_domain->span))
+		if (i != first_cpu(cpu_domain->span)) {
+			cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER;
+			cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
+				SD_FLAG_SHARE_CPUPOWER;
 			continue;
+		}	/* flag set on both cpu i and the span's first cpu */
 
 		for_each_cpu_mask(j, cpu_domain->span) {
 			struct sched_group *cpu = &sched_group_cpus[j];
diff -Naurp linux-2.6.4/include/linux/sched.h linux-2.6.4-ck1/include/linux/sched.h
--- linux-2.6.4/include/linux/sched.h	2004-03-11 22:25:06.956670443 +1100
+++ linux-2.6.4-ck1/include/linux/sched.h	2004-03-11 22:25:48.407213666 +1100
@@ -537,6 +537,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define SD_FLAG_EXEC		2	/* Balance on exec */
 #define SD_FLAG_WAKE		4	/* Balance on task wakeup */
 #define SD_FLAG_FASTMIGRATE	8	/* Sync wakes put task on waking CPU */
+#define SD_FLAG_SHARE_CPUPOWER	16	/* Domain members share cpu power */
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
@@ -562,6 +563,7 @@ struct sched_domain {
 	unsigned int imbalance_pct;	/* No balance until over watermark */
 	unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
 	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
+	unsigned int per_cpu_gain;	/* CPU % gained by adding domain cpus */
 	int flags;			/* See SD_FLAG_* */
 
 	/* Runtime fields. */
@@ -581,6 +583,7 @@ struct sched_domain {
 	.imbalance_pct		= 110,			\
 	.cache_hot_time		= 0,			\
 	.cache_nice_tries	= 0,			\
+	.per_cpu_gain		= 15,			\
 	.flags			= SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE | SD_FLAG_WAKE,\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
@@ -598,6 +601,7 @@ struct sched_domain {
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (5*1000000/2),	\
 	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
 	.flags			= SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE,\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
@@ -616,6 +620,7 @@ struct sched_domain {
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
 	.flags			= SD_FLAG_EXEC,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -Naurp linux-2.6.4/kernel/sched.c linux-2.6.4-ck1/kernel/sched.c
--- linux-2.6.4/kernel/sched.c	2004-03-11 22:25:06.962669508 +1100
+++ linux-2.6.4-ck1/kernel/sched.c	2004-03-11 22:25:48.411213043 +1100
@@ -207,9 +207,8 @@ struct runqueue {
 	struct mm_struct *prev_mm;
 	prio_array_t *active, *expired, arrays[2];
 	int best_expired_prio;
-
+	int cpu;		/* This runqueue's cpu; set in sched_init() */
 	atomic_t nr_iowait;
-
 #ifdef CONFIG_SMP
 	unsigned long cpu_load[NR_CPUS];
 #endif
@@ -1765,6 +1764,25 @@ static inline void rebalance_tick(int th
 }
 #endif
 
+#ifdef CONFIG_SCHED_SMT
+static inline int wake_priority_sleeper(runqueue_t *rq)
+{	/*
+	 * If an SMT sibling task has been put to sleep for priority
+	 * reasons reschedule the idle task to see if it can now run.
+	 */
+	if (rq->nr_running) {
+		resched_task(rq->idle);
+		return 1;	/* tells scheduler_tick() a resched was requested */
+	}
+	return 0;
+}
+#else	/* !CONFIG_SCHED_SMT: no-op stub */
+static inline int wake_priority_sleeper(runqueue_t *rq)
+{
+	return 0;
+}
+#endif	/* CONFIG_SCHED_SMT */
+
 DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
@@ -1818,6 +1836,8 @@ void scheduler_tick(int user_ticks, int
 			cpustat->iowait += sys_ticks;
 		else
 			cpustat->idle += sys_ticks;
+		if (wake_priority_sleeper(rq))
+			goto out;	/* rebalance as NOT_IDLE instead of IDLE */
 		rebalance_tick(cpu, rq, IDLE);
 		return;
 	}
@@ -1905,6 +1925,91 @@ out:
 	rebalance_tick(cpu, rq, NOT_IDLE);
 }
 
+#ifdef CONFIG_SCHED_SMT
+static inline void wake_sleeping_dependent(runqueue_t *rq)
+{	/* Kick idle siblings of rq's cpu that still have tasks queued. */
+	int i, this_cpu = rq->cpu;
+	struct sched_domain *sd = cpu_sched_domain(this_cpu);
+	cpumask_t sibling_map;
+
+	if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) {
+		/* Not SMT */
+		return;
+	}
+
+	cpus_and(sibling_map, sd->span, cpu_online_map);
+	cpu_clear(this_cpu, sibling_map);	/* online siblings, not us */
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq;
+
+		smt_rq = cpu_rq(i);
+
+		/*
+		 * If an SMT sibling task is sleeping due to priority
+		 * reasons wake it up now.
+		 */
+		if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
+			resched_task(smt_rq->idle);
+	}
+}
+
+static inline int dependent_sleeper(runqueue_t *rq, task_t *p)
+{	/* Nonzero return: schedule() runs rq->idle instead of p. */
+	int ret = 0, i, this_cpu = rq->cpu;
+	struct sched_domain *sd = cpu_sched_domain(this_cpu);
+	cpumask_t sibling_map;
+
+	if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) {
+		/* Not SMT */
+		return 0;
+	}
+
+	cpus_and(sibling_map, sd->span, cpu_online_map);
+	cpu_clear(this_cpu, sibling_map);	/* online siblings, not us */
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq;
+		task_t *smt_curr;
+
+		smt_rq = cpu_rq(i);
+		smt_curr = smt_rq->curr;
+
+		/*
+		 * If a user task with lower static priority than the
+		 * running task on the SMT sibling is trying to schedule,
+		 * delay it till there is proportionately less timeslice
+		 * left of the sibling task to prevent a lower priority
+		 * task from using an unfair proportion of the
+		 * physical cpu's resources. -ck
+		 */
+		if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
+			task_timeslice(p) || rt_task(smt_curr)) &&
+			p->mm && smt_curr->mm && !rt_task(p))
+				ret |= 1;	/* hold p back */
+
+		/*
+		 * Reschedule a lower priority task on the SMT sibling,
+		 * or wake it up if it has been put to sleep for priority
+		 * reasons.
+		 */
+		if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
+			task_timeslice(smt_curr) || rt_task(p)) &&
+			smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
+			(smt_curr == smt_rq->idle && smt_rq->nr_running))
+				resched_task(smt_curr);
+	}
+	return ret;
+}
+#else	/* !CONFIG_SCHED_SMT: no-op stubs */
+static inline void wake_sleeping_dependent(runqueue_t *rq)
+{
+}
+
+static inline int dependent_sleeper(runqueue_t *rq, task_t *p)
+{
+	return 0;
+}
+#endif	/* CONFIG_SCHED_SMT */
+
 void scheduling_functions_start_here(void) { }
 
 /*
@@ -1976,6 +2081,7 @@ need_resched:
 		if (!rq->nr_running) {
 			next = rq->idle;
 			rq->expired_timestamp = 0;
+			wake_sleeping_dependent(rq);
 			goto switch_tasks;
 		}
 	}
@@ -1996,6 +2102,11 @@ need_resched:
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
 
+	if (dependent_sleeper(rq, next)) {
+		next = rq->idle;	/* run idle instead of the chosen task */
+		goto switch_tasks;
+	}
+
 	if (next->activated > 0) {
 		unsigned long long delta = now - next->timestamp;
 
@@ -3417,6 +3528,7 @@ void __init sched_init(void)
 #endif
 
 		rq = cpu_rq(i);
+		rq->cpu = i;	/* read back as rq->cpu by the SMT helpers */
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		rq->best_expired_prio = MAX_PRIO;