--- init/Kconfig | 2 kernel/sched/bfs.c | 134 +++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 125 insertions(+), 11 deletions(-) Index: linux-3.7-bfs/init/Kconfig =================================================================== --- linux-3.7-bfs.orig/init/Kconfig 2012-12-12 20:57:48.000000000 +1100 +++ linux-3.7-bfs/init/Kconfig 2012-12-12 21:19:34.585159323 +1100 @@ -366,11 +366,11 @@ config VIRT_CPU_ACCOUNTING small performance impact. In the case of s390 or IBM POWER > 5, this also enables accounting of stolen time on logically-partitioned systems. +endchoice config IRQ_TIME_ACCOUNTING def_bool y -endchoice config BSD_PROCESS_ACCT bool "BSD Process Accounting" Index: linux-3.7-bfs/kernel/sched/bfs.c =================================================================== --- linux-3.7-bfs.orig/kernel/sched/bfs.c 2012-12-12 20:54:47.000000000 +1100 +++ linux-3.7-bfs/kernel/sched/bfs.c 2012-12-12 21:31:24.281203137 +1100 @@ -1188,7 +1188,7 @@ static inline void return_task(struct ta #ifdef CONFIG_SMP #ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#define tsk_is_polling(t) 0 #endif static void resched_task(struct task_struct *p) @@ -1927,14 +1927,9 @@ static inline void finish_task_switch(st * Manfred Spraul */ prev_state = prev->state; + vtime_task_switch(prev); finish_arch_switch(prev); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_disable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ perf_event_task_sched_in(prev, current); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -2012,6 +2007,7 @@ context_switch(struct rq *rq, struct tas #endif /* Here we just switch the register state and the stack. */ + rcu_switch(prev, next); switch_to(prev, next, prev); barrier(); @@ -2216,7 +2212,7 @@ static inline u64 irq_time_read(int cpu) * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void account_system_vtime(struct task_struct *curr) +void vtime_account(struct task_struct *curr) { unsigned long flags; s64 delta; @@ -2246,7 +2242,7 @@ void account_system_vtime(struct task_st irq_time_write_end(); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(account_system_vtime); +EXPORT_SYMBOL_GPL(vtime_account); #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ @@ -3194,6 +3190,40 @@ static void reset_rq_task(struct rq *rq, /* * schedule() is the main scheduler function. + * + * The main means of driving the scheduler and thus entering this function are: + * + * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. + * + * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return + * paths. For example, see arch/x86/entry_64.S. + * + * To drive preemption between tasks, the scheduler sets the flag in timer + * interrupt handler scheduler_tick(). + * + * 3. Wakeups don't really cause entry into schedule(). They add a + * task to the run-queue and that's it. + * + * Now, if the new task added to the run-queue preempts the current + * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets + * called on the nearest possible occasion: + * + * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * + * - in syscall or exception context, at the next outmost + * preempt_enable(). (this might be as soon as the wake_up()'s + * spin_unlock()!) + * + * - in IRQ context, return from interrupt-handler to + * preemptible context + * + * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * then at the next: + * + * - cond_resched() call + * - explicit schedule() call + * - return from syscall or exception to user-space + * - return from interrupt-handler to user-space */ asmlinkage void __sched schedule(void) { @@ -3341,6 +3371,21 @@ rerun_prev_unlocked: } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_RCU_USER_QS +asmlinkage void __sched schedule_user(void) +{ + /* + * If we come here after a random call to set_need_resched(), + * or we have been woken up remotely but the IPI has not yet arrived, + * we haven't yet exited the RCU idle mode. Do it here manually until + * we find a better solution. + */ + rcu_user_exit(); + schedule(); + rcu_user_enter(); +} +#endif + /** * schedule_preempt_disabled - called with preemption disabled * @@ -3439,6 +3484,7 @@ asmlinkage void __sched preempt_schedule /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); + rcu_user_exit(); do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); @@ -6396,7 +6442,6 @@ sd_numa_init(struct sched_domain_topolog | 0*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE | 0*SD_WAKE_AFFINE - | 0*SD_PREFER_LOCAL | 0*SD_SHARE_CPUPOWER | 0*SD_SHARE_PKG_RESOURCES | 1*SD_SERIALIZE @@ -6519,6 +6564,17 @@ static void sched_init_numa(void) * numbers. */ + /* + * Here, we should temporarily reset sched_domains_numa_levels to 0. + * If it fails to allocate memory for array sched_domains_numa_masks[][], + * the array will contain less then 'level' members. This could be + * dangerous when we use it to iterate array sched_domains_numa_masks[][] + * in other functions. + * + * We reset it to 'level' at the end of this function. + */ + sched_domains_numa_levels = 0; + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -6573,11 +6629,68 @@ static void sched_init_numa(void) } sched_domain_topology = tl; + + sched_domains_numa_levels = level; +} + +static void sched_domains_numa_masks_set(int cpu) +{ + int i, j; + int node = cpu_to_node(cpu); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +static void sched_domains_numa_masks_clear(int cpu) +{ + int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +/* + * Update sched_domains_numa_masks[level][node] array when new cpus + * are onlined. + */ +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + sched_domains_numa_masks_set(cpu); + break; + + case CPU_DEAD: + sched_domains_numa_masks_clear(cpu); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; } #else static inline void sched_init_numa(void) { } + +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + return 0; +} #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -7034,6 +7147,7 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); + hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);