diff -Naurp linux-2.6.4-base/arch/i386/Kconfig linux-2.6.4-stair5.2/arch/i386/Kconfig --- linux-2.6.4-base/arch/i386/Kconfig 2004-03-11 21:28:53.000000000 +1100 +++ linux-2.6.4-stair5.2/arch/i386/Kconfig 2004-03-30 19:25:27.000000000 +1000 @@ -478,6 +478,16 @@ config NR_CPUS This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default off + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with Intel Pentium 4 chips with HyperThreading at a + cost of slightly increased overhead in some places. If unsure say + N here. + config PREEMPT bool "Preemptible Kernel" help diff -Naurp linux-2.6.4-base/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c linux-2.6.4-stair5.2/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c --- linux-2.6.4-base/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2004-03-11 21:28:53.000000000 +1100 +++ linux-2.6.4-stair5.2/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2004-03-30 19:25:27.000000000 +1000 @@ -57,8 +57,7 @@ static int cpufreq_p4_setdc(unsigned int u32 l, h; cpumask_t cpus_allowed, affected_cpu_map; struct cpufreq_freqs freqs; - int hyperthreading = 0; - int sibling = 0; + int j; if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV)) @@ -68,13 +67,10 @@ static int cpufreq_p4_setdc(unsigned int cpus_allowed = current->cpus_allowed; /* only run on CPU to be set, or on its sibling */ - affected_cpu_map = cpumask_of_cpu(cpu); -#ifdef CONFIG_X86_HT - hyperthreading = ((cpu_has_ht) && (smp_num_siblings == 2)); - if (hyperthreading) { - sibling = cpu_sibling_map[cpu]; - cpu_set(sibling, affected_cpu_map); - } +#ifdef CONFIG_SMP + affected_cpu_map = cpu_sibling_map[cpu]; +#else + affected_cpu_map = cpumask_of_cpu(cpu); #endif set_cpus_allowed(current, affected_cpu_map); BUG_ON(!cpu_isset(smp_processor_id(), affected_cpu_map)); @@ -97,11 +93,11 @@ static int cpufreq_p4_setdc(unsigned int /* notifiers */ freqs.old = stock_freq * l / 8; freqs.new = stock_freq * newstate / 8; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - if (hyperthreading) { - freqs.cpu = sibling; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + for_each_cpu(j) { + if (cpu_isset(j, affected_cpu_map)) { + freqs.cpu = j; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } } rdmsr(MSR_IA32_THERM_STATUS, l, h); @@ -132,10 +128,11 @@ static int cpufreq_p4_setdc(unsigned int set_cpus_allowed(current, cpus_allowed); /* notifiers */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - if (hyperthreading) { - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + for_each_cpu(j) { + if (cpu_isset(j, affected_cpu_map)) { + freqs.cpu = j; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } } return 0; diff -Naurp linux-2.6.4-base/arch/i386/kernel/io_apic.c linux-2.6.4-stair5.2/arch/i386/kernel/io_apic.c --- linux-2.6.4-base/arch/i386/kernel/io_apic.c 2004-03-11 21:28:53.000000000 +1100 +++ linux-2.6.4-stair5.2/arch/i386/kernel/io_apic.c 2004-03-30 19:25:27.000000000 +1000 @@ -317,8 +317,7 @@ struct irq_cpu_info { #define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) -#define CPU_TO_PACKAGEINDEX(i) \ - ((physical_balance && i > cpu_sibling_map[i]) ? cpu_sibling_map[i] : i) +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) @@ -401,6 +400,7 @@ static void do_irq_balance(void) unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); unsigned long move_this_load = 0; int max_loaded = 0, min_loaded = 0; + int load; unsigned long useful_load_threshold = balanced_irq_interval + 10; int selected_irq; int tmp_loaded, first_attempt = 1; @@ -452,7 +452,7 @@ static void do_irq_balance(void) for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - if (physical_balance && i > cpu_sibling_map[i]) + if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (min_cpu_irq > CPU_IRQ(i)) { min_cpu_irq = CPU_IRQ(i); @@ -471,7 +471,7 @@ tryanothercpu: for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - if (physical_balance && i > cpu_sibling_map[i]) + if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (max_cpu_irq <= CPU_IRQ(i)) continue; @@ -551,9 +551,14 @@ tryanotherirq: * We seek the least loaded sibling by making the comparison * (A+B)/2 vs B */ - if (physical_balance && (CPU_IRQ(min_loaded) >> 1) > - CPU_IRQ(cpu_sibling_map[min_loaded])) - min_loaded = cpu_sibling_map[min_loaded]; + load = CPU_IRQ(min_loaded) >> 1; + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + if (load > CPU_IRQ(j)) { + /* This won't change cpu_sibling_map[min_loaded] */ + load = CPU_IRQ(j); + min_loaded = j; + } + } cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]); target_cpu_mask = cpumask_of_cpu(min_loaded); diff -Naurp linux-2.6.4-base/arch/i386/kernel/smpboot.c linux-2.6.4-stair5.2/arch/i386/kernel/smpboot.c --- linux-2.6.4-base/arch/i386/kernel/smpboot.c 2004-03-11 21:28:53.000000000 +1100 +++ linux-2.6.4-stair5.2/arch/i386/kernel/smpboot.c 2004-03-30 19:25:27.000000000 +1000 @@ -39,6 +39,7 @@ #include #include +#include #include #include #include @@ -934,7 +935,7 @@ static int boot_cpu_logical_apicid; /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -int cpu_sibling_map[NR_CPUS] __cacheline_aligned; +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; static void __init smp_boot_cpus(unsigned int max_cpus) { @@ -953,6 +954,8 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); + cpus_clear(cpu_sibling_map[0]); + cpu_set(0, cpu_sibling_map[0]); /* * If we couldn't find an SMP configuration at boot time, @@ -1079,32 +1082,34 @@ static void __init smp_boot_cpus(unsigne Dprintk("Boot done.\n"); /* - * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so - * that we can tell the sibling CPU efficiently. + * construct cpu_sibling_map[], so that we can tell sibling CPUs + * efficiently. */ - if (cpu_has_ht && smp_num_siblings > 1) { - for (cpu = 0; cpu < NR_CPUS; cpu++) - cpu_sibling_map[cpu] = NO_PROC_ID; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - int i; - if (!cpu_isset(cpu, cpu_callout_map)) - continue; + for (cpu = 0; cpu < NR_CPUS; cpu++) + cpus_clear(cpu_sibling_map[cpu]); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + int siblings = 0; + int i; + if (!cpu_isset(cpu, cpu_callout_map)) + continue; + if (smp_num_siblings > 1) { for (i = 0; i < NR_CPUS; i++) { - if (i == cpu || !cpu_isset(i, cpu_callout_map)) + if (!cpu_isset(i, cpu_callout_map)) continue; if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_sibling_map[cpu] = i; - printk("cpu_sibling_map[%d] = %d\n", cpu, cpu_sibling_map[cpu]); - break; + siblings++; + cpu_set(i, cpu_sibling_map[cpu]); } } - if (cpu_sibling_map[cpu] == NO_PROC_ID) { - smp_num_siblings = 1; - printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu); - } + } else { + siblings++; + cpu_set(cpu, cpu_sibling_map[cpu]); } + + if (siblings != smp_num_siblings) + printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings); } smpboot_setup_io_apic(); @@ -1118,6 +1123,224 @@ static void __init smp_boot_cpus(unsigne synchronize_tsc_bp(); } +#ifdef CONFIG_SCHED_SMT +#ifdef CONFIG_NUMA +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static DEFINE_PER_CPU(struct sched_domain, node_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + + *cpu_domain = SD_SIBLING_INIT; + cpu_domain->span = cpu_sibling_map[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = nodemask; + + *node_domain = SD_NODE_INIT; + node_domain->span = cpu_possible_map; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + int j; + first_cpu = last_cpu = NULL; + + if (i != first_cpu(cpu_domain->span)) { + cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; + cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= + SD_FLAG_SHARE_CPUPOWER; + continue; + } + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpu->cpumask = CPU_MASK_NONE; + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + for (i = 0; i < MAX_NUMNODES; i++) { + int j; + cpumask_t nodemask; + struct sched_group *node = &sched_group_nodes[i]; + cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + first_cpu = last_cpu = NULL; + /* Set up physical groups */ + for_each_cpu_mask(j, nodemask) { + struct sched_domain *cpu_domain = cpu_sched_domain(j); + struct sched_group *cpu = &sched_group_phys[j]; + + if (j != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* + * Make each extra sibling increase power by 10% of + * the basic CPU. This is very arbitrary. + */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + node->cpu_power += cpu->cpu_power; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + /* Set up nodes */ + first_cpu = last_cpu = NULL; + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *cpu = &sched_group_nodes[i]; + cpumask_t nodemask; + cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + cpu->cpumask = nodemask; + /* ->cpu_power already setup */ + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + int node = cpu_to_node(i); + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + struct sched_group *cpu_group = &sched_group_cpus[i]; + struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; + struct sched_group *node_group = &sched_group_nodes[node]; + + cpu_domain->parent = phys_domain; + phys_domain->parent = node_domain; + + node_domain->groups = node_group; + phys_domain->groups = phys_group; + cpu_domain->groups = cpu_group; + } +} +#else /* CONFIG_NUMA */ +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + + *cpu_domain = SD_SIBLING_INIT; + cpu_domain->span = cpu_sibling_map[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = cpu_possible_map; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + int j; + first_cpu = last_cpu = NULL; + + if (i != first_cpu(cpu_domain->span)) { + cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; + cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= + SD_FLAG_SHARE_CPUPOWER; + continue; + } + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + first_cpu = last_cpu = NULL; + /* Set up physical groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_group *cpu = &sched_group_phys[i]; + + if (i != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* See SMT+NUMA setup for comment */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_group *cpu_group = &sched_group_cpus[i]; + struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; + cpu_domain->parent = phys_domain; + phys_domain->groups = phys_group; + cpu_domain->groups = cpu_group; + } +} +#endif /* CONFIG_NUMA */ +#endif /* CONFIG_SCHED_SMT */ + /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) diff -Naurp linux-2.6.4-base/arch/i386/oprofile/op_model_p4.c linux-2.6.4-stair5.2/arch/i386/oprofile/op_model_p4.c --- linux-2.6.4-base/arch/i386/oprofile/op_model_p4.c 2004-03-11 21:28:53.000000000 +1100 +++ linux-2.6.4-stair5.2/arch/i386/oprofile/op_model_p4.c 2004-03-30 19:25:27.000000000 +1000 @@ -382,11 +382,8 @@ static struct p4_event_binding p4_events static unsigned int get_stagger(void) { #ifdef CONFIG_SMP - int cpu; - if (smp_num_siblings > 1) { - cpu = smp_processor_id(); - return (cpu_sibling_map[cpu] > cpu) ? 0 : 1; - } + int cpu = smp_processor_id(); + return (cpu != first_cpu(cpu_sibling_map[cpu])); #endif return 0; } diff -Naurp linux-2.6.4-base/arch/ppc64/Kconfig linux-2.6.4-stair5.2/arch/ppc64/Kconfig --- linux-2.6.4-base/arch/ppc64/Kconfig 2004-03-11 21:28:54.000000000 +1100 +++ linux-2.6.4-stair5.2/arch/ppc64/Kconfig 2004-03-30 19:25:27.000000000 +1000 @@ -173,6 +173,16 @@ config NUMA bool "NUMA support" depends on DISCONTIGMEM +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default off + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with Intel Pentium 4 chips with HyperThreading at a + cost of slightly increased overhead in some places. If unsure say + N here. + config PREEMPT bool help diff -Naurp linux-2.6.4-base/arch/ppc64/kernel/smp.c linux-2.6.4-stair5.2/arch/ppc64/kernel/smp.c --- linux-2.6.4-base/arch/ppc64/kernel/smp.c 2004-03-11 21:28:54.000000000 +1100 +++ linux-2.6.4-stair5.2/arch/ppc64/kernel/smp.c 2004-03-30 19:25:27.000000000 +1000 @@ -796,3 +796,225 @@ static int __init topology_init(void) return 0; } __initcall(topology_init); + +#ifdef CONFIG_SCHED_SMT +#ifdef CONFIG_NUMA +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static DEFINE_PER_CPU(struct sched_domain, node_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + cpumask_t tmp1 = cpumask_of_cpu(i ^ 0x1); + cpumask_t tmp2 = cpumask_of_cpu(i); + + *cpu_domain = SD_SIBLING_INIT; + cpus_or(cpu_domain->span, tmp1, tmp2); + + *phys_domain = SD_CPU_INIT; + phys_domain->span = nodemask; + + *node_domain = SD_NODE_INIT; + node_domain->span = cpu_possible_map; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + int j; + first_cpu = last_cpu = NULL; + + if (i != first_cpu(cpu_domain->span)) { + cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; + cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= + SD_FLAG_SHARE_CPUPOWER; + continue; + } + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + for (i = 0; i < MAX_NUMNODES; i++) { + int j; + cpumask_t nodemask; + struct sched_group *node = &sched_group_nodes[i]; + cpumask_t node_cpumask = node_to_cpumask(i); + cpus_and(nodemask, node_cpumask, cpu_online_map); + + if (cpus_empty(nodemask)) + continue; + + first_cpu = last_cpu = NULL; + /* Set up physical groups */ + for_each_cpu_mask(j, nodemask) { + struct sched_domain *cpu_domain = cpu_sched_domain(j); + struct sched_group *cpu = &sched_group_phys[j]; + + if (j != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* + * Make each extra sibling increase power by 10% of + * the basic CPU. This is very arbitrary. + */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + node->cpu_power += cpu->cpu_power; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + /* Set up nodes */ + first_cpu = last_cpu = NULL; + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *cpu = &sched_group_nodes[i]; + cpumask_t nodemask; + cpumask_t node_cpumask = node_to_cpumask(i); + cpus_and(nodemask, node_cpumask, cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + cpu->cpumask = nodemask; + /* ->cpu_power already setup */ + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + int node = cpu_to_node(i); + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + struct sched_group *cpu_group = &sched_group_cpus[i]; + struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; + struct sched_group *node_group = &sched_group_nodes[node]; + + cpu_domain->parent = phys_domain; + phys_domain->parent = node_domain; + + node_domain->groups = node_group; + phys_domain->groups = phys_group; + cpu_domain->groups = cpu_group; + } +} +#else /* CONFIG_NUMA */ +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + + *cpu_domain = SD_SIBLING_INIT; + cpu_domain->span = cpu_sibling_map[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = cpu_possible_map; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + int j; + first_cpu = last_cpu = NULL; + + if (i != first_cpu(cpu_domain->span)) { + cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; + cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= + SD_FLAG_SHARE_CPUPOWER; + continue; + } + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + first_cpu = last_cpu = NULL; + /* Set up physical groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_group *cpu = &sched_group_phys[i]; + + if (i != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* See SMT+NUMA setup for comment */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_group *cpu_group = &sched_group_cpus[i]; + struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; + cpu_domain->parent = phys_domain; + phys_domain->groups = phys_group; + cpu_domain->groups = cpu_group; + } +} +#endif /* CONFIG_NUMA */ +#endif /* CONFIG_SCHED_SMT */ diff -Naurp linux-2.6.4-base/Documentation/sched-domains.txt linux-2.6.4-stair5.2/Documentation/sched-domains.txt --- linux-2.6.4-base/Documentation/sched-domains.txt 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-stair5.2/Documentation/sched-domains.txt 2004-03-30 19:25:27.528679942 +1000 @@ -0,0 +1,55 @@ +Each CPU has a "base" scheduling domain (struct sched_domain). These are +accessed via cpu_sched_domain(i) and this_sched_domain() macros. The domain +hierarchy is built from these base domains via the ->parent pointer. ->parent +MUST be NULL terminated, and domain structures should be per-CPU as they +are locklessly updated. + +Each scheduling domain spans a number of CPUs (stored in the ->span field). +A domain's span MUST be a superset of it child's span, and a base domain +for CPU i MUST span at least i. The top domain for each CPU will generally +span all CPUs in the system although strictly it doesn't have to, but this +could lead to a case where some CPUs will never be given tasks to run unless +the CPUs allowed mask is explicitly set. A sched domain's span means "balance +process load among these CPUs". + +Each scheduling domain must have one or more CPU groups (struct sched_group) +which are organised as a circular one way linked list from the ->groups +pointer. The union of cpumasks of these groups MUST be the same as the +domain's span. The intersection of cpumasks from any two of these groups +MUST be the empty set. The group pointed to by the ->groups pointer MUST +contain the CPU to which the domain belongs. Groups may be shared among +CPUs as they contain read only data after they have been set up. + +Balancing within a sched domain occurs between groups. That is, each group +is treated as one entity. The load of a group is defined as the sum of the +load of each of its member CPUs, and only when the load of a group becomes +out of balance are tasks moved between groups. + +In kernel/sched.c, rebalance_tick is run periodically on each CPU. This +function takes its CPU's base sched domain and checks to see if has reached +its rebalance interval. If so, then it will run load_balance on that domain. +rebalance_tick then checks the parent sched_domain (if it exists), and the +parent of the parent and so forth. + +*** Implementing sched domains *** +The "base" domain will "span" the first level of the hierarchy. In the case +of SMT, you'll span all siblings of the physical CPU, with each group being +a single virtual CPU. + +In SMP, the parent of the base domain will span all physical CPUs in the +node. Each group being a single physical CPU. Then with NUMA, the parent +of the SMP domain will span the entire machine, with each group having the +cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example, +might have just one domain covering its one NUMA level. + +The implementor should read comments in include/linux/sched.h: +struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of +the specifics and what to tune. + +Implementors should change the line +#undef SCHED_DOMAIN_DEBUG +to +#define SCHED_DOMAIN_DEBUG +in kernel/sched.c as this enables an error checking parse of the sched domains +which should catch most possible errors (described above). It also prints out +the domain structure in a visual format. diff -Naurp linux-2.6.4-base/fs/proc/array.c linux-2.6.4-stair5.2/fs/proc/array.c --- linux-2.6.4-base/fs/proc/array.c 2004-03-11 21:28:51.000000000 +1100 +++ linux-2.6.4-stair5.2/fs/proc/array.c 2004-03-30 19:25:27.528679942 +1000 @@ -154,7 +154,6 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -162,7 +161,6 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1000000000/1024), p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, diff -Naurp linux-2.6.4-base/include/asm-i386/processor.h linux-2.6.4-stair5.2/include/asm-i386/processor.h --- linux-2.6.4-base/include/asm-i386/processor.h 2004-03-11 21:29:30.000000000 +1100 +++ linux-2.6.4-stair5.2/include/asm-i386/processor.h 2004-03-30 19:25:27.529679785 +1000 @@ -646,4 +646,9 @@ extern inline void prefetchw(const void extern void select_idle_routine(const struct cpuinfo_x86 *c); +#ifdef CONFIG_SCHED_SMT +#define ARCH_HAS_SCHED_DOMAIN +#define ARCH_HAS_SCHED_WAKE_BALANCE +#endif + #endif /* __ASM_I386_PROCESSOR_H */ diff -Naurp linux-2.6.4-base/include/asm-i386/smp.h linux-2.6.4-stair5.2/include/asm-i386/smp.h --- linux-2.6.4-base/include/asm-i386/smp.h 2004-03-11 21:29:30.000000000 +1100 +++ linux-2.6.4-stair5.2/include/asm-i386/smp.h 2004-03-30 19:25:27.529679785 +1000 @@ -34,7 +34,7 @@ extern void smp_alloc_memory(void); extern int pic_mode; extern int smp_num_siblings; -extern int cpu_sibling_map[]; +extern cpumask_t cpu_sibling_map[]; extern void smp_flush_tlb(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); diff -Naurp linux-2.6.4-base/include/asm-ppc64/processor.h linux-2.6.4-stair5.2/include/asm-ppc64/processor.h --- linux-2.6.4-base/include/asm-ppc64/processor.h 2004-03-11 21:29:29.000000000 +1100 +++ linux-2.6.4-stair5.2/include/asm-ppc64/processor.h 2004-03-30 19:25:27.530679628 +1000 @@ -618,6 +618,11 @@ static inline void prefetchw(const void #define spin_lock_prefetch(x) prefetchw(x) +#ifdef CONFIG_SCHED_SMT +#define ARCH_HAS_SCHED_DOMAIN +#define ARCH_HAS_SCHED_WAKE_BALANCE +#endif + #endif /* ASSEMBLY */ #endif /* __ASM_PPC64_PROCESSOR_H */ diff -Naurp linux-2.6.4-base/include/linux/sched.h linux-2.6.4-stair5.2/include/linux/sched.h --- linux-2.6.4-base/include/linux/sched.h 2004-03-11 21:29:26.000000000 +1100 +++ linux-2.6.4-stair5.2/include/linux/sched.h 2004-03-30 19:25:27.000000000 +1000 @@ -147,6 +147,7 @@ extern spinlock_t mmlist_lock; typedef struct task_struct task_t; extern void sched_init(void); +extern void sched_init_smp(void); extern void init_idle(task_t *idle, int cpu); extern void show_state(void); @@ -371,14 +372,13 @@ struct task_struct { struct list_head run_list; prio_array_t *array; - unsigned long sleep_avg; - long interactive_credit; unsigned long long timestamp; - int activated; + unsigned long runtime; + unsigned int deadline; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + unsigned int slice, time_slice, first_time_slice; struct list_head tasks; struct list_head ptrace_children; @@ -529,6 +529,108 @@ do { if (atomic_dec_and_test(&(tsk)->usa #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #ifdef CONFIG_SMP +#define SCHED_LOAD_SHIFT 7 /* increase resolution of load calculations */ +#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT) + +#define SD_FLAG_NEWIDLE 1 /* Balance when about to become idle */ +#define SD_FLAG_EXEC 2 /* Balance on exec */ +#define SD_FLAG_WAKE 4 /* Balance on task wakeup */ +#define SD_FLAG_FASTMIGRATE 8 /* Sync wakes put task on waking CPU */ +#define SD_FLAG_SHARE_CPUPOWER 16 /* Domain members share cpu power */ + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + cpumask_t cpumask; + + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. This should be read only (except for setup). Although + * it will need to be written to at cpu hot(un)plug time, perhaps the + * cpucontrol semaphore will provide enough exclusion? + */ + unsigned long cpu_power; +}; + +struct sched_domain { + /* These fields must be setup */ + struct sched_domain *parent; /* top domain must be null terminated */ + struct sched_group *groups; /* the balancing groups of the domain */ + cpumask_t span; /* span of all CPUs in this domain */ + unsigned long min_interval; /* Minimum balance interval ms */ + unsigned long max_interval; /* Maximum balance interval ms */ + unsigned int busy_factor; /* less balancing by factor if busy */ + unsigned int imbalance_pct; /* No balance until over watermark */ + unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ + int flags; /* See SD_FLAG_* */ + + /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned int balance_interval; /* initialise to 1. units in ms. */ + unsigned int nr_balance_failed; /* initialise to 0 */ +}; + +/* Common values for SMT siblings */ +#define SD_SIBLING_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 8, \ + .imbalance_pct = 110, \ + .cache_hot_time = 0, \ + .cache_nice_tries = 0, \ + .per_cpu_gain = 15, \ + .flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE | SD_FLAG_WAKE,\ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +/* Common values for CPUs */ +#define SD_CPU_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_hot_time = (5*1000000/2), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE,\ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +#ifdef CONFIG_NUMA +/* Common values for NUMA nodes */ +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 8, \ + .max_interval = 256*fls(num_online_cpus()),\ + .busy_factor = 8, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000000), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_FLAG_EXEC, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif + +DECLARE_PER_CPU(struct sched_domain, base_domains); +#define cpu_sched_domain(cpu) (&per_cpu(base_domains, (cpu))) +#define this_sched_domain() (&__get_cpu_var(base_domains)) + extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); #else static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) @@ -541,10 +643,8 @@ extern unsigned long long sched_clock(vo #ifdef CONFIG_NUMA extern void sched_balance_exec(void); -extern void node_nr_running_init(void); #else #define sched_balance_exec() {} -#define node_nr_running_init() {} #endif extern void set_user_nice(task_t *p, long nice); diff -Naurp linux-2.6.4-base/init/main.c linux-2.6.4-stair5.2/init/main.c --- linux-2.6.4-base/init/main.c 2004-03-11 21:28:56.000000000 +1100 +++ linux-2.6.4-stair5.2/init/main.c 2004-03-30 19:25:27.000000000 +1000 @@ -568,7 +568,6 @@ static void do_pre_smp_initcalls(void) migration_init(); #endif - node_nr_running_init(); spawn_ksoftirqd(); } @@ -599,6 +598,7 @@ static int init(void * unused) do_pre_smp_initcalls(); smp_init(); + sched_init_smp(); do_basic_setup(); prepare_namespace(); diff -Naurp linux-2.6.4-base/kernel/sched.c linux-2.6.4-stair5.2/kernel/sched.c --- linux-2.6.4-base/kernel/sched.c 2004-03-11 21:29:21.000000000 +1100 +++ linux-2.6.4-stair5.2/kernel/sched.c 2004-03-30 19:41:23.000000000 +1000 @@ -15,6 +15,9 @@ * and per-CPU runqueues. Cleanups and useful suggestions * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-03-19. New staircase scheduling policy by Con Kolivas with help + * from Zwane Mwaikambo and useful suggestions by + * William Lee Irwin III. */ #include @@ -63,122 +66,17 @@ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\ - (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1))) -/* - * Some helpers for converting nanosecond timing to jiffy resolution - */ +//This is the time all tasks within the same priority round robin. +#define RR_INTERVAL (((10 * HZ / 1000) ? : 1) * num_online_cpus()) + #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -/* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, - * maximum timeslice is 200 msecs. Timeslices get refilled after - * they expire. - */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define NODE_THRESHOLD 125 -#define CREDIT_LIMIT 100 - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ - INTERACTIVE_DELTA) - -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - -#define HIGH_CREDIT(p) \ - ((p)->interactive_credit > CREDIT_LIMIT) - -#define LOW_CREDIT(p) \ - ((p)->interactive_credit < -CREDIT_LIMIT) - #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) /* - * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] - * to time slice values. - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - * - * task_timeslice() is the interface that is used by the scheduler. - */ - -#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ - ((MAX_TIMESLICE - MIN_TIMESLICE) * \ - (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) - -static inline unsigned int task_timeslice(task_t *p) -{ - return BASE_TIMESLICE(p); -} - -/* * These are the runqueue data structures: */ @@ -187,9 +85,9 @@ static inline unsigned int task_timeslic typedef struct runqueue runqueue_t; struct prio_array { - int nr_active; + unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; + struct list_head queue[MAX_PRIO + 1]; }; /* @@ -201,24 +99,32 @@ struct prio_array { */ struct runqueue { spinlock_t lock; - unsigned long nr_running, nr_switches, expired_timestamp, - nr_uninterruptible, timestamp_last_tick; + unsigned long nr_running, nr_switches, nr_uninterruptible; + unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio, prev_cpu_load[NR_CPUS]; -#ifdef CONFIG_NUMA - atomic_t *node_nr_running; - int prev_node_load[MAX_NUMNODES]; + prio_array_t array; + int cpu; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + unsigned long cpu_load[NR_CPUS]; #endif + /* For active balancing */ + int active_balance; + int push_cpu; + task_t *migration_thread; struct list_head migration_queue; - - atomic_t nr_iowait; }; static DEFINE_PER_CPU(struct runqueue, runqueues); +#ifdef CONFIG_SMP +/* Mandatory scheduling domains */ +DEFINE_PER_CPU(struct sched_domain, base_domains); +#endif + #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) @@ -233,51 +139,16 @@ static DEFINE_PER_CPU(struct runqueue, r # define task_running(rq, p) ((rq)->curr == (p)) #endif -#ifdef CONFIG_NUMA - -/* - * Keep track of running tasks. - */ - -static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp = - {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)}; - -static inline void nr_running_init(struct runqueue *rq) -{ - rq->node_nr_running = &node_nr_running[0]; -} - static inline void nr_running_inc(runqueue_t *rq) { - atomic_inc(rq->node_nr_running); rq->nr_running++; } static inline void nr_running_dec(runqueue_t *rq) { - atomic_dec(rq->node_nr_running); rq->nr_running--; } -__init void node_nr_running_init(void) -{ - int i; - - for (i = 0; i < NR_CPUS; i++) { - if (cpu_possible(i)) - cpu_rq(i)->node_nr_running = - &node_nr_running[cpu_to_node(i)]; - } -} - -#else /* !CONFIG_NUMA */ - -# define nr_running_init(rq) do { } while (0) -# define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) -# define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) - -#endif /* CONFIG_NUMA */ - /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without @@ -325,172 +196,127 @@ static inline void rq_unlock(runqueue_t /* * Adding/removing a task to/from a priority array: */ -static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +static inline void dequeue_task(struct task_struct *p, runqueue_t *rq) { + prio_array_t* array = &rq->array; array->nr_active--; list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); } -static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +static inline void enqueue_task(struct task_struct *p, runqueue_t *rq) { - list_add_tail(&p->run_list, array->queue + p->prio); + prio_array_t* array = &rq->array; + if (p->time_slice < RR_INTERVAL) { + // has been preempted, put at head of list. + list_add(&p->run_list, array->queue + p->prio); + if (p->time_slice < 2) + // make sure we run at least one tick if requeued. + p->time_slice = 2; + } else + list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; } /* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. - * - * Both properties are important to certain workloads. + * __activate_task - move a task to the runqueue. */ -static int effective_prio(task_t *p) +static inline void __activate_task(task_t *p, runqueue_t *rq) { - int bonus, prio; + enqueue_task(p, rq); + nr_running_inc(rq); +} - if (rt_task(p)) - return p->prio; +// deadline - the best deadline rank a task can have. +static inline unsigned int deadline(task_t *p) +{ + unsigned int deadline; + if (unlikely(rt_task(p))) + return p->deadline; + deadline = 40 - TASK_USER_PRIO(p); + return deadline; +} - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; +// slice - the duration a task runs before losing a deadline rank. +static inline unsigned int slice(task_t *p) +{ + unsigned int slice = RR_INTERVAL; + if (likely(!rt_task(p))) + slice *= deadline(p); + return slice; +} - prio = p->static_prio - bonus; +// effective_prio - dynamic priority dependent on deadline rank. +static inline int effective_prio(task_t *p) +{ + unsigned int prio; + if (unlikely(rt_task(p))) + return p->prio; + + prio = MAX_PRIO - 1 - p->deadline; if (prio < MAX_RT_PRIO) prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; return prio; } /* - * __activate_task - move a task to the runqueue. + * first_time_slice - is the duration a task spends at the highest step + * on the "staircase" of the scheduler. If it's deadline rank is less + * than the best rank the duration is longer to maintain appropriate + * cpu distribution. */ -static inline void __activate_task(task_t *p, runqueue_t *rq) +static inline unsigned int first_time_slice(task_t *p) { - enqueue_task(p, rq->active); - nr_running_inc(rq); + unsigned int time_slice = RR_INTERVAL; + if (likely(!rt_task(p))) + time_slice *= ((deadline(p) - p->deadline) + 1); + return time_slice; } -static void recalc_task_prio(task_t *p, unsigned long long now) +static inline int apparent_prio(task_t *p) { - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; - - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - AVG_TIMESLICE); - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - - /* - * Tasks with low interactive_credit are limited to - * one timeslice worth of sleep avg bonus. - */ - if (LOW_CREDIT(p) && - sleep_time > JIFFIES_TO_NS(task_timeslice(p))) - sleep_time = JIFFIES_TO_NS(task_timeslice(p)); - - /* - * Non high_credit tasks waking from uninterruptible - * sleep are limited in their sleep_avg rise as they - * are likely to be cpu hogs waiting on I/O - */ - if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; - - if (p->sleep_avg > NS_MAX_SLEEP_AVG) { - p->sleep_avg = NS_MAX_SLEEP_AVG; - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } - } - } - - p->prio = effective_prio(p); + return (MAX_PRIO - 1 - (p->slice / RR_INTERVAL)); } /* * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) */ static inline void activate_task(task_t *p, runqueue_t *rq) { unsigned long long now = sched_clock(); - - recalc_task_prio(p, now); - - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. - */ - if (!p->activated) { + unsigned long run_time, sleep_time; + unsigned int full_slice = slice(p); + p->slice = full_slice; + + if (likely(!rt_task(p))) { /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: + * This ensures that tasks running for less than one tick are + * treated the same as longer running tasks. */ - if (in_interrupt()) - p->activated = 2; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->activated = 1; + sleep_time = now - p->timestamp; + if (sleep_time > p->runtime) + p->runtime = 0; + else + p->runtime -= sleep_time; + run_time = NS_TO_JIFFIES(p->runtime); + + if (unlikely(run_time >= full_slice)) { + if (p->deadline) + p->deadline--; + p->runtime = 0; + } else { + if (run_time) + p->slice -= run_time; + else if (p->deadline < deadline(p)) + p->deadline++; } } + p->prio = effective_prio(p); + p->time_slice = first_time_slice(p); p->timestamp = now; - __activate_task(p, rq); } @@ -500,9 +326,13 @@ static inline void activate_task(task_t static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { nr_running_dec(rq); - if (p->state == TASK_UNINTERRUPTIBLE) + if (p->state == TASK_UNINTERRUPTIBLE) { rq->nr_uninterruptible++; - dequeue_task(p, p->array); + if (p->deadline && p->deadline > (deadline(p) - 2) && p->mm) + // maximum deadline rank limited while waiting on i/o. + p->deadline--; + } + dequeue_task(p, rq); p->array = NULL; } @@ -545,37 +375,30 @@ inline int task_curr(task_t *p) typedef struct { struct list_head list; task_t *task; + int dest_cpu; struct completion done; } migration_req_t; /* - * The task's runqueue lock must be held, and the new mask must be valid. + * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static int __set_cpus_allowed(task_t *p, cpumask_t new_mask, - migration_req_t *req) +static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) { runqueue_t *rq = task_rq(p); - p->cpus_allowed = new_mask; - /* - * Can the task run on the task's current CPU? If not then - * migrate the thread off to a proper CPU. - */ - if (cpu_isset(task_cpu(p), new_mask)) - return 0; - /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ if (!p->array && !task_running(rq, p)) { - set_task_cpu(p, any_online_cpu(p->cpus_allowed)); + set_task_cpu(p, dest_cpu); return 0; } init_completion(&req->done); req->task = p; + req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); return 1; } @@ -636,7 +459,76 @@ void kick_process(task_t *p) } EXPORT_SYMBOL_GPL(kick_process); +/* + * Return a low guess at the load of cpu. Update previous history if update + * is true + */ +static inline unsigned long get_low_cpu_load(int cpu, int update) +{ + runqueue_t *rq = cpu_rq(cpu); + runqueue_t *this_rq = this_rq(); + unsigned long nr = rq->nr_running << SCHED_LOAD_SHIFT; + unsigned long load = this_rq->cpu_load[cpu]; + unsigned long ret = min(nr, load); + + if (update) + this_rq->cpu_load[cpu] = (nr + load) / 2; + + return ret; +} + +static inline unsigned long get_high_cpu_load(int cpu, int update) +{ + runqueue_t *rq = cpu_rq(cpu); + runqueue_t *this_rq = this_rq(); + unsigned long nr = rq->nr_running << SCHED_LOAD_SHIFT; + unsigned long load = this_rq->cpu_load[cpu]; + unsigned long ret = max(nr, load); + + if (update) + this_rq->cpu_load[cpu] = (nr + load) / 2; + + return ret; +} + +#endif + +/* + * sched_balance_wake can be used with SMT architectures to wake a + * task onto an idle sibling if cpu is not idle. Returns cpu if + * cpu is idle or no siblings are idle, otherwise returns an idle + * sibling. + */ +#if defined(CONFIG_SMP) && defined(ARCH_HAS_SCHED_WAKE_BALANCE) +static int sched_balance_wake(int cpu, task_t *p) +{ + cpumask_t tmp; + struct sched_domain *domain; + int i; + + if (idle_cpu(cpu)) + return cpu; + + domain = cpu_sched_domain(cpu); + if (!(domain->flags & SD_FLAG_WAKE)) + return cpu; + + cpus_and(tmp, domain->span, cpu_online_map); + for_each_cpu_mask(i, tmp) { + if (!cpu_isset(i, p->cpus_allowed)) + continue; + if (idle_cpu(i)) + return i; + } + + return cpu; +} +#else +static inline int sched_balance_wake(int cpu, task_t *p) +{ + return cpu; +} #endif /*** @@ -659,44 +551,102 @@ static int try_to_wake_up(task_t * p, un int success = 0; long old_state; runqueue_t *rq; + int cpu, this_cpu; +#ifdef CONFIG_SMP + unsigned long long now; + unsigned long load, this_load; + int new_cpu; + struct sched_domain *sd; + runqueue_t *this_rq; +#endif -repeat_lock_task: rq = task_rq_lock(p, &flags); old_state = p->state; - if (old_state & state) { - if (!p->array) { - /* - * Fast-migrate the task if it's not running or runnable - * currently. Do not violate hard affinity. - */ - if (unlikely(sync && !task_running(rq, p) && - (task_cpu(p) != smp_processor_id()) && - cpu_isset(smp_processor_id(), - p->cpus_allowed))) { - - set_task_cpu(p, smp_processor_id()); - task_rq_unlock(rq, &flags); - goto repeat_lock_task; - } - if (old_state == TASK_UNINTERRUPTIBLE) { - rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } - if (sync && (task_cpu(p) == smp_processor_id())) - __activate_task(p, rq); - else { - activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - } - success = 1; + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +#ifdef CONFIG_SMP + if (cpu == this_cpu) + goto out_activate; + + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed) + || task_running(rq, p))) + goto out_activate; + + /* Passive load balancing */ + load = get_low_cpu_load(cpu, 1); + this_load = get_high_cpu_load(this_cpu, 1) + SCHED_LOAD_SCALE; + if (load > this_load) { + new_cpu = sched_balance_wake(this_cpu, p); + set_task_cpu(p, new_cpu); + goto repeat_lock_task; + } + + this_rq = this_rq(); + now = sched_clock(); + sd = cpu_sched_domain(this_cpu); + + /* + * Fast-migrate the task if it's not running or + * runnable currently. Do not violate hard affinity. + */ + do { + if (!(sd->flags & SD_FLAG_FASTMIGRATE)) + break; + if (now - p->timestamp < sd->cache_hot_time) + break; + + if (cpu_isset(cpu, sd->span)) { + new_cpu = sched_balance_wake(this_cpu, p); + set_task_cpu(p, new_cpu); + goto repeat_lock_task; } - p->state = TASK_RUNNING; + sd = sd->parent; + } while (sd); + + new_cpu = sched_balance_wake(cpu, p); + if (new_cpu != cpu) { + set_task_cpu(p, new_cpu); + goto repeat_lock_task; } + goto out_activate; + +repeat_lock_task: + task_rq_unlock(rq, &flags); + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +out_activate: +#endif /* CONFIG_SMP */ + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + if (sync && cpu == this_cpu) { + __activate_task(p, rq); + } else { + activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: task_rq_unlock(rq, &flags); return success; @@ -746,17 +696,19 @@ void fastcall sched_fork(task_t *p) */ local_irq_disable(); p->time_slice = (current->time_slice + 1) >> 1; + p->slice = (current->slice + 1) >> 1; /* * The remainder of the first timeslice might be recovered by * the parent if the child exits early enough. */ p->first_time_slice = 1; current->time_slice >>= 1; + current->slice >>= 1; p->timestamp = sched_clock(); if (!current->time_slice) { /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the * runqueue lock is not a problem. */ current->time_slice = 1; @@ -779,33 +731,20 @@ void fastcall wake_up_forked_process(tas unsigned long flags; runqueue_t *rq = task_rq_lock(current, &flags); + // Forked process gets a lower deadline rank to prevent fork bombs. + if (p->deadline) + p->deadline--; + + // Deadline rank on kernel threads is fixed at best. + if (unlikely(!p->mm)) + p->deadline = deadline(p); BUG_ON(p->state != TASK_RUNNING); - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. - */ - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->interactive_credit = 0; - - p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - nr_running_inc(rq); - } + p->prio = effective_prio(p); + p->runtime = 0; + __activate_task(p, rq); task_rq_unlock(rq, &flags); } @@ -826,19 +765,11 @@ void fastcall sched_exit(task_t * p) local_irq_save(flags); if (p->first_time_slice) { p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > MAX_TIMESLICE)) - p->parent->time_slice = MAX_TIMESLICE; + if (unlikely(p->parent->time_slice > slice(p->parent))) + p->parent->time_slice = slice(p->parent); } local_irq_restore(flags); - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ rq = task_rq_lock(p->parent, &flags); - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); task_rq_unlock(rq, &flags); } @@ -872,7 +803,7 @@ static inline void finish_task_switch(ta * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. - * Manfred Spraul + * Manfred Spraul */ prev_task_flags = prev->flags; finish_arch_switch(rq, prev); @@ -934,7 +865,7 @@ unsigned long nr_running(void) { unsigned long i, sum = 0; - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) sum += cpu_rq(i)->nr_running; return sum; @@ -1004,6 +935,14 @@ static inline void double_rq_unlock(runq spin_unlock(&rq2->lock); } +enum idle_type +{ + IDLE, + NOT_IDLE, + NEWLY_IDLE, +}; + +#ifdef CONFIG_SMP #ifdef CONFIG_NUMA /* * If dest_cpu is allowed for this process, migrate the task to it. @@ -1016,28 +955,18 @@ static void sched_migrate_task(task_t *p runqueue_t *rq; migration_req_t req; unsigned long flags; - cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu); rq = task_rq_lock(p, &flags); - old_mask = p->cpus_allowed; - if (!cpu_isset(dest_cpu, old_mask) || !cpu_online(dest_cpu)) + if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; /* force the process onto the specified CPU */ - if (__set_cpus_allowed(p, new_mask, &req)) { + if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); wait_for_completion(&req.done); - - /* If we raced with sys_sched_setaffinity, don't - * restore mask. */ - rq = task_rq_lock(p, &flags); - if (likely(cpus_equal(p->cpus_allowed, new_mask))) { - /* Restore old mask: won't need migration - * thread, since current cpu is allowed. */ - BUG_ON(__set_cpus_allowed(p, old_mask, NULL)); - } + return; } out: task_rq_unlock(rq, &flags); @@ -1047,404 +976,622 @@ out: * Find the least loaded CPU. Slightly favor the current CPU by * setting its runqueue length as the minimum to start. */ -static int sched_best_cpu(struct task_struct *p) +static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain) { - int i, minload, load, best_cpu, node = 0; - cpumask_t cpumask; + cpumask_t tmp; + int i, min_load, this_cpu, best_cpu; - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; + best_cpu = this_cpu = task_cpu(p); + min_load = INT_MAX; - minload = 10000000; - for_each_node_with_cpus(i) { - /* - * Node load is always divided by nr_cpus_node to normalise - * load values in case cpu count differs from node to node. - * We first multiply node_nr_running by 10 to get a little - * better resolution. - */ - load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i); - if (load < minload) { - minload = load; - node = i; - } - } + cpus_and(tmp, domain->span, cpu_online_map); + for_each_cpu_mask(i, tmp) { + unsigned long load; + if (i == this_cpu) + load = get_low_cpu_load(i, 0); + else + load = get_high_cpu_load(i, 0) + SCHED_LOAD_SCALE; - minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!cpu_isset(i, cpumask)) - continue; - if (cpu_rq(i)->nr_running < minload) { + if (min_load > load) { best_cpu = i; - minload = cpu_rq(i)->nr_running; + min_load = load; } + } return best_cpu; } void sched_balance_exec(void) { + struct sched_domain *domain = this_sched_domain(); int new_cpu; + int this_cpu = smp_processor_id(); + if (numnodes == 1) + return; - if (numnodes > 1) { - new_cpu = sched_best_cpu(current); - if (new_cpu != smp_processor_id()) - sched_migrate_task(current, new_cpu); - } -} + if (this_rq()->nr_running <= 1) + return; -/* - * Find the busiest node. All previous node loads contribute with a - * geometrically deccaying weight to the load measure: - * load_{t} = load_{t-1}/2 + nr_node_running_{t} - * This way sudden load peaks are flattened out a bit. - * Node load is divided by nr_cpus_node() in order to compare nodes - * of different cpu count but also [first] multiplied by 10 to - * provide better resolution. - */ -static int find_busiest_node(int this_node) -{ - int i, node = -1, load, this_load, maxload; - - if (!nr_cpus_node(this_node)) - return node; - this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1) - + (10 * atomic_read(&node_nr_running[this_node]) - / nr_cpus_node(this_node)); - this_rq()->prev_node_load[this_node] = this_load; - for_each_node_with_cpus(i) { - if (i == this_node) - continue; - load = (this_rq()->prev_node_load[i] >> 1) - + (10 * atomic_read(&node_nr_running[i]) - / nr_cpus_node(i)); - this_rq()->prev_node_load[i] = load; - if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) { - maxload = load; - node = i; - } + while (domain->parent && !(domain->flags & SD_FLAG_EXEC)) + domain = domain->parent; + + if (domain->flags & SD_FLAG_EXEC) { + new_cpu = sched_best_cpu(current, domain); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); } - return node; } - #endif /* CONFIG_NUMA */ -#ifdef CONFIG_SMP - /* - * double_lock_balance - lock the busiest runqueue - * - * this_rq is locked already. Recalculate nr_running if we have to - * drop the runqueue lock. + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. */ -static inline -unsigned int double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest, - int this_cpu, int idle, - unsigned int nr_running) +static inline void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) { if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); - /* Need to recalculate nr_running */ - if (idle || (this_rq->nr_running > - this_rq->prev_cpu_load[this_cpu])) - nr_running = this_rq->nr_running; - else - nr_running = this_rq->prev_cpu_load[this_cpu]; } else spin_lock(&busiest->lock); } - return nr_running; } /* - * find_busiest_queue - find the busiest runqueue among the cpus in cpumask. + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline void pull_task(runqueue_t *src_rq, task_t *p, + runqueue_t *this_rq, int this_cpu) +{ + dequeue_task(p, src_rq); + nr_running_dec(src_rq); + set_task_cpu(p, this_cpu); + nr_running_inc(this_rq); + enqueue_task(p, this_rq); + p->timestamp = sched_clock() - + (src_rq->timestamp_last_tick - p->timestamp); + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. + */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); +} + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static inline -runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, - int *imbalance, cpumask_t cpumask) +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, + struct sched_domain *domain, enum idle_type idle) +{ + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ + if (task_running(rq, p)) + return 0; + if (!cpu_isset(this_cpu, p->cpus_allowed)) + return 0; + + /* Aggressive migration if we've failed balancing */ + if (idle == NEWLY_IDLE || + domain->nr_balance_failed < domain->cache_nice_tries) { + if ((rq->timestamp_last_tick - p->timestamp) + < domain->cache_hot_time) + return 0; + } + + return 1; +} + +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *domain, + enum idle_type idle) +{ + int idx; + int pulled = 0; + prio_array_t* array, *dst_array; + struct list_head *head, *curr; + task_t *tmp; + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + array = &busiest->array; + dst_array = &this_rq->array; + + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) + goto out; + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, domain, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + pull_task(busiest, tmp, this_rq, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + return pulled; +} + +/* + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *domain, int this_cpu, + unsigned long *imbalance, enum idle_type idle) { - int nr_running, load, max_load, i; - runqueue_t *busiest, *rq_src; + unsigned long max_load, avg_load, total_load, this_load; + unsigned int total_pwr; + int modify; + struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups; + + max_load = 0; + this_load = 0; + total_load = 0; + total_pwr = 0; + + if (group == NULL) + goto out_balanced; /* - * We search all runqueues to find the most busy one. - * We do this lockless to reduce cache-bouncing overhead, - * we re-check the 'best' source CPU later on again, with - * the lock held. - * - * We fend off statistical fluctuations in runqueue lengths by - * saving the runqueue length (as seen by the balancing CPU) during - * the previous load-balancing operation and using the smaller one - * of the current and saved lengths. If a runqueue is long enough - * for a longer amount of time then we recognize it and pull tasks - * from it. - * - * The 'current runqueue length' is a statistical maximum variable, - * for that one we take the longer one - to avoid fluctuations in - * the other direction. So for a load-balance to happen it needs - * stable long runqueue on the target CPU and stable short runqueue - * on the local runqueue. - * - * We make an exception if this CPU is about to become idle - in - * that case we are less picky about moving a task across CPUs and - * take what can be taken. + * Don't modify when we newly become idle because that ruins our + * statistics: its triggered by some value of nr_running (ie. 0). + * Timer based balancing is a good statistic though. */ - if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) - nr_running = this_rq->nr_running; + if (idle == NEWLY_IDLE) + modify = 0; else - nr_running = this_rq->prev_cpu_load[this_cpu]; + modify = 1; - busiest = NULL; - max_load = 1; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpumask)) - continue; + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + local_group = cpu_isset(this_cpu, group->cpumask); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + /* Bias balancing toward cpus of our domain */ + if (local_group) { + load = get_high_cpu_load(i, modify); + } else + load = get_low_cpu_load(i, modify); - rq_src = cpu_rq(i); - if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i])) - load = rq_src->nr_running; - else - load = this_rq->prev_cpu_load[i]; - this_rq->prev_cpu_load[i] = rq_src->nr_running; + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + goto nextgroup; + } + if (avg_load > max_load) { + max_load = avg_load; + busiest = group; + } +nextgroup: + group = group->next; + } while (group != domain->groups); + + if (!busiest || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + if (idle == NOT_IDLE) { + if (this_load >= avg_load || + 100*max_load <= domain->imbalance_pct*this_load) + goto out_balanced; + } + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. + */ + *imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2; + + if (*imbalance <= SCHED_LOAD_SCALE/2) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now >>= SCHED_LOAD_SHIFT; + + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); + + /* Amount of load we'd add */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp); + pwr_move >>= SCHED_LOAD_SHIFT; + + /* Move if we gain another 8th of a CPU worth of throughput */ + if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + goto out_balanced; + *imbalance = 1; + return busiest; + } + + /* How many tasks to actually move to equalise the imbalance */ + *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) + >> SCHED_LOAD_SHIFT; + /* Get rid of the scaling factor, rounding *up* as we divide */ + *imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT; - if ((load > max_load) && (rq_src != this_rq)) { - busiest = rq_src; + return busiest; + +out_balanced: + if (busiest && idle == NEWLY_IDLE) { + *imbalance = 1; + return busiest; + } + + *imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. + */ +static runqueue_t *find_busiest_queue(struct sched_group *group) +{ + cpumask_t tmp; + int i; + unsigned long max_load = 0; + runqueue_t *busiest = NULL; + + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + unsigned long load; + + load = get_low_cpu_load(i, 0); + + if (load >= max_load) { max_load = load; + busiest = cpu_rq(i); } } - if (likely(!busiest)) - goto out; + return busiest; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called with this_rq unlocked. + */ +static int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *domain, enum idle_type idle) +{ + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int balanced = 0, failed = 0; + int nr_moved = 0; - *imbalance = max_load - nr_running; + spin_lock(&this_rq->lock); - /* It needs an at least ~25% imbalance to trigger balancing. */ - if (!idle && ((*imbalance)*4 < max_load)) { - busiest = NULL; + group = find_busiest_group(domain, this_cpu, &imbalance, idle); + if (!group) { + balanced = 1; goto out; } - nr_running = double_lock_balance(this_rq, busiest, this_cpu, - idle, nr_running); - /* - * Make sure nothing changed since we checked the - * runqueue length. - */ - if (busiest->nr_running <= nr_running) { + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) { + balanced = 1; + goto out; + } + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, domain, idle); + spin_unlock(&busiest->lock); +out: + spin_unlock(&this_rq->lock); + + if (!balanced && nr_moved == 0) + failed = 1; + + if (failed && busiest && + domain->nr_balance_failed > domain->cache_nice_tries) { + int wake = 0; + + spin_lock(&busiest->lock); + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + wake = 1; + } spin_unlock(&busiest->lock); - busiest = NULL; + if (wake) + wake_up_process(busiest->migration_thread); } + + if (failed) + domain->nr_balance_failed++; + else + domain->nr_balance_failed = 0; + + if (balanced) { + if (domain->balance_interval < domain->max_interval) + domain->balance_interval *= 2; + } else { + domain->balance_interval = domain->min_interval; + } + + return nr_moved; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * this_rq is locked. + */ +static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *domain) +{ + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int nr_moved = 0; + + group = find_busiest_group(domain, this_cpu, &imbalance, NEWLY_IDLE); + if (!group) + goto out; + + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) + goto out; + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, domain, NEWLY_IDLE); + + spin_unlock(&busiest->lock); + out: - return busiest; + return nr_moved; } /* - * pull_task - move a task from a remote runqueue to the local runqueue. - * Both runqueues must be locked. + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. */ -static inline -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, int this_cpu) +static inline void idle_balance(int this_cpu, runqueue_t *this_rq) { - dequeue_task(p, src_array); - nr_running_dec(src_rq); - set_task_cpu(p, this_cpu); - nr_running_inc(this_rq); - enqueue_task(p, this_rq->active); - p->timestamp = sched_clock() - - (src_rq->timestamp_last_tick - p->timestamp); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ - if (TASK_PREEMPTS_CURR(p, this_rq)) - set_need_resched(); + struct sched_domain *domain = this_sched_domain(); + + do { + if (unlikely(!domain->groups)) + /* hasn't been setup yet */ + break; + + if (domain->flags & SD_FLAG_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, domain)) { + /* We've pulled tasks over so stop searching */ + break; + } + } + + domain = domain->parent; + } while (domain); } /* - * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + * active_load_balance is run by migration threads. It pushes a running + * task off the cpu. It can be required to correctly have at least 1 task + * running on each physical CPU where possible, and not have a physical / + * logical imbalance. + * + * Called with busiest locked. */ -static inline -int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle) +static void active_load_balance(runqueue_t *busiest, int busiest_cpu) { - unsigned long delta = rq->timestamp_last_tick - tsk->timestamp; - - /* - * We do not migrate tasks that are: - * 1) running (obviously), or - * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. - */ - if (task_running(rq, tsk)) - return 0; - if (!cpu_isset(this_cpu, tsk->cpus_allowed)) - return 0; - if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks))) - return 0; - return 1; -} - -/* - * Current runqueue is empty, or rebalance tick: if there is an - * inbalance (current runqueue is too short) then pull from - * busiest runqueue(s). - * - * We call this with the current runqueue locked, - * irqs disabled. - */ -static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask) -{ - int imbalance, idx, this_cpu = smp_processor_id(); - runqueue_t *busiest; - prio_array_t *array; - struct list_head *head, *curr; - task_t *tmp; + int i; + struct sched_domain *sd = cpu_sched_domain(busiest_cpu); + struct sched_group *group, *busy_group; - busiest = find_busiest_queue(this_rq, this_cpu, idle, - &imbalance, cpumask); - if (!busiest) - goto out; + if (busiest->nr_running <= 1) + return; - /* - * We only want to steal a number of tasks equal to 1/2 the imbalance, - * otherwise we'll just shift the imbalance to the new queue: - */ - imbalance /= 2; + /* sd->parent should never cause a NULL dereference, if it did so, + * then push_cpu was set to a buggy value */ + while (!cpu_isset(busiest->push_cpu, sd->span)) { + sd = sd->parent; + if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) { + WARN_ON(1); + return; + } + } - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) - array = busiest->expired; - else - array = busiest->active; + if (!sd->groups) { + WARN_ON(1); + return; + } -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired) { - array = busiest->active; - goto new_array; + group = sd->groups; + while (!cpu_isset(busiest_cpu, group->cpumask)) { + group = group->next; + if (group == sd->groups) { + WARN_ON(1); + return; } - goto out_unlock; } + busy_group = group; - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, task_t, run_list); + group = sd->groups; + do { + cpumask_t tmp; + runqueue_t *rq; + int push_cpu = 0, nr = 0; - curr = curr->prev; + if (group == busy_group) + goto next_group; - if (!can_migrate_task(tmp, busiest, this_cpu, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - pull_task(busiest, array, tmp, this_rq, this_cpu); + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + if (!idle_cpu(i)) + goto next_group; + push_cpu = i; + nr++; + } + if (nr == 0) + goto next_group; - /* Only migrate one task if we are idle */ - if (!idle && --imbalance) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } -out_unlock: - spin_unlock(&busiest->lock); -out: - ; + rq = cpu_rq(push_cpu); + double_lock_balance(busiest, rq); + move_tasks(rq, push_cpu, busiest, 1, sd, IDLE); + spin_unlock(&rq->lock); +next_group: + group = group->next; + } while (group != sd->groups); } /* - * One of the idle_cpu_tick() and busy_cpu_tick() functions will - * get called every timer tick, on every CPU. Our balancing action - * frequency and balancing agressivity depends on whether the CPU is - * idle or not. + * rebalance_tick will get called every timer tick, on every CPU. * - * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on - * systems with HZ=100, every 10 msecs.) + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. * - * On NUMA, do a node-rebalance every 400 msecs. + * Balancing parameters are set up in arch_init_sched_domains. */ -#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) -#define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) -#ifdef CONFIG_NUMA -static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) +/* Don't have all balancing operations going off at once */ +#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + +static void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) { - int node = find_busiest_node(cpu_to_node(this_cpu)); + unsigned long j = jiffies + CPU_OFFSET(this_cpu); + struct sched_domain *domain = this_sched_domain(); - if (node >= 0) { - cpumask_t cpumask = node_to_cpumask(node); - cpu_set(this_cpu, cpumask); - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpumask); - spin_unlock(&this_rq->lock); - } -} -#endif + /* Run through all this CPU's domains */ + do { + unsigned long interval; -static void rebalance_tick(runqueue_t *this_rq, int idle) -{ -#ifdef CONFIG_NUMA - int this_cpu = smp_processor_id(); -#endif - unsigned long j = jiffies; + if (unlikely(!domain->groups)) + break; - /* - * First do inter-node rebalancing, then intra-node rebalancing, - * if both events happen in the same tick. The inter-node - * rebalancing does not necessarily have to create a perfect - * balance within the node, since we load-balance the most loaded - * node with the current CPU. (ie. other CPUs in the local node - * are not balanced.) - */ - if (idle) { -#ifdef CONFIG_NUMA - if (!(j % IDLE_NODE_REBALANCE_TICK)) - balance_node(this_rq, idle, this_cpu); -#endif - if (!(j % IDLE_REBALANCE_TICK)) { - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); - spin_unlock(&this_rq->lock); + interval = domain->balance_interval; + if (idle != IDLE) + interval *= domain->busy_factor; + + /* scale ms to jiffies */ + interval = interval * HZ / 1000; + if (unlikely(interval == 0)) + interval = 1; + + if (j - domain->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, domain, idle)) { + /* We've pulled tasks over so no longer idle */ + idle = NOT_IDLE; + } + domain->last_balance += interval; } - return; - } -#ifdef CONFIG_NUMA - if (!(j % BUSY_NODE_REBALANCE_TICK)) - balance_node(this_rq, idle, this_cpu); -#endif - if (!(j % BUSY_REBALANCE_TICK)) { - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); - spin_unlock(&this_rq->lock); - } + + domain = domain->parent; + } while (domain); } #else /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(runqueue_t *this_rq, int idle) +static inline void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) +{ +} +#endif + +#ifdef CONFIG_SCHED_SMT +static inline int wake_priority_sleeper(runqueue_t *rq) +{ /* + * If an SMT sibling task has been put to sleep for priority + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { + resched_task(rq->idle); + return 1; + } + return 0; +} +#else +static inline int wake_priority_sleeper(runqueue_t *rq) { + return 0; } #endif @@ -1452,21 +1599,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat EXPORT_PER_CPU_SYMBOL(kstat); -/* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) /* * This function gets called by the timer code, with HZ frequency. @@ -1501,7 +1633,9 @@ void scheduler_tick(int user_ticks, int cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; - rebalance_tick(rq, 1); + if (wake_priority_sleeper(rq)) + goto out; + rebalance_tick(cpu, rq, IDLE); return; } if (TASK_NICE(p) > 0) @@ -1511,83 +1645,137 @@ void scheduler_tick(int user_ticks, int cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { + if (p->array != &rq->array) { set_tsk_need_resched(p); goto out; } spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (unlikely(rt_task(p))) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); - } + + // SCHED_FIFO tasks never run out of timeslice. + if (unlikely(p->policy == SCHED_FIFO)) + goto out_unlock; + // Tasks lose a deadline rank each time they use up a full slice(). + if (!--p->slice) { + set_tsk_need_resched(p); + dequeue_task(p, rq); + if (p->deadline && p->mm) + p->deadline--; + p->slice = slice(p); + p->prio = effective_prio(p); + p->time_slice = first_time_slice(p); + enqueue_task(p, rq); + p->first_time_slice = 0; goto out_unlock; } + /* + * Tasks that run out of time_slice but still have slice left get + * requeued with a lower priority && RR_INTERVAL time_slice. + */ if (!--p->time_slice) { - dequeue_task(p, rq->active); set_tsk_need_resched(p); + dequeue_task(p, rq); p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - enqueue_task(p, rq->active); - } + p->time_slice = RR_INTERVAL; + enqueue_task(p, rq); + goto out_unlock; + } + // All tasks within a priority level round robin at RR_INTERVAL. + if (!(p->slice % RR_INTERVAL)) { + set_tsk_need_resched(p); + dequeue_task(p, rq); + enqueue_task(p, rq); } out_unlock: spin_unlock(&rq->lock); out: - rebalance_tick(rq, 0); + rebalance_tick(cpu, rq, NOT_IDLE); +} + +#ifdef CONFIG_SCHED_SMT +static inline void wake_sleeping_dependent(runqueue_t *rq) +{ + int i, this_cpu = rq->cpu; + struct sched_domain *sd = cpu_sched_domain(this_cpu); + cpumask_t sibling_map; + + if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { + /* Not SMT */ + return; + } + + cpus_and(sibling_map, sd->span, cpu_online_map); + cpu_clear(this_cpu, sibling_map); + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq; + + smt_rq = cpu_rq(i); + + /* + * If an SMT sibling task is sleeping due to priority + * reasons wake it up now. + */ + if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) + resched_task(smt_rq->idle); + } } +static inline int dependent_sleeper(runqueue_t *rq, task_t *p) +{ + int ret = 0, i, this_cpu = rq->cpu; + struct sched_domain *sd = cpu_sched_domain(this_cpu); + cpumask_t sibling_map; + + if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { + /* Not SMT */ + return 0; + } + + cpus_and(sibling_map, sd->span, cpu_online_map); + cpu_clear(this_cpu, sibling_map); + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq; + task_t *smt_curr; + + smt_rq = cpu_rq(i); + smt_curr = smt_rq->curr; + + /* + * If a user task with lower static priority than the + * running task on the SMT sibling is trying to schedule, + * delay it till there is proportionately less timeslice + * left of the sibling task to prevent a lower priority + * task from using an unfair proportion of the + * physical cpu's resources. -ck + */ + if (((smt_curr->slice * (100 - sd->per_cpu_gain) / 100) > + slice(p) || rt_task(smt_curr)) && + p->mm && smt_curr->mm && !rt_task(p)) + ret |= 1; + + /* + * Reschedule a lower priority task on the SMT sibling, + * or wake it up if it has been put to sleep for priority + * reasons. + */ + if ((((p->slice * (100 - sd->per_cpu_gain) / 100) > + slice(smt_curr) || rt_task(p)) && + smt_curr->mm && p->mm && !rt_task(smt_curr)) || + (smt_curr == smt_rq->idle && smt_rq->nr_running)) + resched_task(smt_curr); + } + return ret; +} +#else +static inline void wake_sleeping_dependent(runqueue_t *rq) +{ +} + +static inline int dependent_sleeper(runqueue_t *rq, task_t *p) +{ + return 0; +} +#endif + void scheduling_functions_start_here(void) { } /* @@ -1598,7 +1786,7 @@ asmlinkage void schedule(void) long *switch_count; task_t *prev, *next; runqueue_t *rq; - prio_array_t *array; + prio_array_t* array; struct list_head *queue; unsigned long long now; unsigned long run_time; @@ -1623,18 +1811,7 @@ need_resched: release_kernel_lock(prev); now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; - else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks with interactive credits get charged less run_time - * at high sleep_avg to delay them losing their interactive - * status - */ - if (HIGH_CREDIT(prev)) - run_time /= (CURRENT_BONUS(prev) ? : 1); + run_time = now - prev->timestamp; spin_lock_irq(&rq->lock); @@ -1654,56 +1831,37 @@ need_resched: if (unlikely(!rq->nr_running)) { #ifdef CONFIG_SMP - load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); + idle_balance(smp_processor_id(), rq); #endif if (!rq->nr_running) { next = rq->idle; - rq->expired_timestamp = 0; + wake_sleeping_dependent(rq); goto switch_tasks; } } - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. - */ - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } - + array = &rq->array; idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); - - if (next->activated > 0) { - unsigned long long delta = now - next->timestamp; - - if (next->activated == 1) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); - } - next->activated = 0; + if (dependent_sleeper(rq, next)) + next = rq->idle; switch_tasks: + if (!NS_TO_JIFFIES(run_time)) + // Keeps track of tasks that run less than one tick + prev->runtime += run_time; + else { + if (run_time > prev->runtime) + prev->runtime = 0; + else + prev->runtime -= run_time; + } + prev->timestamp = now; + prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; - - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) { - prev->sleep_avg = 0; - if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) - prev->interactive_credit--; - } - prev->timestamp = now; - + if (likely(prev != next)) { next->timestamp = now; rq->nr_switches++; @@ -1969,7 +2127,7 @@ void scheduling_functions_end_here(void) void set_user_nice(task_t *p, long nice) { unsigned long flags; - prio_array_t *array; + prio_array_t* array; runqueue_t *rq; int old_prio, new_prio, delta; @@ -1992,7 +2150,7 @@ void set_user_nice(task_t *p, long nice) } array = p->array; if (array) - dequeue_task(p, array); + dequeue_task(p, rq); old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); @@ -2001,7 +2159,7 @@ void set_user_nice(task_t *p, long nice) p->prio += delta; if (array) { - enqueue_task(p, array); + enqueue_task(p, rq); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -2111,7 +2269,7 @@ static int setscheduler(pid_t pid, int p struct sched_param lp; int retval = -EINVAL; int oldprio; - prio_array_t *array; + prio_array_t* array; unsigned long flags; runqueue_t *rq; task_t *p; @@ -2380,29 +2538,26 @@ out_unlock: /** * sys_sched_yield - yield the current processor to other threads. * - * this function yields the current CPU by moving the calling thread - * to the expired array. If there are no other threads running on this + * this function yields the current CPU by transiently requeuing + * at lower priority. If there are no other threads running on this * CPU then this function will return. */ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) - */ + dequeue_task(current, rq); if (likely(!rt_task(current))) { - dequeue_task(current, array); - enqueue_task(current, rq->expired); - } else { - list_del(¤t->run_list); - list_add_tail(¤t->run_list, array->queue + current->prio); - } + current->prio++; + if (current->prio >= MAX_PRIO) + current->prio = MAX_PRIO - 1; + if (current->deadline < deadline(current)) + current->deadline++; + } + current->slice = slice(current); + current->time_slice = current->slice; + enqueue_task(current, rq); + /* * Since we are going to call schedule() anyway, there's * no need to preempt: @@ -2540,7 +2695,7 @@ long sys_sched_rr_get_interval(pid_t pid goto out_unlock; jiffies_to_timespec(p->policy & SCHED_FIFO ? - 0 : task_timeslice(p), &t); + 0 : slice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -2661,6 +2816,7 @@ void __init init_idle(task_t *idle, int idle->array = NULL; idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; + idle->deadline = 0; set_task_cpu(idle, cpu); double_rq_unlock(idle_rq, rq); set_tsk_need_resched(idle); @@ -2713,7 +2869,12 @@ int set_cpus_allowed(task_t *p, cpumask_ goto out; } - if (__set_cpus_allowed(p, new_mask, &req)) { + p->cpus_allowed = new_mask; + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpu_isset(task_cpu(p), new_mask)) + goto out; + + if (migrate_task(p, any_online_cpu(new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -2727,8 +2888,16 @@ out: EXPORT_SYMBOL_GPL(set_cpus_allowed); -/* Move (not current) task off this cpu, onto dest cpu. */ -static void move_task_away(struct task_struct *p, int dest_cpu) +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_balance_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static void __migrate_task(struct task_struct *p, int dest_cpu) { runqueue_t *rq_dest; unsigned long flags; @@ -2737,14 +2906,18 @@ static void move_task_away(struct task_s local_irq_save(flags); double_rq_lock(this_rq(), rq_dest); + /* Already moved. */ if (task_cpu(p) != smp_processor_id()) - goto out; /* Already moved */ + goto out; + /* Affinity changed (again). */ + if (!cpu_isset(dest_cpu, p->cpus_allowed)) + goto out; set_task_cpu(p, dest_cpu); if (p->array) { deactivate_task(p, this_rq()); activate_task(p, rq_dest); - if (p->prio < rq_dest->curr->prio) + if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } p->timestamp = rq_dest->timestamp_last_tick; @@ -2781,7 +2954,13 @@ static int migration_thread(void * data) refrigerator(PF_IOTHREAD); spin_lock_irq(&rq->lock); + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } + head = &rq->migration_queue; + current->state = TASK_INTERRUPTIBLE; if (list_empty(head)) { spin_unlock_irq(&rq->lock); @@ -2792,8 +2971,7 @@ static int migration_thread(void * data) list_del_init(head->next); spin_unlock_irq(&rq->lock); - move_task_away(req->task, - any_online_cpu(req->task->cpus_allowed)); + __migrate_task(req->task, req->dest_cpu); complete(&req->done); } return 0; @@ -2860,33 +3038,236 @@ int __init migration_init(void) spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL(kernel_flag); +#ifdef CONFIG_SMP +#ifdef ARCH_HAS_SCHED_DOMAIN +extern void __init arch_init_sched_domains(void); +#else +static struct sched_group sched_group_cpus[NR_CPUS]; +#ifdef CONFIG_NUMA +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +DEFINE_PER_CPU(struct sched_domain, node_domains); +static void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_node = NULL, *last_node = NULL; + + /* Set up domains */ + for_each_cpu(i) { + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + struct sched_domain *cpu_domain = cpu_sched_domain(i); + + *node_domain = SD_NODE_INIT; + node_domain->span = cpu_possible_map; + + *cpu_domain = SD_CPU_INIT; + cpus_and(cpu_domain->span, nodemask, cpu_possible_map); + cpu_domain->parent = node_domain; + } + + /* Set up groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + int j; + cpumask_t nodemask; + struct sched_group *node = &sched_group_nodes[i]; + cpumask_t tmp = node_to_cpumask(i); + + cpus_and(nodemask, tmp, cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + node->cpumask = nodemask; + node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask); + + for_each_cpu_mask(j, node->cpumask) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + if (!first_node) + first_node = node; + if (last_node) + last_node->next = node; + last_node = node; + } + last_node->next = first_node; + + mb(); + for_each_cpu(i) { + struct sched_domain *node_domain = &per_cpu(node_domains, i); + struct sched_domain *cpu_domain = cpu_sched_domain(i); + node_domain->groups = &sched_group_nodes[cpu_to_node(i)]; + cpu_domain->groups = &sched_group_cpus[i]; + } +} + +#else /* CONFIG_NUMA */ +static void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + + *cpu_domain = SD_CPU_INIT; + cpu_domain->span = cpu_possible_map; + } + + /* Set up CPU groups */ + for_each_cpu_mask(i, cpu_possible_map) { + struct sched_group *cpu = &sched_group_cpus[i]; + + cpus_clear(cpu->cpumask); + cpu_set(i, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + cpu_domain->groups = &sched_group_cpus[i]; + } +} + +#endif /* CONFIG_NUMA */ +#endif /* ARCH_HAS_SCHED_DOMAIN */ + +#undef SCHED_DOMAIN_DEBUG +#ifdef SCHED_DOMAIN_DEBUG +void sched_domain_debug(void) +{ + int i; + + for_each_cpu(i) { + int level = 0; + struct sched_domain *cpu_domain = cpu_sched_domain(i); + + printk(KERN_DEBUG "CPU%d: %s\n", + i, (cpu_online(i) ? " online" : "offline")); + + do { + int j; + char str[NR_CPUS]; + struct sched_group *group = cpu_domain->groups; + cpumask_t groupmask, tmp; + + cpumask_snprintf(str, NR_CPUS, cpu_domain->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG); + for (j = 0; j < level + 1; j++) + printk(" "); + printk("domain %d: span %s\n", level, str); + + if (!cpu_isset(i, cpu_domain->span)) + printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); + if (!cpu_isset(i, group->cpumask)) + printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); + + printk(KERN_DEBUG); + for (j = 0; j < level + 2; j++) + printk(" "); + printk("groups:"); + do { + if (group == NULL) { + printk(" ERROR: NULL"); + break; + } + + if (cpus_weight(group->cpumask) == 0) + printk(" ERROR empty group:"); + + cpus_and(tmp, groupmask, group->cpumask); + if (cpus_weight(tmp) > 0) + printk(" ERROR repeated CPUs:"); + + cpus_or(groupmask, groupmask, group->cpumask); + + cpumask_snprintf(str, NR_CPUS, group->cpumask); + printk(" %s", str); + + group = group->next; + } while (group != cpu_domain->groups); + printk("\n"); + + if (!cpus_equal(cpu_domain->span, groupmask)) + printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); + + level++; + cpu_domain = cpu_domain->parent; + + if (cpu_domain) { + cpus_and(tmp, groupmask, cpu_domain->span); + if (!cpus_equal(tmp, groupmask)) + printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); + } + + } while (cpu_domain); + } +} +#else +#define sched_domain_debug() {} +#endif + +void __init sched_init_smp(void) +{ + arch_init_sched_domains(); + sched_domain_debug(); +} +#else +void __init sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + void __init sched_init(void) { runqueue_t *rq; - int i, j, k; + int i, k; for (i = 0; i < NR_CPUS; i++) { - prio_array_t *array; - + prio_array_t* array; +#ifdef CONFIG_SMP + struct sched_domain *domain; + domain = cpu_sched_domain(i); + memset(domain, 0, sizeof(struct sched_domain)); +#endif rq = cpu_rq(i); - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; - + rq->cpu = i; + spin_lock_init(&rq->lock); INIT_LIST_HEAD(&rq->migration_queue); atomic_set(&rq->nr_iowait, 0); - nr_running_init(rq); - - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); + array = &rq->array; + + for (k = 0; k <= MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); } /* * We have to do a little magic to get the first