diff -Naurp linux-2.6.0-test4/arch/i386/kernel/smpboot.c linux-2.6.0-test4-O20/arch/i386/kernel/smpboot.c --- linux-2.6.0-test4/arch/i386/kernel/smpboot.c 2003-08-25 19:48:52.000000000 +1000 +++ linux-2.6.0-test4-O20/arch/i386/kernel/smpboot.c 2003-09-04 00:16:07.000000000 +1000 @@ -915,13 +915,13 @@ static void smp_tune_scheduling (void) cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; } - cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; + cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1; printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", (long)cacheflush_time/(cpu_khz/1000), ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); printk("task migration cache decay timeout: %ld msecs.\n", - (cache_decay_ticks + 1) * 1000 / HZ); + cache_decay_ticks); } /* diff -Naurp linux-2.6.0-test4/arch/i386/kernel/timers/timer_tsc.c linux-2.6.0-test4-O20/arch/i386/kernel/timers/timer_tsc.c --- linux-2.6.0-test4/arch/i386/kernel/timers/timer_tsc.c 2003-08-25 19:48:52.000000000 +1000 +++ linux-2.6.0-test4-O20/arch/i386/kernel/timers/timer_tsc.c 2003-09-04 00:16:07.000000000 +1000 @@ -116,6 +116,24 @@ static unsigned long long monotonic_cloc return base + cycles_2_ns(this_offset - last_offset); } +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long sched_clock(void) +{ + unsigned long long this_offset; + + if (unlikely(!cpu_has_tsc)) + return (unsigned long long)jiffies * (1000000000 / HZ); + + /* Read the Time Stamp Counter */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + + static void mark_offset_tsc(void) { unsigned long lost,delay; diff -Naurp linux-2.6.0-test4/arch/ia64/kernel/time.c linux-2.6.0-test4-O20/arch/ia64/kernel/time.c --- linux-2.6.0-test4/arch/ia64/kernel/time.c 2003-08-25 19:48:53.000000000 +1000 +++ linux-2.6.0-test4-O20/arch/ia64/kernel/time.c 2003-09-04 00:16:07.000000000 +1000 @@ -61,6 +61,14 @@ do_profile (unsigned long ip) atomic_inc((atomic_t *) &prof_buffer[ip]); } +unsigned long long +sched_clock (void) +{ + unsigned long offset = ia64_get_itc(); + + return (offset * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT; +} + static void itc_reset (void) { diff -Naurp linux-2.6.0-test4/fs/proc/array.c linux-2.6.0-test4-O20/fs/proc/array.c --- linux-2.6.0-test4/fs/proc/array.c 2003-08-25 19:48:46.000000000 +1000 +++ linux-2.6.0-test4-O20/fs/proc/array.c 2003-09-04 00:16:07.000000000 +1000 @@ -154,13 +154,16 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" + "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" "TracerPid:\t%d\n" "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", - get_task_state(p), p->tgid, + get_task_state(p), + (p->sleep_avg/1024)*100/(1000000000/1024), + p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, p->uid, p->euid, p->suid, p->fsuid, diff -Naurp linux-2.6.0-test4/include/linux/sched.h linux-2.6.0-test4-O20/include/linux/sched.h --- linux-2.6.0-test4/include/linux/sched.h 2003-08-25 19:49:33.000000000 +1000 +++ linux-2.6.0-test4-O20/include/linux/sched.h 2003-09-04 00:16:07.000000000 +1000 @@ -340,7 +340,9 @@ struct task_struct { prio_array_t *array; unsigned long sleep_avg; - unsigned long last_run; + long interactive_credit; + unsigned long long timestamp; + int activated; unsigned long policy; cpumask_t cpus_allowed; @@ -498,6 +500,8 @@ static inline int set_cpus_allowed(task_ } #endif +extern unsigned long long sched_clock(void); + #ifdef CONFIG_NUMA extern void sched_balance_exec(void); extern void node_nr_running_init(void); diff -Naurp linux-2.6.0-test4/kernel/fork.c linux-2.6.0-test4-O20/kernel/fork.c --- linux-2.6.0-test4/kernel/fork.c 2003-08-25 19:49:29.000000000 +1000 +++ linux-2.6.0-test4-O20/kernel/fork.c 2003-09-04 00:16:07.000000000 +1000 @@ -924,7 +924,7 @@ struct task_struct *copy_process(unsigne */ p->first_time_slice = 1; current->time_slice >>= 1; - p->last_run = jiffies; + p->timestamp = sched_clock(); if (!current->time_slice) { /* * This case is rare, it happens when the parent has only diff -Naurp linux-2.6.0-test4/kernel/sched.c linux-2.6.0-test4-O20/kernel/sched.c --- linux-2.6.0-test4/kernel/sched.c 2003-08-25 19:49:29.000000000 +1000 +++ linux-2.6.0-test4-O20/kernel/sched.c 2003-09-04 00:16:17.000000000 +1000 @@ -58,6 +58,14 @@ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\ + (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1))) + +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) /* * These are the 'tuning knobs' of the scheduler: @@ -68,14 +76,18 @@ */ #define MIN_TIMESLICE ( 10 * HZ / 1000) #define MAX_TIMESLICE (200 * HZ / 1000) -#define CHILD_PENALTY 50 +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 #define PARENT_PENALTY 100 #define EXIT_WEIGHT 3 #define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) #define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (10*HZ) +#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) #define NODE_THRESHOLD 125 +#define CREDIT_LIMIT 100 /* * If a task is 'interactive' then we reinsert it in the active @@ -105,6 +117,19 @@ * too hard. */ +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) + +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) \ + (MIN_TIMESLICE * (1 << (MAX_BONUS - CURRENT_BONUS(p))) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) \ + (MIN_TIMESLICE * (1 << (MAX_BONUS - CURRENT_BONUS(p)))) +#endif + #define SCALE(v1,v1_max,v2_max) \ (v1) * (v2_max) / (v1_max) @@ -115,6 +140,19 @@ #define TASK_INTERACTIVE(p) \ ((p)->prio <= (p)->static_prio - DELTA(p)) +#define JUST_INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) + +#define HIGH_CREDIT(p) \ + ((p)->interactive_credit > CREDIT_LIMIT) + +#define LOW_CREDIT(p) \ + ((p)->interactive_credit < -CREDIT_LIMIT) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + /* * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] * to time slice values. @@ -319,8 +357,7 @@ static int effective_prio(task_t *p) if (rt_task(p)) return p->prio; - bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 - - MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; if (prio < MAX_RT_PRIO) @@ -339,6 +376,82 @@ static inline void __activate_task(task_ nr_running_inc(rq); } +static void recalc_task_prio(task_t *p, unsigned long long now) +{ + unsigned long long __sleep_time = now - p->timestamp; + unsigned long sleep_time; + + if (__sleep_time > NS_MAX_SLEEP_AVG) + sleep_time = NS_MAX_SLEEP_AVG; + else + sleep_time = (unsigned long)__sleep_time; + + if (likely(sleep_time > 0)) { + /* + * User tasks that sleep a long time are categorised as + * idle and will get just interactive status to stay active & + * prevent them suddenly becoming cpu hogs and starving + * other processes. + */ + if (p->mm && p->activated != -1 && + sleep_time > JUST_INTERACTIVE_SLEEP(p)){ + p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + AVG_TIMESLICE); + if (!HIGH_CREDIT(p)) + p->interactive_credit++; + } else { + /* + * The lower the sleep avg a task has the more + * rapidly it will rise with sleep time. + */ + sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; + + /* + * Tasks with low interactive_credit are limited to + * one timeslice worth of sleep avg bonus. + */ + if (LOW_CREDIT(p) && + sleep_time > JIFFIES_TO_NS(task_timeslice(p))) + sleep_time = + JIFFIES_TO_NS(task_timeslice(p)); + + /* + * Non high_credit tasks waking from uninterruptible + * sleep are limited in their sleep_avg rise as they + * are likely to be cpu hogs waiting on I/O + */ + if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm){ + if (p->sleep_avg >= JUST_INTERACTIVE_SLEEP(p)) + sleep_time = 0; + else if (p->sleep_avg + sleep_time >= + JUST_INTERACTIVE_SLEEP(p)){ + p->sleep_avg = + JUST_INTERACTIVE_SLEEP(p); + sleep_time = 0; + } + } + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a task + * spends sleeping, the higher the average gets - and the + * higher the priority boost gets as well. + */ + p->sleep_avg += sleep_time; + + if (p->sleep_avg > NS_MAX_SLEEP_AVG){ + p->sleep_avg = NS_MAX_SLEEP_AVG; + if (!HIGH_CREDIT(p)) + p->interactive_credit++; + } + } + } + + p->prio = effective_prio(p); +} + /* * activate_task - move a task to the runqueue and do priority recalculation * @@ -347,34 +460,33 @@ static inline void __activate_task(task_ */ static inline void activate_task(task_t *p, runqueue_t *rq) { - long sleep_time = jiffies - p->last_run - 1; + unsigned long long now = sched_clock(); - if (sleep_time > 0) { - int sleep_avg; + recalc_task_prio(p, now); + /* + * This checks to make sure it's not an uninterruptible task + * that is now waking up. + */ + if (!p->activated){ /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->last_run. The more time a task - * spends sleeping, the higher the average gets - and the - * higher the priority boost gets as well. + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: */ - sleep_avg = p->sleep_avg + sleep_time; - + if (in_interrupt()) + p->activated = 2; + else /* - * 'Overflow' bonus ticks go to the waker as well, so the - * ticks are not lost. This has the effect of further - * boosting tasks that are related to maximum-interactive - * tasks. + * Normal first-time wakeups get a credit too for on-runqueue + * time, but it will be weighted down: */ - if (sleep_avg > MAX_SLEEP_AVG) - sleep_avg = MAX_SLEEP_AVG; - if (p->sleep_avg != sleep_avg) { - p->sleep_avg = sleep_avg; - p->prio = effective_prio(p); + p->activated = 1; } - } + p->timestamp = now; + __activate_task(p, rq); } @@ -495,13 +607,19 @@ repeat_lock_task: task_rq_unlock(rq, &flags); goto repeat_lock_task; } - if (old_state == TASK_UNINTERRUPTIBLE) + if (old_state == TASK_UNINTERRUPTIBLE){ rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->activated = -1; + } if (sync) __activate_task(p, rq); else { activate_task(p, rq); - if (p->prio < rq->curr->prio) + if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } success = 1; @@ -550,8 +668,14 @@ void wake_up_forked_process(task_t * p) * and children as well, to keep max-interactive tasks * from forking tasks that are max-interactive. */ - current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100; - p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->interactive_credit = 0; + p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); @@ -592,8 +716,9 @@ void sched_exit(task_t * p) * the sleep_avg of the parent as well. */ if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT + - p->sleep_avg) / (EXIT_WEIGHT + 1); + p->parent->sleep_avg = p->parent->sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + (EXIT_WEIGHT + 1); } /** @@ -995,13 +1120,31 @@ static inline void pull_task(runqueue_t * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - if (p->prio < this_rq->curr->prio) + if (TASK_PREEMPTS_CURR(p, this_rq)) set_need_resched(); - else { - if (p->prio == this_rq->curr->prio && - p->time_slice > this_rq->curr->time_slice) - set_need_resched(); - } +} + +/* + * Previously: + * + * #define CAN_MIGRATE_TASK(p,rq,this_cpu) \ + * ((!idle || (NS_TO_JIFFIES(now - (p)->timestamp) > \ + * cache_decay_ticks)) && !task_running(rq, p) && \ + * cpu_isset(this_cpu, (p)->cpus_allowed)) + */ + +static inline int +can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle) +{ + unsigned long delta = sched_clock() - tsk->timestamp; + + if (idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks))) + return 0; + if (task_running(rq, tsk)) + return 0; + if (!cpu_isset(this_cpu, tsk->cpus_allowed)) + return 0; + return 1; } /* @@ -1063,14 +1206,9 @@ skip_queue: * 3) are cache-hot on their current CPU. */ -#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ - ((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \ - !task_running(rq, p) && \ - cpu_isset(this_cpu, (p)->cpus_allowed)) - curr = curr->prev; - if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { + if (!can_migrate_task(tmp, busiest, this_cpu, idle)) { if (curr != head) goto skip_queue; idx++; @@ -1232,14 +1370,11 @@ void scheduler_tick(int user_ticks, int spin_lock(&rq->lock); /* * The task was running during this tick - update the - * time slice counter and the sleep average. Note: we - * do not update a thread's priority until it either - * goes to sleep or uses up its timeslice. This makes - * it possible for interactive tasks to use up their - * timeslices at their highest priority levels. + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. */ - if (p->sleep_avg) - p->sleep_avg--; if (unlikely(rt_task(p))) { /* * RR tasks need a special form of timeslice management. @@ -1263,12 +1398,39 @@ void scheduler_tick(int user_ticks, int p->time_slice = task_timeslice(p); p->first_time_slice = 0; + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; enqueue_task(p, rq->expired); } else enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least MIN_TIMESLICE to requeue. + */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->time_slice >= (MIN_TIMESLICE * 2)) && + (p->array == rq->active)) { + + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + enqueue_task(p, rq->active); + } } out_unlock: spin_unlock(&rq->lock); @@ -1287,6 +1449,8 @@ asmlinkage void schedule(void) runqueue_t *rq; prio_array_t *array; struct list_head *queue; + unsigned long long now; + unsigned long run_time; int idx; /* @@ -1307,7 +1471,20 @@ need_resched: rq = this_rq(); release_kernel_lock(prev); - prev->last_run = jiffies; + now = sched_clock(); + if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) + run_time = now - prev->timestamp; + else + run_time = NS_MAX_SLEEP_AVG; + + /* + * Tasks with interactive credits get charged less run_time + * at high sleep_avg to delay them losing their interactive + * status + */ + if (HIGH_CREDIT(prev)) + run_time /= (CURRENT_BONUS(prev) ? : 1); + spin_lock_irq(&rq->lock); /* @@ -1355,12 +1532,33 @@ pick_next_task: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); + if (next->activated > 0) { + unsigned long long delta = now - next->timestamp; + + if (next->activated == 1) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + + array = next->array; + dequeue_task(next, array); + recalc_task_prio(next, next->timestamp + delta); + enqueue_task(next, array); + } + next->activated = 0; switch_tasks: prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg <= 0){ + prev->sleep_avg = 0; + if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) + prev->interactive_credit--; + } + prev->timestamp = now; + if (likely(prev != next)) { + next->timestamp = now; rq->nr_switches++; rq->curr = next; @@ -1600,6 +1798,7 @@ void set_user_nice(task_t *p, long nice) unsigned long flags; prio_array_t *array; runqueue_t *rq; + int old_prio, new_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -1608,6 +1807,12 @@ void set_user_nice(task_t *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * not SCHED_NORMAL: + */ if (rt_task(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; @@ -1615,16 +1820,20 @@ void set_user_nice(task_t *p, long nice) array = p->array; if (array) dequeue_task(p, array); + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); - p->prio = NICE_TO_PRIO(nice); + p->prio += delta; + if (array) { enqueue_task(p, array); /* - * If the task is running and lowered its priority, - * or increased its priority then reschedule its CPU: + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: */ - if ((NICE_TO_PRIO(nice) < p->static_prio) || - task_running(rq, p)) + if (delta < 0 || (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } out_unlock: @@ -2381,6 +2590,12 @@ static void move_task_away(struct task_s local_irq_restore(flags); } +typedef struct { + int cpu; + struct completion startup_done; + task_t *task; +} migration_startup_t; + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -2390,20 +2605,21 @@ static int migration_thread(void * data) { /* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */ struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 }; - int cpu = (long) data; + migration_startup_t *startup = data; + int cpu = startup->cpu; runqueue_t *rq; int ret; + startup->task = current; + complete(&startup->startup_done); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + + BUG_ON(smp_processor_id() != cpu); + daemonize("migration/%d", cpu); set_fs(KERNEL_DS); - /* - * Either we are running on the right CPU, or there's a a - * migration thread on this CPU, guaranteed (we're started - * serially). - */ - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - ret = setscheduler(0, SCHED_FIFO, ¶m); rq = this_rq(); @@ -2439,13 +2655,30 @@ static int migration_call(struct notifie unsigned long action, void *hcpu) { + long cpu = (long) hcpu; + migration_startup_t startup; + switch (action) { case CPU_ONLINE: - printk("Starting migration thread for cpu %li\n", - (long)hcpu); - kernel_thread(migration_thread, hcpu, CLONE_KERNEL); - while (!cpu_rq((long)hcpu)->migration_thread) + + printk("Starting migration thread for cpu %li\n", cpu); + + startup.cpu = cpu; + startup.task = NULL; + init_completion(&startup.startup_done); + + kernel_thread(migration_thread, &startup, CLONE_KERNEL); + wait_for_completion(&startup.startup_done); + wait_task_inactive(startup.task); + + startup.task->thread_info->cpu = cpu; + startup.task->cpus_allowed = cpumask_of_cpu(cpu); + + wake_up_process(startup.task); + + while (!cpu_rq(cpu)->migration_thread) yield(); + break; } return NOTIFY_OK;