Index: linux-2.6.16-rc6-mm1/include/linux/sched.h =================================================================== --- linux-2.6.16-rc6-mm1.orig/include/linux/sched.h 2006-03-13 18:29:40.000000000 +1100 +++ linux-2.6.16-rc6-mm1/include/linux/sched.h 2006-03-13 20:12:22.000000000 +1100 @@ -102,6 +102,7 @@ extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); +extern unsigned long weighted_cpuload(const int cpu); #include #include @@ -546,9 +547,9 @@ enum idle_type /* * sched-domains (multiprocessor balancing) declarations: */ -#ifdef CONFIG_SMP #define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ +#ifdef CONFIG_SMP #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ #define SD_BALANCE_EXEC 4 /* Balance on exec */ @@ -637,6 +638,22 @@ extern unsigned int max_cache_size; #endif /* CONFIG_SMP */ +/* + * A runqueue laden with a single nice 0 task scores a weighted_cpuload of + * SCHED_LOAD_SCALE. 
This function returns 1 if any cpu is laden with a + * task of nice 0 or enough lower priority tasks to bring the + * weighted_cpuload up to SCHED_LOAD_SCALE. + */ +static inline int above_background_load(void) +{ + unsigned long cpu; + + for_each_online_cpu(cpu) { + if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) + return 1; + } + return 0; +} struct io_context; /* See blkdev.h */ void exit_io_context(void); @@ -704,8 +721,8 @@ struct task_struct { #ifdef __ARCH_WANT_UNLOCKED_CTXSW int oncpu; #endif - int load_weight; /* for load balancing purposes */ #endif + int load_weight; /* for niceness load balancing purposes */ int prio, static_prio; struct list_head run_list; prio_array_t *array; Index: linux-2.6.16-rc6-mm1/kernel/sched.c =================================================================== --- linux-2.6.16-rc6-mm1.orig/kernel/sched.c 2006-03-13 18:29:40.000000000 +1100 +++ linux-2.6.16-rc6-mm1/kernel/sched.c 2006-03-13 20:12:15.000000000 +1100 @@ -170,12 +170,12 @@ */ #define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) static unsigned int static_prio_timeslice(int static_prio) { if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE*4, static_prio); + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); else return SCALE_PRIO(DEF_TIMESLICE, static_prio); } @@ -217,8 +217,8 @@ struct runqueue { * remote CPUs use both these fields when doing load calculation.
*/ unsigned long nr_running; -#ifdef CONFIG_SMP unsigned long raw_weighted_load; +#ifdef CONFIG_SMP unsigned long cpu_load[3]; #endif unsigned long long nr_switches; @@ -672,7 +672,6 @@ static int effective_prio(task_t *p) return prio; } -#ifdef CONFIG_SMP /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -695,9 +694,10 @@ static int effective_prio(task_t *p) #define RTPRIO_TO_LOAD_WEIGHT(rp) \ (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) -static inline void set_load_weight(task_t *p) +static void set_load_weight(task_t *p) { if (rt_task(p)) { +#ifdef CONFIG_SMP if (p == task_rq(p)->migration_thread) /* * The migration thread does the actual balancing. @@ -706,6 +706,7 @@ static inline void set_load_weight(task_ */ p->load_weight = 0; else +#endif p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); } else p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); @@ -720,19 +721,6 @@ static inline void dec_raw_weighted_load { rq->raw_weighted_load -= p->load_weight; } -#else -static inline void set_load_weight(task_t *p) -{ -} - -static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) -{ -} - -static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p) -{ -} -#endif static inline void inc_nr_running(task_t *p, runqueue_t *rq) { @@ -926,6 +914,12 @@ inline int task_curr(const task_t *p) return cpu_curr(task_cpu(p)) == p; } +/* Used instead of source_load when we know the type == 0 */ +unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->raw_weighted_load; +} + #ifdef CONFIG_SMP typedef struct { struct list_head list; @@ -1126,7 +1120,7 @@ find_idlest_cpu(struct sched_group *grou cpus_and(tmp, group->cpumask, p->cpus_allowed); for_each_cpu_mask(i, tmp) { - load = source_load(i, 0); + load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; @@ -2198,7 +2192,7 
@@ static runqueue_t *find_busiest_queue(st int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i, 0); + load = weighted_cpuload(i); if (load > max_load) { max_load = load; Index: linux-2.6.16-rc6-mm1/mm/swap_prefetch.c =================================================================== --- linux-2.6.16-rc6-mm1.orig/mm/swap_prefetch.c 2006-03-13 20:11:22.000000000 +1100 +++ linux-2.6.16-rc6-mm1/mm/swap_prefetch.c 2006-03-16 21:06:50.000000000 +1100 @@ -27,8 +27,18 @@ */ #define PREFETCH_DELAY (HZ * 5) -/* sysctl - enable/disable swap prefetching */ -int swap_prefetch __read_mostly = 1; +#define PREFETCH_NORMAL (1 << 0) +#define PREFETCH_AGGRESSIVE (1 << 1) +/* + * sysctl - enable/disable swap prefetching bits + * This is composed of the bitflags PREFETCH_NORMAL and PREFETCH_AGGRESSIVE. + * Once PREFETCH_AGGRESSIVE is set, swap prefetching will be performed as much + * as possible irrespective of load conditions and then the + * PREFETCH_AGGRESSIVE bit will be unset. + */ +int swap_prefetch __read_mostly = PREFETCH_NORMAL; + +#define aggressive_prefetch (unlikely(swap_prefetch & PREFETCH_AGGRESSIVE)) struct swapped_root { unsigned long busy; /* vm busy */ @@ -150,21 +160,31 @@ enum trickle_return { TRICKLE_DELAY, }; +struct node_stats { + unsigned long last_free; + /* Free ram after a cycle of prefetching */ + unsigned long current_free; + /* Free ram on this cycle of checking prefetch_suitable */ + unsigned long prefetch_watermark; + /* Maximum amount we will prefetch to */ + unsigned long highfree[MAX_NR_ZONES]; + /* The amount of free ram before we start prefetching */ + unsigned long lowfree[MAX_NR_ZONES]; + /* The amount of free ram where we will stop prefetching */ + unsigned long *pointfree[MAX_NR_ZONES]; + /* highfree or lowfree depending on whether we've hit a watermark */ +}; + /* * prefetch_stats stores the free ram data of each node and this is used to * determine if a node is suitable for prefetching into.
*/ -struct prefetch_stats{ - unsigned long last_free[MAX_NUMNODES]; - /* Free ram after a cycle of prefetching */ - unsigned long current_free[MAX_NUMNODES]; - /* Free ram on this cycle of checking prefetch_suitable */ - unsigned long prefetch_watermark[MAX_NUMNODES]; - /* Maximum amount we will prefetch to */ +struct prefetch_stats { nodemask_t prefetch_nodes; /* Which nodes are currently suited to prefetching */ unsigned long prefetched_pages; /* Total pages we've prefetched on this wakeup of kprefetchd */ + struct node_stats node[MAX_NUMNODES]; }; static struct prefetch_stats sp_stat; @@ -211,7 +231,7 @@ static enum trickle_return trickle_swap_ } sp_stat.prefetched_pages++; - sp_stat.last_free[node]--; + sp_stat.node[node].last_free--; ret = TRICKLE_SUCCESS; out_release: @@ -229,8 +249,11 @@ static void clear_last_prefetch_free(voi * update the data to take into account memory hotplug if desired.. */ sp_stat.prefetch_nodes = node_online_map; - for_each_node_mask(node, sp_stat.prefetch_nodes) - sp_stat.last_free[node] = 0; + for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + + ns->last_free = 0; + } } static void clear_current_prefetch_free(void) @@ -238,72 +261,143 @@ static void clear_current_prefetch_free( int node; sp_stat.prefetch_nodes = node_online_map; - for_each_node_mask(node, sp_stat.prefetch_nodes) - sp_stat.current_free[node] = 0; + for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + + ns->current_free = 0; + } } /* - * We want to be absolutely certain it's ok to start prefetching. + * This updates the high and low watermarks of amount of free ram in each + * node used to start and stop prefetching. We prefetch from pages_high * 4 + * down to pages_high * 3. 
*/ -static int prefetch_suitable(void) +static void examine_free_limits(void) { - struct page_state ps; - unsigned long limit; struct zone *z; - int node, ret = 0; - /* Purposefully racy and might return false positive which is ok */ - if (__test_and_clear_bit(0, &swapped.busy)) - goto out; + for_each_zone(z) { + struct node_stats *ns; + int idx; - clear_current_prefetch_free(); + if (!populated_zone(z)) + continue; + + ns = &sp_stat.node[z->zone_pgdat->node_id]; + idx = zone_idx(z); + ns->lowfree[idx] = z->pages_high * 3 + z->lowmem_reserve[idx]; + ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; + + if (z->free_pages > ns->highfree[idx]) { + /* + * We've gotten above the high watermark of free pages + * so we can start prefetching till we get to the low + * watermark. + */ + ns->pointfree[idx] = &ns->lowfree[idx]; + } + } +} + +/* + * Have some hysteresis between where page reclaiming and prefetching + * will occur to prevent ping-ponging between them. + */ +static void set_suitable_nodes(void) +{ + struct zone *z; - /* - * Have some hysteresis between where page reclaiming and prefetching - * will occur to prevent ping-ponging between them. - */ for_each_zone(z) { + struct node_stats *ns; unsigned long free; + int node, idx; if (!populated_zone(z)) continue; + node = z->zone_pgdat->node_id; + ns = &sp_stat.node[node]; + idx = zone_idx(z); free = z->free_pages; - if (z->pages_high * 3 + z->lowmem_reserve[zone_idx(z)] > free) { + if (free < *ns->pointfree[idx]) { + /* + * Free pages have dropped below the low watermark so + * we won't start prefetching again till we hit the + * high watermark of free pages. + */ + ns->pointfree[idx] = &ns->highfree[idx]; node_clear(node, sp_stat.prefetch_nodes); continue; } - sp_stat.current_free[node] += free; + ns->current_free += free; + } +} + +/* + * We want to be absolutely certain it's ok to start prefetching. 
+ */ +static int prefetch_suitable(void) +{ + unsigned long limit; + int node, ret = 0, test_pagestate = 0; + + if (aggressive_prefetch) { + clear_current_prefetch_free(); + set_suitable_nodes(); + if (!nodes_empty(sp_stat.prefetch_nodes)) + ret = 1; + goto out; + } + + /* Purposefully racy */ + if (test_bit(0, &swapped.busy)) { + __clear_bit(0, &swapped.busy); + goto out; + } + + /* + * get_page_state and above_background_load are expensive so we only + * perform them every SWAP_CLUSTER_MAX prefetched_pages. + * We test to see if we're above_background_load as disk activity + * even at low priority can cause interrupt induced scheduling + * latencies. + */ + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) { + if (above_background_load()) + goto out; + test_pagestate = 1; } + clear_current_prefetch_free(); + set_suitable_nodes(); + /* * We iterate over each node testing to see if it is suitable for * prefetching and clear the nodemask if it is not. */ for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + struct page_state ps; + /* * We check to see that pages are not being allocated * elsewhere at any significant rate implying any * degree of memory pressure (eg during file reads) */ - if (sp_stat.last_free[node]) { - if (sp_stat.current_free[node] + SWAP_CLUSTER_MAX < - sp_stat.last_free[node]) { - sp_stat.last_free[node] = - sp_stat.current_free[node]; - node_clear(node, - sp_stat.prefetch_nodes); - continue; + if (ns->last_free) { + if (ns->current_free + SWAP_CLUSTER_MAX < + ns->last_free) { + ns->last_free = ns->current_free; + node_clear(node, + sp_stat.prefetch_nodes); + continue; } } else - sp_stat.last_free[node] = sp_stat.current_free[node]; + ns->last_free = ns->current_free; - /* - * get_page_state is super expensive so we only perform it - * every SWAP_CLUSTER_MAX prefetched_pages - */ - if (sp_stat.prefetched_pages % SWAP_CLUSTER_MAX) + if (!test_pagestate) continue; get_page_state_node(&ps, node); @@ 
-324,7 +418,7 @@ static int prefetch_suitable(void) */ limit = ps.nr_mapped + ps.nr_slab + ps.nr_dirty + ps.nr_unstable + total_swapcache_pages; - if (limit > sp_stat.prefetch_watermark[node]) { + if (limit > ns->prefetch_watermark) { node_clear(node, sp_stat.prefetch_nodes); continue; } @@ -350,6 +444,17 @@ static inline struct swapped_entry *prev struct swapped_entry, swapped_list); } +static unsigned long pages_prefetched(void) +{ + unsigned long pages = sp_stat.prefetched_pages; + + if (pages) { + lru_add_drain(); + sp_stat.prefetched_pages = 0; + } + return pages; +} + /* * trickle_swap is the main function that initiates the swap prefetching. It * first checks to see if the busy flag is set, and does not prefetch if it @@ -367,9 +472,10 @@ static enum trickle_return trickle_swap( * If laptop_mode is enabled don't prefetch to avoid hard drives * doing unnecessary spin-ups */ - if (!swap_prefetch || laptop_mode) + if (!swap_prefetch || (laptop_mode && !aggressive_prefetch)) return ret; + examine_free_limits(); entry = NULL; for ( ; ; ) { @@ -402,6 +508,14 @@ static enum trickle_return trickle_swap( * delay attempting further prefetching. */ spin_unlock(&swapped.lock); + if (aggressive_prefetch) { + /* + * If we're prefetching aggressively and + * making progress then don't give up. 
+ */ + if (pages_prefetched()) + continue; + } break; } @@ -419,14 +533,15 @@ static enum trickle_return trickle_swap( entry = prev_swapped_entry(entry); spin_unlock(&swapped.lock); - if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) + if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY && + !aggressive_prefetch) break; } - if (sp_stat.prefetched_pages) { - lru_add_drain(); - sp_stat.prefetched_pages = 0; - } + /* Return value of pages_prefetched irrelevant here */ + pages_prefetched(); + if (aggressive_prefetch) + swap_prefetch &= ~PREFETCH_AGGRESSIVE; return ret; } @@ -459,8 +574,7 @@ static int kprefetchd(void *__unused) */ void __init prepare_swap_prefetch(void) { - pg_data_t *pgdat; - int node; + struct zone *zone; swapped.cache = kmem_cache_create("swapped_entry", sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); @@ -471,14 +585,19 @@ void __init prepare_swap_prefetch(void) */ swapped.maxcount = nr_free_pagecache_pages() / 3 * 2; - for_each_online_pgdat(pgdat) { + for_each_zone(zone) { unsigned long present; + struct node_stats *ns; + int idx; - present = pgdat->node_present_pages; + present = zone->present_pages; if (!present) continue; - node = pgdat->node_id; - sp_stat.prefetch_watermark[node] += present / 3 * 2; + + ns = &sp_stat.node[zone->zone_pgdat->node_id]; + ns->prefetch_watermark += present / 3 * 2; + idx = zone_idx(zone); + ns->pointfree[idx] = &ns->highfree[idx]; } } Index: linux-2.6.16-rc6-mm1/Documentation/sysctl/vm.txt =================================================================== --- linux-2.6.16-rc6-mm1.orig/Documentation/sysctl/vm.txt 2006-03-13 10:04:51.000000000 +1100 +++ linux-2.6.16-rc6-mm1/Documentation/sysctl/vm.txt 2006-03-16 21:10:42.000000000 +1100 @@ -188,4 +188,13 @@ memory subsystem has been extremely idle copying back pages from swap into the swapcache and keep a copy in swap. In practice it can take many minutes before the vm is idle enough. 
+The value is a bitwise OR of the following flags: +1 = Normal background swap prefetching when load is light +2 = Aggressively swap prefetch as much as possible + +When 2 is set, after the maximum amount possible has been prefetched, this bit +is unset, i.e. setting the value to 3 will prefetch aggressively then drop to 1. +This is useful for doing aggressive prefetching for short periods in scripts +such as after resuming from software suspend. + The default value is 1.