Index: linux-2.6.16-rc6-mm1/include/linux/sched.h
===================================================================
--- linux-2.6.16-rc6-mm1.orig/include/linux/sched.h	2006-03-13 18:29:40.000000000 +1100
+++ linux-2.6.16-rc6-mm1/include/linux/sched.h	2006-03-13 20:12:22.000000000 +1100
@@ -102,6 +102,7 @@ extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_iowait(void);
+extern unsigned long weighted_cpuload(const int cpu);
 
 #include <linux/time.h>
 #include <linux/param.h>
@@ -546,9 +547,9 @@ enum idle_type
 /*
  * sched-domains (multiprocessor balancing) declarations:
  */
-#ifdef CONFIG_SMP
 #define SCHED_LOAD_SCALE	128UL	/* increase resolution of load */
 
+#ifdef CONFIG_SMP
 #define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
 #define SD_BALANCE_NEWIDLE	2	/* Balance when about to become idle */
 #define SD_BALANCE_EXEC		4	/* Balance on exec */
@@ -637,6 +638,22 @@ extern unsigned int max_cache_size;
 
 #endif	/* CONFIG_SMP */
 
+/*
+ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
+ * SCHED_LOAD_SCALE. This function returns 1 if any online cpu is laden with
+ * a nice 0 task, or with enough lower priority tasks to bring its
+ * weighted_cpuload up to SCHED_LOAD_SCALE, and 0 otherwise.
+ */
+static inline int above_background_load(void)
+{
+	unsigned long cpu;
+
+	for_each_online_cpu(cpu) {
+		if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
+			return 1;
+	}
+	return 0;
+}
 
 struct io_context;			/* See blkdev.h */
 void exit_io_context(void);
@@ -704,8 +721,8 @@ struct task_struct {
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	int oncpu;
 #endif
-	int load_weight;	/* for load balancing purposes */
 #endif
+	int load_weight;	/* for niceness load balancing purposes */
 	int prio, static_prio;
 	struct list_head run_list;
 	prio_array_t *array;
Index: linux-2.6.16-rc6-mm1/kernel/sched.c
===================================================================
--- linux-2.6.16-rc6-mm1.orig/kernel/sched.c	2006-03-13 18:29:40.000000000 +1100
+++ linux-2.6.16-rc6-mm1/kernel/sched.c	2006-03-13 20:12:15.000000000 +1100
@@ -170,12 +170,12 @@
  */
 
 #define SCALE_PRIO(x, prio) \
-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
+	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
 
 static unsigned int static_prio_timeslice(int static_prio)
 {
 	if (static_prio < NICE_TO_PRIO(0))
-		return SCALE_PRIO(DEF_TIMESLICE*4, static_prio);
+		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
 	else
 		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 }
@@ -217,8 +217,8 @@ struct runqueue {
 	 * remote CPUs use both these fields when doing load calculation.
 	 */
 	unsigned long nr_running;
-#ifdef CONFIG_SMP
 	unsigned long raw_weighted_load;
+#ifdef CONFIG_SMP
 	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
@@ -672,7 +672,6 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
-#ifdef CONFIG_SMP
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -695,9 +694,10 @@ static int effective_prio(task_t *p)
 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
 	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
 
-static inline void set_load_weight(task_t *p)
+static void set_load_weight(task_t *p)
 {
 	if (rt_task(p)) {
+#ifdef CONFIG_SMP
 		if (p == task_rq(p)->migration_thread)
 			/*
 			 * The migration thread does the actual balancing.
@@ -706,6 +706,7 @@ static inline void set_load_weight(task_
 			 */
 			p->load_weight = 0;
 		else
+#endif
 			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
 	} else
 		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
@@ -720,19 +721,6 @@ static inline void dec_raw_weighted_load
 {
 	rq->raw_weighted_load -= p->load_weight;
 }
-#else
-static inline void set_load_weight(task_t *p)
-{
-}
-
-static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
-{
-}
-
-static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
-{
-}
-#endif
 
 static inline void inc_nr_running(task_t *p, runqueue_t *rq)
 {
@@ -926,6 +914,12 @@ inline int task_curr(const task_t *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+/* Used instead of source_load when we know the type == 0 */
+unsigned long weighted_cpuload(const int cpu)
+{
+	return cpu_rq(cpu)->raw_weighted_load;
+}
+
 #ifdef CONFIG_SMP
 typedef struct {
 	struct list_head list;
@@ -1126,7 +1120,7 @@ find_idlest_cpu(struct sched_group *grou
 	cpus_and(tmp, group->cpumask, p->cpus_allowed);
 
 	for_each_cpu_mask(i, tmp) {
-		load = source_load(i, 0);
+		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
 			min_load = load;
@@ -2198,7 +2192,7 @@ static runqueue_t *find_busiest_queue(st
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i, 0);
+		load = weighted_cpuload(i);
 
 		if (load > max_load) {
 			max_load = load;
Index: linux-2.6.16-rc6-mm1/mm/swap_prefetch.c
===================================================================
--- linux-2.6.16-rc6-mm1.orig/mm/swap_prefetch.c	2006-03-13 20:11:22.000000000 +1100
+++ linux-2.6.16-rc6-mm1/mm/swap_prefetch.c	2006-03-16 21:06:50.000000000 +1100
@@ -27,8 +27,18 @@
  */
 #define PREFETCH_DELAY	(HZ * 5)
 
-/* sysctl - enable/disable swap prefetching */
-int swap_prefetch __read_mostly = 1;
+#define PREFETCH_NORMAL		(1 << 0)
+#define PREFETCH_AGGRESSIVE 	(1 << 1)
+/*
+ * sysctl - enable/disable swap prefetching bits
+ * This is composed of the bitflags PREFETCH_NORMAL and PREFETCH_AGGRESSIVE.
+ * Once PREFETCH_AGGRESSIVE is set, swap prefetching will be performed as much
+ * as possible irrespective of load conditions and then the
+ * PREFETCH_AGGRESSIVE bit will be unset.
+ */
+int swap_prefetch __read_mostly = PREFETCH_NORMAL;
+
+#define aggressive_prefetch	(unlikely(swap_prefetch & PREFETCH_AGGRESSIVE))
 
 struct swapped_root {
 	unsigned long		busy;		/* vm busy */
@@ -150,21 +160,31 @@ enum trickle_return {
 	TRICKLE_DELAY,
 };
 
+struct node_stats {
+	unsigned long	last_free;
+	/* Free ram after a cycle of prefetching */
+	unsigned long	current_free;
+	/* Free ram on this cycle of checking prefetch_suitable */
+	unsigned long	prefetch_watermark;
+	/* Maximum amount we will prefetch to */
+	unsigned long	highfree[MAX_NR_ZONES];
+	/* The amount of free ram before we start prefetching */
+	unsigned long	lowfree[MAX_NR_ZONES];
+	/* The amount of free ram where we will stop prefetching */
+	unsigned long	*pointfree[MAX_NR_ZONES];
+	/* highfree or lowfree depending on whether we've hit a watermark */
+};
+
 /*
  * prefetch_stats stores the free ram data of each node and this is used to
  * determine if a node is suitable for prefetching into.
  */
-struct prefetch_stats{
-	unsigned long	last_free[MAX_NUMNODES];
-	/* Free ram after a cycle of prefetching */
-	unsigned long	current_free[MAX_NUMNODES];
-	/* Free ram on this cycle of checking prefetch_suitable */
-	unsigned long	prefetch_watermark[MAX_NUMNODES];
-	/* Maximum amount we will prefetch to */
+struct prefetch_stats {
 	nodemask_t	prefetch_nodes;
 	/* Which nodes are currently suited to prefetching */
 	unsigned long	prefetched_pages;
 	/* Total pages we've prefetched on this wakeup of kprefetchd */
+	struct node_stats node[MAX_NUMNODES];
 };
 
 static struct prefetch_stats sp_stat;
@@ -211,7 +231,7 @@ static enum trickle_return trickle_swap_
 	}
 
 	sp_stat.prefetched_pages++;
-	sp_stat.last_free[node]--;
+	sp_stat.node[node].last_free--;
 
 	ret = TRICKLE_SUCCESS;
 out_release:
@@ -229,8 +249,11 @@ static void clear_last_prefetch_free(voi
 	 * update the data to take into account memory hotplug if desired..
 	 */
 	sp_stat.prefetch_nodes = node_online_map;
-	for_each_node_mask(node, sp_stat.prefetch_nodes)
-		sp_stat.last_free[node] = 0;
+	for_each_node_mask(node, sp_stat.prefetch_nodes) {
+		struct node_stats *ns = &sp_stat.node[node];
+
+		ns->last_free = 0;
+	}
 }
 
 static void clear_current_prefetch_free(void)
@@ -238,72 +261,143 @@ static void clear_current_prefetch_free(
 	int node;
 
 	sp_stat.prefetch_nodes = node_online_map;
-	for_each_node_mask(node, sp_stat.prefetch_nodes)
-		sp_stat.current_free[node] = 0;
+	for_each_node_mask(node, sp_stat.prefetch_nodes) {
+		struct node_stats *ns = &sp_stat.node[node];
+
+		ns->current_free = 0;
+	}
 }
 
 /*
- * We want to be absolutely certain it's ok to start prefetching.
+ * This updates the high and low watermarks of free ram in each node that are
+ * used to start and stop prefetching. We prefetch from pages_high * 4 down to
+ * pages_high * 3 (each plus the zone's lowmem_reserve).
  */
-static int prefetch_suitable(void)
+static void examine_free_limits(void)
 {
-	struct page_state ps;
-	unsigned long limit;
 	struct zone *z;
-	int node, ret = 0;
 
-	/* Purposefully racy and might return false positive which is ok */
-	if (__test_and_clear_bit(0, &swapped.busy))
-		goto out;
+	for_each_zone(z) {
+		struct node_stats *ns;
+		int idx;
 
-	clear_current_prefetch_free();
+		if (!populated_zone(z))
+			continue;
+
+		ns = &sp_stat.node[z->zone_pgdat->node_id];
+		idx = zone_idx(z);
+		ns->lowfree[idx] = z->pages_high * 3 + z->lowmem_reserve[idx];
+		ns->highfree[idx] = ns->lowfree[idx] + z->pages_high;
+
+		if (z->free_pages > ns->highfree[idx]) {
+			/*
+			 * We've gotten above the high watermark of free pages
+			 * so we can start prefetching till we get to the low
+			 * watermark.
+			 */
+			ns->pointfree[idx] = &ns->lowfree[idx];
+		}
+	}
+}
+
+/*
+ * Have some hysteresis between where page reclaiming and prefetching
+ * will occur to prevent ping-ponging between them.
+ */
+static void set_suitable_nodes(void)
+{
+	struct zone *z;
 
-	/*
-	 * Have some hysteresis between where page reclaiming and prefetching
-	 * will occur to prevent ping-ponging between them.
-	 */
 	for_each_zone(z) {
+		struct node_stats *ns;
 		unsigned long free;
+		int node, idx;
 
 		if (!populated_zone(z))
 			continue;
+
 		node = z->zone_pgdat->node_id;
+		ns = &sp_stat.node[node];
+		idx = zone_idx(z);
 
 		free = z->free_pages;
-		if (z->pages_high * 3 + z->lowmem_reserve[zone_idx(z)] > free) {
+		if (free < *ns->pointfree[idx]) {
+			/*
+			 * Free pages have dropped below the low watermark so
+			 * we won't start prefetching again till we hit the
+			 * high watermark of free pages.
+			 */
+			ns->pointfree[idx] = &ns->highfree[idx];
 			node_clear(node, sp_stat.prefetch_nodes);
 			continue;
 		}
-		sp_stat.current_free[node] += free;
+		ns->current_free += free;
+	}
+}
+
+/*
+ * We want to be absolutely certain it's ok to start prefetching.
+ */
+static int prefetch_suitable(void)
+{
+	unsigned long limit;
+	int node, ret = 0, test_pagestate = 0;
+
+	if (aggressive_prefetch) {
+		clear_current_prefetch_free();
+		set_suitable_nodes();
+		if (!nodes_empty(sp_stat.prefetch_nodes))
+			ret = 1;
+		goto out;
+	}
+
+	/* Purposefully racy */
+	if (test_bit(0, &swapped.busy)) {
+		__clear_bit(0, &swapped.busy);
+		goto out;
+	}
+
+	/*
+	 * get_page_state and above_background_load are expensive so we only
+	 * perform them every SWAP_CLUSTER_MAX prefetched_pages.
+	 * We test to see if we're above_background_load as disk activity
+	 * even at low priority can cause interrupt induced scheduling
+	 * latencies.
+	 */
+	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
+		if (above_background_load())
+			goto out;
+		test_pagestate = 1;
 	}
 
+	clear_current_prefetch_free();
+	set_suitable_nodes();
+
 	/*
 	 * We iterate over each node testing to see if it is suitable for
 	 * prefetching and clear the nodemask if it is not.
 	 */
 	for_each_node_mask(node, sp_stat.prefetch_nodes) {
+		struct node_stats *ns = &sp_stat.node[node];
+		struct page_state ps;
+
 		/*
 		 * We check to see that pages are not being allocated
 		 * elsewhere at any significant rate implying any
 		 * degree of memory pressure (eg during file reads)
 		 */
-		if (sp_stat.last_free[node]) {
-			if (sp_stat.current_free[node] + SWAP_CLUSTER_MAX <
-				sp_stat.last_free[node]) {
-					sp_stat.last_free[node] =
-						sp_stat.current_free[node];
-					node_clear(node,
-						sp_stat.prefetch_nodes);
-					continue;
+		if (ns->last_free) {
+			if (ns->current_free + SWAP_CLUSTER_MAX <
+			    ns->last_free) {
+				ns->last_free = ns->current_free;
+				node_clear(node,
+					sp_stat.prefetch_nodes);
+				continue;
 			}
 		} else
-			sp_stat.last_free[node] = sp_stat.current_free[node];
+			ns->last_free = ns->current_free;
 
-		/*
-		 * get_page_state is super expensive so we only perform it
-		 * every SWAP_CLUSTER_MAX prefetched_pages
-		 */
-		if (sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)
+		if (!test_pagestate)
 			continue;
 
 		get_page_state_node(&ps, node);
@@ -324,7 +418,7 @@ static int prefetch_suitable(void)
 		 */
 		limit = ps.nr_mapped + ps.nr_slab + ps.nr_dirty +
 			ps.nr_unstable + total_swapcache_pages;
-		if (limit > sp_stat.prefetch_watermark[node]) {
+		if (limit > ns->prefetch_watermark) {
 			node_clear(node, sp_stat.prefetch_nodes);
 			continue;
 		}
@@ -350,6 +444,17 @@ static inline struct swapped_entry *prev
 		struct swapped_entry, swapped_list);
 }
 
+static unsigned long pages_prefetched(void)
+{
+	unsigned long pages = sp_stat.prefetched_pages;
+
+	if (pages) {
+		lru_add_drain();
+		sp_stat.prefetched_pages = 0;
+	}
+	return pages;
+}
+
 /*
  * trickle_swap is the main function that initiates the swap prefetching. It
  * first checks to see if the busy flag is set, and does not prefetch if it
@@ -367,9 +472,10 @@ static enum trickle_return trickle_swap(
 	 * If laptop_mode is enabled don't prefetch to avoid hard drives
 	 * doing unnecessary spin-ups
 	 */
-	if (!swap_prefetch || laptop_mode)
+	if (!swap_prefetch || (laptop_mode && !aggressive_prefetch))
 		return ret;
 
+	examine_free_limits();
 	entry = NULL;
 
 	for ( ; ; ) {
@@ -402,6 +508,14 @@ static enum trickle_return trickle_swap(
 			 * delay attempting further prefetching.
 			 */
 			spin_unlock(&swapped.lock);
+			if (aggressive_prefetch) {
+				/*
+				 * If we're prefetching aggressively and
+				 * making progress then don't give up.
+				 */
+				if (pages_prefetched())
+					continue;
+			}
 			break;
 		}
 
@@ -419,14 +533,15 @@ static enum trickle_return trickle_swap(
 		entry = prev_swapped_entry(entry);
 		spin_unlock(&swapped.lock);
 
-		if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY)
+		if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY &&
+		    !aggressive_prefetch)
 			break;
 	}
 
-	if (sp_stat.prefetched_pages) {
-		lru_add_drain();
-		sp_stat.prefetched_pages = 0;
-	}
+	/* Return value of pages_prefetched irrelevant here */
+	pages_prefetched();
+	if (aggressive_prefetch)
+		swap_prefetch &= ~PREFETCH_AGGRESSIVE;
 	return ret;
 }
 
@@ -459,8 +574,7 @@ static int kprefetchd(void *__unused)
  */
 void __init prepare_swap_prefetch(void)
 {
-	pg_data_t *pgdat;
-	int node;
+	struct zone *zone;
 
 	swapped.cache = kmem_cache_create("swapped_entry",
 		sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL);
@@ -471,14 +585,19 @@ void __init prepare_swap_prefetch(void)
 	 */
 	swapped.maxcount = nr_free_pagecache_pages() / 3 * 2;
 
-	for_each_online_pgdat(pgdat) {
+	for_each_zone(zone) {
 		unsigned long present;
+		struct node_stats *ns;
+		int idx;
 
-		present = pgdat->node_present_pages;
+		present = zone->present_pages;
 		if (!present)
 			continue;
-		node = pgdat->node_id;
-		sp_stat.prefetch_watermark[node] += present / 3 * 2;
+
+		ns = &sp_stat.node[zone->zone_pgdat->node_id];
+		ns->prefetch_watermark += present / 3 * 2;
+		idx = zone_idx(zone);
+		ns->pointfree[idx] = &ns->highfree[idx];
 	}
 }
 
Index: linux-2.6.16-rc6-mm1/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.16-rc6-mm1.orig/Documentation/sysctl/vm.txt	2006-03-13 10:04:51.000000000 +1100
+++ linux-2.6.16-rc6-mm1/Documentation/sysctl/vm.txt	2006-03-16 21:10:42.000000000 +1100
@@ -188,4 +188,13 @@ memory subsystem has been extremely idle
 copying back pages from swap into the swapcache and keep a copy in swap. In
 practice it can take many minutes before the vm is idle enough.
 
+The value is a bitwise OR of the following:
+1	= Normal background swap prefetching when load is light
+2	= Aggressively swap prefetch as much as possible
+
+When 2 is set, this bit is unset again once the maximum amount possible has
+been prefetched, i.e. setting the value to 3 will prefetch aggressively and
+then drop to 1. This is useful for doing aggressive prefetching for short
+periods in scripts, such as after resuming from software suspend.
+
 The default value is 1.
