Index: linux-2.6.21-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.21-ck1.orig/include/linux/sched.h	2007-05-14 09:24:26.000000000 +1000
+++ linux-2.6.21-ck1/include/linux/sched.h	2007-05-14 09:24:44.000000000 +1000
@@ -133,7 +133,7 @@ extern unsigned long nr_uninterruptible(
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
 extern unsigned long weighted_cpuload(const int cpu);
-
+extern int above_background_load(void);
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -749,22 +749,6 @@ extern unsigned int max_cache_size;
 
 #endif	/* CONFIG_SMP */
 
-/*
- * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
- * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
- * task of nice 0 or enough lower priority tasks to bring up the
- * weighted_cpuload
- */
-static inline int above_background_load(void)
-{
-	unsigned long cpu;
-
-	for_each_online_cpu(cpu) {
-		if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
-			return 1;
-	}
-	return 0;
-}
 
 struct io_context;			/* See blkdev.h */
 struct cpuset;
Index: linux-2.6.21-ck1/kernel/sched.c
===================================================================
--- linux-2.6.21-ck1.orig/kernel/sched.c	2007-05-14 09:24:31.000000000 +1000
+++ linux-2.6.21-ck1/kernel/sched.c	2007-05-14 09:24:44.000000000 +1000
@@ -1082,6 +1082,37 @@ static int effective_prio(struct task_st
 	return p->prio;
 }
 
+static inline int nice_quota(int nice)
+{
+	int rr = rr_interval;
+
+	if (nice < -6) {
+		rr *= nice * nice;
+		rr /= 40;
+	} else if (nice > 0)
+		rr = rr / 2 ? : 1;
+	return MS_TO_US(rr);
+}
+
+#define DEFAULT_TIMESLICE	(nice_quota(0) * 20 * PRIO_RANGE)
+
+/*
+ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
+ * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
+ * task of nice 0 or enough lower priority tasks to bring up the
+ * weighted_cpuload
+ */
+int above_background_load(void)
+{
+	unsigned long cpu;
+
+	for_each_online_cpu(cpu) {
+		if (weighted_cpuload(cpu) >= US_TO_MS(DEFAULT_TIMESLICE))
+			return 1;
+	}
+	return 0;
+}
+
 /*
  * All tasks have quotas based on rr_interval. RT tasks all get rr_interval.
  * From nice 1 to 19 they are smaller than it only if they are at least one
@@ -1092,16 +1123,9 @@ static int effective_prio(struct task_st
  */
 static inline unsigned int rr_quota(struct task_struct *p)
 {
-	int nice = TASK_NICE(p), rr = rr_interval;
-
-	if (!rt_task(p)) {
-		if (nice < -6) {
-			rr *= nice * nice;
-			rr /= 40;
-		} else if (nice > 0)
-			rr = rr / 2 ? : 1;
-	}
-	return MS_TO_US(rr);
+	if (rt_task(p))
+		return rr_interval;
+	return nice_quota(TASK_NICE(p));
 }
 
 /* Every time we set the quota we need to set the load weight */
Index: linux-2.6.21-ck1/mm/page_io.c
===================================================================
--- linux-2.6.21-ck1.orig/mm/page_io.c	2007-02-14 09:09:44.000000000 +1100
+++ linux-2.6.21-ck1/mm/page_io.c	2007-05-14 09:24:44.000000000 +1000
@@ -17,6 +17,7 @@
 #include <linux/bio.h>
 #include <linux/swapops.h>
 #include <linux/writeback.h>
+#include <linux/swap-prefetch.h>
 #include <asm/pgtable.h>
 
 static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
@@ -118,6 +119,7 @@ int swap_writepage(struct page *page, st
 		ret = -ENOMEM;
 		goto out;
 	}
+	add_to_swapped_list(page);
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		rw |= (1 << BIO_RW_SYNC);
 	count_vm_event(PSWPOUT);
Index: linux-2.6.21-ck1/mm/swap_state.c
===================================================================
--- linux-2.6.21-ck1.orig/mm/swap_state.c	2007-05-14 09:24:27.000000000 +1000
+++ linux-2.6.21-ck1/mm/swap_state.c	2007-05-14 09:24:44.000000000 +1000
@@ -83,7 +83,6 @@ static int __add_to_swap_cache(struct pa
 		error = radix_tree_insert(&swapper_space.page_tree,
 						entry.val, page);
 		if (!error) {
-			remove_from_swapped_list(entry.val);
 			page_cache_get(page);
 			SetPageLocked(page);
 			SetPageSwapCache(page);
@@ -102,7 +101,6 @@ int add_to_swap_cache(struct page *page,
 	int error;
 
 	if (!swap_duplicate(entry)) {
-		remove_from_swapped_list(entry.val);
 		INC_CACHE_INFO(noent_race);
 		return -ENOENT;
 	}
Index: linux-2.6.21-ck1/mm/vmscan.c
===================================================================
--- linux-2.6.21-ck1.orig/mm/vmscan.c	2007-05-14 09:24:29.000000000 +1000
+++ linux-2.6.21-ck1/mm/vmscan.c	2007-05-14 09:24:44.000000000 +1000
@@ -427,7 +427,6 @@ int remove_mapping(struct address_space 
 
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
-		add_to_swapped_list(page);
 		__delete_from_swap_cache(page);
 		write_unlock_irq(&mapping->tree_lock);
 		swap_free(swap);
Index: linux-2.6.21-ck1/mm/swap_prefetch.c
===================================================================
--- linux-2.6.21-ck1.orig/mm/swap_prefetch.c	2007-05-14 09:24:27.000000000 +1000
+++ linux-2.6.21-ck1/mm/swap_prefetch.c	2007-05-14 10:21:47.000000000 +1000
@@ -27,7 +27,8 @@
  * needs to be at least this duration of idle time meaning in practice it can
  * be much longer
  */
-#define PREFETCH_DELAY	(HZ * 5)
+#define PREFETCH_DELAY		(HZ * 5)
+#define DISABLED_PREFETCH_DELAY	(HZ * 60)
 
 /* sysctl - enable/disable swap prefetching */
 int swap_prefetch __read_mostly = 1;
@@ -61,19 +62,30 @@ inline void delay_swap_prefetch(void)
 }
 
 /*
+ * If laptop_mode is enabled don't prefetch to avoid hard drives
+ * doing unnecessary spin-ups unless swap_prefetch is explicitly
+ * set to a higher value.
+ */
+static inline int prefetch_enabled(void)
+{
+	if (swap_prefetch <= laptop_mode)
+		return 0;
+	return 1;
+}
+
+static int wakeup_kprefetchd;
+
+/*
  * Drop behind accounting which keeps a list of the most recently used swap
- * entries.
+ * entries. Entries are removed lazily by kprefetchd.
  */
 void add_to_swapped_list(struct page *page)
 {
 	struct swapped_entry *entry;
 	unsigned long index, flags;
-	int wakeup;
-
-	if (!swap_prefetch)
-		return;
 
-	wakeup = 0;
+	if (!prefetch_enabled())
+		goto out;
 
 	spin_lock_irqsave(&swapped.lock, flags);
 	if (swapped.count >= swapped.maxcount) {
@@ -103,23 +115,16 @@ void add_to_swapped_list(struct page *pa
 	store_swap_entry_node(entry, page);
 
 	if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) {
-		/*
-		 * If this is the first entry, kprefetchd needs to be
-		 * (re)started.
-		 */
-		if (!swapped.count)
-			wakeup = 1;
 		list_add(&entry->swapped_list, &swapped.list);
 		swapped.count++;
-	}
+	} else
+		kmem_cache_free(swapped.cache, entry);
 
 out_locked:
 	spin_unlock_irqrestore(&swapped.lock, flags);
-
-	/* Do the wakeup outside the lock to shorten lock hold time. */
-	if (wakeup)
+out:
+	if (wakeup_kprefetchd)
 		wake_up_process(kprefetchd_task);
-
 	return;
 }
 
@@ -139,7 +144,7 @@ void remove_from_swapped_list(const unsi
 	spin_lock_irqsave(&swapped.lock, flags);
 	entry = radix_tree_delete(&swapped.swap_tree, index);
 	if (likely(entry)) {
-		list_del_init(&entry->swapped_list);
+		list_del(&entry->swapped_list);
 		swapped.count--;
 		kmem_cache_free(swapped.cache, entry);
 	}
@@ -153,18 +158,18 @@ enum trickle_return {
 };
 
 struct node_stats {
-	unsigned long	last_free;
 	/* Free ram after a cycle of prefetching */
-	unsigned long	current_free;
+	unsigned long	last_free;
 	/* Free ram on this cycle of checking prefetch_suitable */
-	unsigned long	prefetch_watermark;
+	unsigned long	current_free;
 	/* Maximum amount we will prefetch to */
-	unsigned long	highfree[MAX_NR_ZONES];
+	unsigned long	prefetch_watermark;
 	/* The amount of free ram before we start prefetching */
-	unsigned long	lowfree[MAX_NR_ZONES];
+	unsigned long	highfree[MAX_NR_ZONES];
 	/* The amount of free ram where we will stop prefetching */
-	unsigned long	*pointfree[MAX_NR_ZONES];
+	unsigned long	lowfree[MAX_NR_ZONES];
 	/* highfree or lowfree depending on whether we've hit a watermark */
+	unsigned long	*pointfree[MAX_NR_ZONES];
 };
 
 /*
@@ -172,10 +177,10 @@ struct node_stats {
  * determine if a node is suitable for prefetching into.
  */
 struct prefetch_stats {
-	nodemask_t	prefetch_nodes;
 	/* Which nodes are currently suited to prefetching */
-	unsigned long	prefetched_pages;
+	nodemask_t	prefetch_nodes;
 	/* Total pages we've prefetched on this wakeup of kprefetchd */
+	unsigned long	prefetched_pages;
 	struct node_stats node[MAX_NUMNODES];
 };
 
@@ -189,16 +194,15 @@ static enum trickle_return trickle_swap_
 	const int node)
 {
 	enum trickle_return ret = TRICKLE_FAILED;
+	unsigned long flags;
 	struct page *page;
 
-	read_lock_irq(&swapper_space.tree_lock);
+	read_lock_irqsave(&swapper_space.tree_lock, flags);
 	/* Entry may already exist */
 	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
-	read_unlock_irq(&swapper_space.tree_lock);
-	if (page) {
-		remove_from_swapped_list(entry.val);
+	read_unlock_irqrestore(&swapper_space.tree_lock, flags);
+	if (page)
 		goto out;
-	}
 
 	/*
 	 * Get a new page to read from swap. We have already checked the
@@ -217,10 +221,8 @@ static enum trickle_return trickle_swap_
 
 	/* Add them to the tail of the inactive list to preserve LRU order */
 	lru_cache_add_tail(page);
-	if (unlikely(swap_readpage(NULL, page))) {
-		ret = TRICKLE_DELAY;
+	if (unlikely(swap_readpage(NULL, page)))
 		goto out_release;
-	}
 
 	sp_stat.prefetched_pages++;
 	sp_stat.node[node].last_free--;
@@ -229,6 +231,12 @@ static enum trickle_return trickle_swap_
 out_release:
 	page_cache_release(page);
 out:
+	/*
+	 * All entries are removed here lazily. This avoids the cost of
+	 * remove_from_swapped_list during normal swapin. Thus there are
+	 * usually many stale entries.
+	 */
+	remove_from_swapped_list(entry.val);
 	return ret;
 }
 
@@ -410,17 +418,6 @@ out:
 }
 
 /*
- * Get previous swapped entry when iterating over all entries. swapped.lock
- * should be held and we should already ensure that entry exists.
- */
-static inline struct swapped_entry *prev_swapped_entry
-	(struct swapped_entry *entry)
-{
-	return list_entry(entry->swapped_list.prev->prev,
-		struct swapped_entry, swapped_list);
-}
-
-/*
  * trickle_swap is the main function that initiates the swap prefetching. It
  * first checks to see if the busy flag is set, and does not prefetch if it
  * is, as the flag implied we are low on memory or swapping in currently.
@@ -431,70 +428,49 @@ static inline struct swapped_entry *prev
 static enum trickle_return trickle_swap(void)
 {
 	enum trickle_return ret = TRICKLE_DELAY;
-	struct swapped_entry *entry;
+	struct swapped_entry *pos, *n;
 	unsigned long flags;
 
-	/*
-	 * If laptop_mode is enabled don't prefetch to avoid hard drives
-	 * doing unnecessary spin-ups
-	 */
-	if (!swap_prefetch || laptop_mode)
+	if (!prefetch_enabled())
 		return ret;
 
 	examine_free_limits();
-	entry = NULL;
+	if (!prefetch_suitable())
+		return ret;
+	if (list_empty(&swapped.list))
+		return TRICKLE_FAILED;
 
-	for ( ; ; ) {
+	spin_lock_irqsave(&swapped.lock, flags);
+	list_for_each_entry_safe_reverse(pos, n, &swapped.list, swapped_list) {
 		swp_entry_t swp_entry;
 		int node;
 
-		if (!prefetch_suitable())
-			break;
+		spin_unlock_irqrestore(&swapped.lock, flags);
+		/* Yield to anything else running */
+		if (cond_resched() || !prefetch_suitable())
+			goto out_unlocked;
 
 		spin_lock_irqsave(&swapped.lock, flags);
-		if (list_empty(&swapped.list)) {
-			ret = TRICKLE_FAILED;
-			spin_unlock_irqrestore(&swapped.lock, flags);
-			break;
-		}
-
-		if (!entry) {
-			/*
-			 * This sets the entry for the first iteration. It
-			 * also is a safeguard against the entry disappearing
-			 * while the lock is not held.
-			 */
-			entry = list_entry(swapped.list.prev,
-				struct swapped_entry, swapped_list);
-		} else if (entry->swapped_list.prev == swapped.list.next) {
-			/*
-			 * If we have iterated over all entries and there are
-			 * still entries that weren't swapped out there may
-			 * be a reason we could not swap them back in so
-			 * delay attempting further prefetching.
-			 */
-			spin_unlock_irqrestore(&swapped.lock, flags);
-			break;
-		}
-
-		node = get_swap_entry_node(entry);
+		if (unlikely(!pos))
+			continue;
+		node = get_swap_entry_node(pos);
 		if (!node_isset(node, sp_stat.prefetch_nodes)) {
 			/*
 			 * We found an entry that belongs to a node that is
 			 * not suitable for prefetching so skip it.
 			 */
-			entry = prev_swapped_entry(entry);
-			spin_unlock_irqrestore(&swapped.lock, flags);
 			continue;
 		}
-		swp_entry = entry->swp_entry;
-		entry = prev_swapped_entry(entry);
+		swp_entry = pos->swp_entry;
 		spin_unlock_irqrestore(&swapped.lock, flags);
 
 		if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY)
-			break;
+			goto out_unlocked;
+		spin_lock_irqsave(&swapped.lock, flags);
 	}
+	spin_unlock_irqrestore(&swapped.lock, flags);
 
+out_unlocked:
 	if (sp_stat.prefetched_pages) {
 		lru_add_drain();
 		sp_stat.prefetched_pages = 0;
@@ -509,13 +485,14 @@ static int kprefetchd(void *__unused)
 	sched_setscheduler(current, SCHED_BATCH, &param);
 	set_user_nice(current, 19);
 	/* Set ioprio to lowest if supported by i/o scheduler */
-	sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE);
+	sys_ioprio_set(IOPRIO_WHO_PROCESS, IOPRIO_BE_NR - 1, IOPRIO_CLASS_BE);
 
 	/* kprefetchd has nothing to do until it is woken up the first time */
+	wakeup_kprefetchd = 1;
 	set_current_state(TASK_INTERRUPTIBLE);
 	schedule();
 
-	do {
+	while (!kthread_should_stop()) {
 		try_to_freeze();
 
 		/*
@@ -523,13 +500,17 @@ static int kprefetchd(void *__unused)
 		 * a wakeup, and further delay the next one.
 		 */
 		if (trickle_swap() == TRICKLE_FAILED) {
+			wakeup_kprefetchd = 1;
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule();
-		}
+		} else
+			wakeup_kprefetchd = 0;
 		clear_last_prefetch_free();
-		schedule_timeout_interruptible(PREFETCH_DELAY);
-	} while (!kthread_should_stop());
-
+		if (!prefetch_enabled())
+			schedule_timeout_interruptible(DISABLED_PREFETCH_DELAY);
+		else
+			schedule_timeout_interruptible(PREFETCH_DELAY);
+	}
 	return 0;
 }
 
Index: linux-2.6.21-ck1/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.21-ck1.orig/Documentation/sysctl/vm.txt	2007-05-14 09:24:29.000000000 +1000
+++ linux-2.6.21-ck1/Documentation/sysctl/vm.txt	2007-05-14 09:24:44.000000000 +1000
@@ -236,7 +236,8 @@ swap_prefetch
 This enables or disables the swap prefetching feature. When the virtual
 memory subsystem has been extremely idle for at least 5 seconds it will start
 copying back pages from swap into the swapcache and keep a copy in swap. In
-practice it can take many minutes before the vm is idle enough.
+practice it can take many minutes before the vm is idle enough. A value of 0
+disables swap prefetching, 1 enables it unless laptop_mode is enabled, and 2
+enables it even in the presence of laptop_mode.
 
 The default value is 1.
-
Index: linux-2.6.21-ck1/init/Kconfig
===================================================================
--- linux-2.6.21-ck1.orig/init/Kconfig	2007-05-14 09:24:27.000000000 +1000
+++ linux-2.6.21-ck1/init/Kconfig	2007-05-14 09:24:44.000000000 +1000
@@ -103,7 +103,7 @@ config SWAP
 
 config SWAP_PREFETCH
 	bool "Support for prefetching swapped memory"
-	depends on SWAP
+	depends on SWAP && !CPUSETS
 	default y
 	---help---
 	  This option will allow the kernel to prefetch swapped memory pages
