Index: linux-2.6.14-rc2-ck1/mm/swap_prefetch.c
===================================================================
--- linux-2.6.14-rc2-ck1.orig/mm/swap_prefetch.c	2005-09-23 00:22:02.000000000 +1000
+++ linux-2.6.14-rc2-ck1/mm/swap_prefetch.c	2005-09-23 16:17:35.000000000 +1000
@@ -20,8 +20,7 @@
 #define PREFETCH_INTERVAL (HZ)
 
 struct swapped_root_t {
-	int 			busy;
-	spinlock_t		busylock;
+	unsigned long		busy;
 	spinlock_t		lock;
 	struct list_head	list;
 	unsigned int		count;
@@ -64,7 +63,7 @@ void prepare_prefetch(void)
 	mapped_limit = total_memory / 3 * 2;
 
 	spin_lock_init(&swapped_root.lock);
-	spin_lock_init(&swapped_root.busylock);
+	swapped_root.busy = 0;
 }
 
 static inline void delay_prefetch_timer(void)
@@ -79,17 +78,19 @@ static inline void reset_prefetch_timer(
 
 /*
  * We check to see no part of the vm is busy. If it is this will interrupt
- * trickle_swap and wait another PREFETCH_DELAY
+ * trickle_swap and wait another PREFETCH_DELAY. Racy on purpose: no locking.
  */
 void delay_prefetch(void)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&swapped_root.busylock, flags);
-	swapped_root.busy = 1;
-	spin_unlock_irqrestore(&swapped_root.busylock, flags);
+	__set_bit(0, &swapped_root.busy);
 }
 
+/*
+ * Accounting is sloppy on purpose. Entries are added to and removed from the
+ * list while swapping in and out, so we don't want to be spinning on locks
+ * there. It is cheaper to just miss adding an entry since having a reference
+ * to every entry is not critical.
+ */
 void add_to_swapped_list(unsigned long index)
 {
 	struct swapped_entry_t *entry;
@@ -97,12 +98,6 @@ void add_to_swapped_list(unsigned long i
 	unsigned long flags;
 	int error;
 
-	/*
-	 * It is not critical to add every entry to the swapped list and
-	 * since we're adding to the swapped list when we're swapping
-	 * out it is not a good time to be spinning to acquire the lock so
-	 * just don't add this entry to the list.
-	 */
 	if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags)))
 		goto out;
 
@@ -144,13 +139,18 @@ out:
 	return;
 }
 
+/*
+ * Cheaper to not spin on the lock; a missed entry is removed lazily via
+ * add_to_swap_cache when we hit it in trickle_swap_cache_async.
+ */
 void remove_from_swapped_list(unsigned long index)
 {
 	struct address_space *mapping = &swapper_space;
 	struct swapped_entry_t *entry;
 	unsigned long flags;
 
-	spin_lock_irqsave(&swapped_root.lock, flags);
+	if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags)))
+		return;
 	entry = radix_tree_delete(&mapping->swap_tree, index);
 	if (entry) {
 		list_del_init(&entry->swapped_list);
@@ -161,27 +161,56 @@ void remove_from_swapped_list(unsigned l
 }
 
 /*
- * This is a very lightweight function to get a page to prefetch into. The
- * watermarks should already have been checked prior to this and we don't
- * want to start reclaiming so we shouldn't do this in __alloc_pages.
+ * Recheck that all populated zones are above 3 * pages_high, then allocate
+ * directly from the unstressed non-DMA zone with the most free pages. Returns
+ * NULL on failure. We avoid __alloc_pages as prefetch must not start reclaim.
  */
 static struct page * prefetch_get_page(void)
 {
-	struct zone *z;
+	struct zone *zone = NULL, *z;
 	struct page *page = NULL;
-	struct zonelist *zonelist;
-
-	zonelist = NODE_DATA(numa_node_id())->node_zonelists +
-		(GFP_HIGHUSER & GFP_ZONEMASK);
+	long most_free = 0;
 
 	for_each_zone(z) {
-		if (zone_idx(z) == ZONE_DMA || z->present_pages == 0)
+		long free;
+
+		if (z->present_pages == 0)
+			continue;
+
+		free = z->free_pages;
+
+		/* Check yet again we are above watermarks, by now likely */
+		if (unlikely(free < z->pages_high * 3))
+			goto out;
+
+		/* We don't prefetch into DMA */
+		if (zone_idx(z) == ZONE_DMA)
+			continue;
+
+		/* Reasonably stressed zone, bypass it */
+		if (z->prev_priority < DEF_PRIORITY / 2)
 			continue;
-		page = buffered_rmqueue(z, 0, GFP_HIGHUSER);
-		if (page)
-			zone_statistics(zonelist, z);
-		break;
+
+		/* Select the zone with the most free ram */
+		if (free > most_free) {
+			most_free = free;
+			zone = z;
+		}
 	}
+
+	if (zone == NULL)
+		goto out;
+
+	page = buffered_rmqueue(zone, 0, GFP_HIGHUSER);
+	if (likely(page)) {
+		struct zonelist *zonelist;
+
+		zonelist = NODE_DATA(numa_node_id())->node_zonelists +
+			(GFP_HIGHUSER & GFP_ZONEMASK);
+
+		zone_statistics(zonelist, zone);
+	}
+out:
 	return page;
 }
 
@@ -194,7 +223,7 @@ static int trickle_swap_cache_async(swp_
 	struct address_space *mapping = &swapper_space;
 	unsigned long flags;
 
-	/* Entry may already exist, check it as cheaply as possible */
+	/* Entry may already exist */
 	local_irq_save(flags);
 	if (unlikely(!read_trylock(&mapping->tree_lock))) {
 		local_irq_restore(flags);
@@ -227,41 +256,17 @@ out:
 	return 0;
 }
 
-static int test_clear_busy(void)
-{
-	int ret;
-
-	/* Lock is held? We must be busy */
-	if (unlikely(!spin_trylock(&swapped_root.busylock))) {
-		ret = 1;
-		goto out;
-	}
-	ret = swapped_root.busy;
-	swapped_root.busy = 0;
-	spin_unlock(&swapped_root.busylock);
-out:
-	return ret;
-}
-
 /*
  * We want to be absolutely certain it's ok to start prefetching.
  */
 static int prefetch_suitable(void)
 {
+	struct page_state ps;
 	unsigned long pending_writes, limit;
 	struct zone *z;
 
-	if (test_clear_busy())
-		goto out_delay;
-
-	/* We shouldn't prefetch when we are doing writeback */
-	if (read_page_state(nr_writeback))
-		goto out_delay;
-
-	/* Delay prefetching if we have significant amounts of dirty data */
-	pending_writes = read_page_state(nr_dirty) +
-		read_page_state(nr_unstable);
-	if (pending_writes > SWAP_CLUSTER_MAX)
+	/* Purposefully racy and might return false positive which is ok */
+	if (__test_and_clear_bit(0, &swapped_root.busy))
 		goto out_delay;
 
 	/*
@@ -275,8 +280,19 @@ static int prefetch_suitable(void)
 			goto out_delay;
 	}
 
+	get_page_state(&ps);
+
+	/* We shouldn't prefetch when we are doing writeback */
+	if (ps.nr_writeback)
+		goto out_delay;
+
+	/* Delay prefetching if we have significant amounts of dirty data */
+	pending_writes = ps.nr_dirty + ps.nr_unstable;
+	if (pending_writes > SWAP_CLUSTER_MAX)
+		goto out_delay;
+
-	/* >2/3 of the ram is mapped, we need some free for pagecache */
+	/* >2/3 of ram is mapped, slab or dirty; keep some for pagecache */
-	limit = read_page_state(nr_mapped);
+	limit = ps.nr_mapped + ps.nr_slab + pending_writes;
 	if (limit > mapped_limit)
 		goto out_delay;
 
