 Documentation/sysctl/vm.txt |   10 
 include/linux/mm_inline.h   |    7 
 include/linux/swap.h        |   50 +++-
 include/linux/sysctl.h      |    1 
 init/Kconfig                |    5 
 kernel/sysctl.c             |    4 
 mm/page_alloc.c             |   12 -
 mm/swap.c                   |   41 +++
 mm/swap_prefetch.c          |  472 ++++++++++++++++++++++++--------------------
 mm/swap_state.c             |    4 
 mm/vmscan.c                 |    6 
 11 files changed, 363 insertions(+), 249 deletions(-)

Index: linux-2.6.15-ck4/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.15-ck4.orig/Documentation/sysctl/vm.txt	2006-02-11 11:23:50.000000000 +1100
+++ linux-2.6.15-ck4/Documentation/sysctl/vm.txt	2006-02-12 02:20:31.000000000 +1100
@@ -108,9 +108,9 @@ a number of reserved free pages based pr
 
 swap_prefetch
 
-This is the amount of data prefetched per prefetching interval when
-swap prefetching is compiled in. The value means multiples of 128K,
-except when laptop_mode is enabled and then it is ten times larger.
-Setting it to 0 disables prefetching entirely.
+This enables (non-zero) or disables (0) the swap prefetching feature. When the
+virtual memory subsystem has been extremely idle for at least 5 seconds it will
+start copying pages back from swap into the swapcache while keeping a copy in
+swap. In practice it can take many minutes before the vm is idle enough.
 
-The default value is dependant on ramsize.
+The default value is 1.
Index: linux-2.6.15-ck4/include/linux/swap.h
===================================================================
--- linux-2.6.15-ck4.orig/include/linux/swap.h	2006-02-11 11:23:51.000000000 +1100
+++ linux-2.6.15-ck4/include/linux/swap.h	2006-02-12 02:23:15.000000000 +1100
@@ -7,6 +7,7 @@
 #include <linux/mmzone.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -164,6 +165,7 @@ extern unsigned int nr_free_pagecache_pa
 /* linux/mm/swap.c */
 extern void FASTCALL(lru_cache_add(struct page *));
 extern void FASTCALL(lru_cache_add_active(struct page *));
+extern void FASTCALL(lru_cache_add_tail(struct page *));
 extern void FASTCALL(activate_page(struct page *));
 extern void FASTCALL(mark_page_accessed(struct page *));
 extern void lru_add_drain(void);
@@ -185,32 +187,52 @@ extern int shmem_unuse(swp_entry_t entry
 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
 
 #ifdef CONFIG_SWAP_PREFETCH
-/* only used by prefetch externally */
-/*	mm/swap_prefetch.c */
-extern void prepare_prefetch(void);
-extern void add_to_swapped_list(unsigned long index);
-extern void remove_from_swapped_list(unsigned long index);
-extern void delay_prefetch(void);
-/* linux/mm/page_alloc.c */
-extern struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags);
-extern void zone_statistics(struct zonelist *zonelist, struct zone *z);
+/* mm/swap_prefetch.c */
 extern int swap_prefetch;
+struct swapped_entry {
+	swp_entry_t		swp_entry;	/* The actual swap entry */
+	struct list_head	swapped_list;	/* Linked list of entries */
+#if MAX_NUMNODES > 1
+	int			node;		/* Node id */
+#endif
+} __attribute__((packed));
+
+static inline void store_swap_entry_node(struct swapped_entry *entry,
+	struct page *page)
+{
+#if MAX_NUMNODES > 1
+	entry->node = page_to_nid(page);
+#endif
+}
+
+static inline int get_swap_entry_node(struct swapped_entry *entry)
+{
+#if MAX_NUMNODES > 1
+	return entry->node;
+#else
+	return 0;
+#endif
+}
+
+extern void add_to_swapped_list(struct page *page);
+extern void remove_from_swapped_list(const unsigned long index);
+extern void delay_swap_prefetch(void);
+extern void prepare_swap_prefetch(void);
 
 #else	/* CONFIG_SWAP_PREFETCH */
-static inline void add_to_swapped_list(unsigned long index)
+static inline void add_to_swapped_list(struct page *__unused)
 {
 }
 
-static inline void prepare_prefetch(void)
+static inline void prepare_swap_prefetch(void)
 {
 }
 
-static inline void remove_from_swapped_list(unsigned long index)
+static inline void remove_from_swapped_list(unsigned long __unused)
 {
 }
 
-static inline void delay_prefetch(void)
+static inline void delay_swap_prefetch(void)
 {
 }
 
Index: linux-2.6.15-ck4/include/linux/sysctl.h
===================================================================
--- linux-2.6.15-ck4.orig/include/linux/sysctl.h	2006-02-11 11:23:51.000000000 +1100
+++ linux-2.6.15-ck4/include/linux/sysctl.h	2006-02-12 02:23:42.000000000 +1100
@@ -185,6 +185,7 @@ enum
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
 	VM_SWAP_PREFETCH=29,	/* int: amount to swap prefetch */
 	VM_HARDMAPLIMIT=30,	/* Make mapped a hard limit */
+	VM_SWAP_PREFETCH=31,	/* swap prefetch. FIXME: redefines =29 */
 };
 
 
Index: linux-2.6.15-ck4/init/Kconfig
===================================================================
--- linux-2.6.15-ck4.orig/init/Kconfig	2006-02-11 11:23:50.000000000 +1100
+++ linux-2.6.15-ck4/init/Kconfig	2006-02-12 02:20:31.000000000 +1100
@@ -120,9 +120,10 @@ config SWAP_PREFETCH
 	  computer after leaving it idle for a while, applications will come
 	  to life faster. Note that your swap usage will appear to increase
 	  but these are cached pages, can be dropped freely by the vm, and it
-	  should stabilise around 50% swap usage.
+	  should stabilise at a maximum of around 50% swap usage.
 
-	  Desktop users will most likely want to say Y.
+	  Workstations and multiuser workstation servers will most likely want
+	  to say Y.
 
 config SYSVIPC
 	bool "System V IPC"
Index: linux-2.6.15-ck4/kernel/sysctl.c
===================================================================
--- linux-2.6.15-ck4.orig/kernel/sysctl.c	2006-02-11 11:23:51.000000000 +1100
+++ linux-2.6.15-ck4/kernel/sysctl.c	2006-02-12 02:20:31.000000000 +1100
@@ -882,6 +882,7 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec_jiffies,
 		.strategy	= &sysctl_jiffies,
 	},
+#endif
 #ifdef CONFIG_SWAP_PREFETCH
 	{
 		.ctl_name	= VM_SWAP_PREFETCH,
@@ -890,11 +891,8 @@ static ctl_table vm_table[] = {
 		.maxlen		= sizeof(swap_prefetch),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
 	},
 #endif
-#endif
 	{ .ctl_name = 0 }
 };
 
Index: linux-2.6.15-ck4/mm/page_alloc.c
===================================================================
--- linux-2.6.15-ck4.orig/mm/page_alloc.c	2006-02-11 11:23:51.000000000 +1100
+++ linux-2.6.15-ck4/mm/page_alloc.c	2006-02-12 02:24:30.000000000 +1100
@@ -727,7 +727,7 @@ static inline void prep_zero_page(struct
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-struct page *
+static struct page *
 buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
@@ -797,7 +797,7 @@ int zone_watermark_ok(struct zone *z, in
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-		goto out_failed;
+		return 0;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
 		free_pages -= z->free_area[o].nr_free << o;
@@ -806,15 +806,9 @@ int zone_watermark_ok(struct zone *z, in
 		min >>= 1;
 
 		if (free_pages <= min)
-			goto out_failed;
+			return 0;
 	}
-
 	return 1;
-out_failed:
-	/* Swap prefetching is delayed if any watermark is low */
-	delay_prefetch();
-
-	return 0;
 }
 
 /*
Index: linux-2.6.15-ck4/mm/swap.c
===================================================================
--- linux-2.6.15-ck4.orig/mm/swap.c	2006-02-11 11:23:50.000000000 +1100
+++ linux-2.6.15-ck4/mm/swap.c	2006-02-12 02:20:31.000000000 +1100
@@ -338,6 +338,45 @@ void __pagevec_lru_add_active(struct pag
 	pagevec_reinit(pvec);
 }
 
+static inline void __pagevec_lru_add_tail(struct pagevec *pvec)
+{
+	int i;
+	struct zone *zone = NULL;
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+		struct zone *pagezone = page_zone(page);
+
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock_irq(&zone->lru_lock);
+		}
+		if (TestSetPageLRU(page))
+			BUG();
+		add_page_to_inactive_list_tail(zone, page);
+	}
+	if (zone)
+		spin_unlock_irq(&zone->lru_lock);
+	release_pages(pvec->pages, pvec->nr, pvec->cold);
+	pagevec_reinit(pvec);
+}
+
+/*
+ * Puts a page back on the lru at the tail of the inactive list. Currently
+ * only used by swap prefetch.
+ */
+void fastcall lru_cache_add_tail(struct page *page)
+{
+	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+
+	page_cache_get(page);
+	if (!pagevec_add(pvec, page))
+		__pagevec_lru_add_tail(pvec);
+	put_cpu_var(lru_add_pvecs);
+}
+
 /*
  * Try to drop buffers from the pages in a pagevec
  */
@@ -480,7 +519,7 @@ void __init swap_setup(void)
 	 * _really_ don't want to cluster much more
 	 */
 
-	prepare_prefetch();
+	prepare_swap_prefetch();
 
 	hotcpu_notifier(cpu_swap_callback, 0);
 }
Index: linux-2.6.15-ck4/mm/swap_prefetch.c
===================================================================
--- linux-2.6.15-ck4.orig/mm/swap_prefetch.c	2006-02-11 11:23:50.000000000 +1100
+++ linux-2.6.15-ck4/mm/swap_prefetch.c	2006-02-12 02:25:06.000000000 +1100
@@ -1,7 +1,7 @@
 /*
  * linux/mm/swap_prefetch.c
  *
- * Copyright (C) 2005 Con Kolivas
+ * Copyright (C) 2005-2006 Con Kolivas
  *
  * Written by Con Kolivas <kernel@kolivas.org>
  *
@@ -18,13 +18,15 @@
 #include <linux/syscalls.h>
 #include <linux/writeback.h>
 
-/* Time to delay prefetching if vm is busy or prefetching unsuccessful */
+/*
+ * Time to delay prefetching if vm is busy or prefetching unsuccessful. There
+ * needs to be at least this duration of idle time, meaning in practice it can
+ * be much longer.
+ */
 #define PREFETCH_DELAY	(HZ * 5)
-/* Time between attempting prefetching when vm is idle */
-#define PREFETCH_INTERVAL (HZ)
 
-/* sysctl - how many SWAP_CLUSTER_MAX pages to prefetch at a time */
-int swap_prefetch __read_mostly;
+/* sysctl - enable/disable swap prefetching */
+int swap_prefetch __read_mostly = 1;
 
 struct swapped_root {
 	unsigned long		busy;		/* vm busy */
@@ -33,73 +35,51 @@ struct swapped_root {
 	struct radix_tree_root	swap_tree;	/* Lookup tree of pages */
 	unsigned int		count;		/* Number of entries */
 	unsigned int		maxcount;	/* Maximum entries allowed */
-	kmem_cache_t		*cache;
-};
-
-struct swapped_entry {
-	swp_entry_t		swp_entry;
-	struct list_head	swapped_list;
+	kmem_cache_t		*cache;		/* Of struct swapped_entry */
 };
 
 static struct swapped_root swapped = {
-	.busy 		= 0,
+	.busy 		= 0,			/* Any vm activity */
 	.lock		= SPIN_LOCK_UNLOCKED,
 	.list  		= LIST_HEAD_INIT(swapped.list),
 	.swap_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
-	.count 		= 0,
+	.count 		= 0,			/* Number of swapped entries */
 };
 
 static task_t *kprefetchd_task;
 
-/* Max mapped we will prefetch to */
-static unsigned long mapped_limit __read_mostly;
-/* Last total free pages */
-static unsigned long last_free = 0;
-static unsigned long temp_free = 0;
-
-/*
- * Create kmem cache for swapped entries
- */
-void __init prepare_prefetch(void)
-{
-	long mem = nr_free_pagecache_pages();
-
-	swapped.cache = kmem_cache_create("swapped_entry",
-		sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL);
-
-	/* Set max number of entries to size of physical ram */
-	swapped.maxcount = mem;
-	/* Set maximum amount of mapped pages to prefetch to 2/3 ram */
-	mapped_limit = mem / 3 * 2;
-
-	/* Set initial swap_prefetch value according to memory size */
-	swap_prefetch = mem / 10000 ? : 1;
-}
-
 /*
  * We check to see no part of the vm is busy. If it is this will interrupt
  * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy.
  */
-inline void delay_prefetch(void)
+inline void delay_swap_prefetch(void)
 {
-	__set_bit(0, &swapped.busy);
+	if (!test_bit(0, &swapped.busy))
+		__set_bit(0, &swapped.busy);
 }
 
 /*
- * Accounting is sloppy on purpose. As adding and removing entries from the
- * list happens during swapping in and out we don't want to be spinning on
- * locks. It is cheaper to just miss adding an entry since having a reference
- * to every entry is not critical.
+ * Drop behind accounting which keeps a list of the most recently used swap
+ * entries.
  */
-void add_to_swapped_list(unsigned long index)
+void add_to_swapped_list(struct page *page)
 {
 	struct swapped_entry *entry;
-	int error;
+	unsigned long index;
+	int wakeup;
 
-	if (unlikely(!spin_trylock(&swapped.lock)))
-		goto out;
+	if (!swap_prefetch)
+		return;
 
+	wakeup = 0;
+
+	spin_lock(&swapped.lock);
 	if (swapped.count >= swapped.maxcount) {
+		/*
+		 * We limit the number of entries to 2/3 of physical ram.
+		 * Once the number of entries exceeds this we start removing
+		 * the least recently used entries.
+		 */
 		entry = list_entry(swapped.list.next,
 			struct swapped_entry, swapped_list);
 		radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val);
@@ -112,42 +92,49 @@ void add_to_swapped_list(unsigned long i
 			goto out_locked;
 	}
 
+	index = page_private(page);
 	entry->swp_entry.val = index;
+	/*
+	 * On numa we need to store the node id to ensure that we prefetch to
+	 * the same node it came from.
+	 */
+	store_swap_entry_node(entry, page);
 
-	error = radix_tree_preload(GFP_ATOMIC);
-	if (likely(!error)) {
-		error = radix_tree_insert(&swapped.swap_tree, index, entry);
-		if (likely(!error)) {
-			/*
-			 * If this is the first entry, kprefetchd needs to be
-			 * (re)started
-			 */
-			if (list_empty(&swapped.list))
-				wake_up_process(kprefetchd_task);
-			list_add(&entry->swapped_list, &swapped.list);
-			swapped.count++;
-		}
-		radix_tree_preload_end();
-	} else
-		kmem_cache_free(swapped.cache, entry);
+	if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) {
+		/*
+		 * If this is the first entry, kprefetchd needs to be
+		 * (re)started.
+		 */
+		if (list_empty(&swapped.list))
+			wakeup = 1;
+		list_add(&entry->swapped_list, &swapped.list);
+		swapped.count++;
+	}
 
 out_locked:
 	spin_unlock(&swapped.lock);
-out:
+
+	/* Do the wakeup outside the lock to shorten lock hold time. */
+	if (wakeup)
+		wake_up_process(kprefetchd_task);
+
 	return;
 }
 
 /*
- * Cheaper to not spin on the lock and remove the entry lazily via
- * add_to_swap_cache when we hit it in trickle_swap_cache_async
+ * Removes entries from the swapped_list. The radix tree allows us to quickly
+ * look up the entry from the index without having to iterate over the whole
+ * list.
  */
-void remove_from_swapped_list(unsigned long index)
+void remove_from_swapped_list(const unsigned long index)
 {
 	struct swapped_entry *entry;
 	unsigned long flags;
 
-	if (unlikely(!spin_trylock_irqsave(&swapped.lock, flags)))
+	if (list_empty(&swapped.list))
 		return;
+
+	spin_lock_irqsave(&swapped.lock, flags);
 	entry = radix_tree_delete(&swapped.swap_tree, index);
 	if (likely(entry)) {
 		list_del_init(&entry->swapped_list);
@@ -157,59 +144,6 @@ void remove_from_swapped_list(unsigned l
 	spin_unlock_irqrestore(&swapped.lock, flags);
 }
 
-static inline int high_zone(struct zone *zone)
-{
-	if (zone == NULL)
-		return 0;
-	return is_highmem(zone);
-}
-
-/*
- * Find the zone with the most free pages, recheck the watermarks and
- * then directly allocate the ram. We don't want prefetch to use
- * __alloc_pages and go calling on reclaim.
- */
-static struct page *prefetch_get_page(void)
-{
-	struct zone *zone = NULL, *z;
-	struct page *page = NULL;
-	long most_free = 0;
-
-	for_each_zone(z) {
-		long free;
-
-		if (z->present_pages == 0)
-			continue;
-
-		/* We don't prefetch into DMA */
-		if (zone_idx(z) == ZONE_DMA)
-			continue;
-
-		free = z->free_pages;
-		/* Select the zone with the most free ram preferring high */
-		if ((free > most_free && (!high_zone(zone) || high_zone(z))) ||
-			(!high_zone(zone) && high_zone(z))) {
-				most_free = free;
-				zone = z;
-		}
-	}
-
-	if (zone == NULL)
-		goto out;
-
-	page = buffered_rmqueue(zone, 0, GFP_HIGHUSER);
-	if (likely(page)) {
-		struct zonelist *zonelist;
-
-		zonelist = NODE_DATA(numa_node_id())->node_zonelists +
-			(GFP_HIGHUSER & GFP_ZONEMASK);
-
-		zone_statistics(zonelist, zone);
-	}
-out:
-	return page;
-}
-
 enum trickle_return {
 	TRICKLE_SUCCESS,
 	TRICKLE_FAILED,
@@ -217,43 +151,68 @@ enum trickle_return {
 };
 
 /*
+ * prefetch_stats stores the free ram data of each node and this is used to
+ * determine if a node is suitable for prefetching into.
+ */
+struct prefetch_stats{
+	/* Free ram after a cycle of prefetching */
+	unsigned long	last_free[MAX_NUMNODES];
+	/* Free ram on this cycle of checking prefetch_suitable */
+	unsigned long	current_free[MAX_NUMNODES];
+	/* Maximum amount we will prefetch to */
+	unsigned long	prefetch_watermark[MAX_NUMNODES];
+	/* Which nodes are currently suited to prefetching */
+	nodemask_t	prefetch_nodes;
+	/* Total pages we've prefetched on this wakeup of kprefetchd */
+	unsigned long	prefetched_pages;
+};
+
+static struct prefetch_stats sp_stat;
+
+/*
  * This tries to read a swp_entry_t into swap cache for swap prefetching.
  * If it returns TRICKLE_DELAY we should delay further prefetching.
  */
-static enum trickle_return trickle_swap_cache_async(swp_entry_t entry)
+static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry,
+	const int node)
 {
 	enum trickle_return ret = TRICKLE_FAILED;
-	struct page *page = NULL;
+	struct page *page;
 
-	if (unlikely(!read_trylock(&swapper_space.tree_lock))) {
-		ret = TRICKLE_DELAY;
-		goto out;
-	}
+	read_lock_irq(&swapper_space.tree_lock);
 	/* Entry may already exist */
 	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
-	read_unlock(&swapper_space.tree_lock);
+	read_unlock_irq(&swapper_space.tree_lock);
 	if (page) {
 		remove_from_swapped_list(entry.val);
 		goto out;
 	}
 
-	/* Get a new page to read from swap */
-	page = prefetch_get_page();
+	/*
+	 * Get a new page to read from swap. We have already checked the
+	 * watermarks so __alloc_pages will not call on reclaim.
+	 */
+	page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0);
 	if (unlikely(!page)) {
 		ret = TRICKLE_DELAY;
 		goto out;
 	}
 
-	if (add_to_swap_cache(page, entry))
+	if (add_to_swap_cache(page, entry)) {
 		/* Failed to add to swap cache */
 		goto out_release;
+	}
 
-	lru_cache_add(page);
+	/* Add them to the tail of the inactive list to preserve LRU order */
+	lru_cache_add_tail(page);
 	if (unlikely(swap_readpage(NULL, page))) {
 		ret = TRICKLE_DELAY;
 		goto out_release;
 	}
 
+	sp_stat.prefetched_pages++;
+	sp_stat.last_free[node]--;
+
 	ret = TRICKLE_SUCCESS;
 out_release:
 	page_cache_release(page);
@@ -261,14 +220,26 @@ out:
 	return ret;
 }
 
-/*
- * How many pages to prefetch at a time. We prefetch SWAP_CLUSTER_MAX *
- * swap_prefetch per PREFETCH_INTERVAL, but prefetch ten times as much at a
- * time in laptop_mode to minimise the time we keep the disk spinning.
- */
-static inline unsigned long prefetch_pages(void)
+static void clear_last_prefetch_free(void)
 {
-	return (SWAP_CLUSTER_MAX * swap_prefetch * (1 + 9 * !!laptop_mode));
+	int node;
+
+	/*
+	 * Reset the nodes suitable for prefetching to all nodes. We could
+	 * update the data to take into account memory hotplug if desired.
+	 */
+	sp_stat.prefetch_nodes = node_online_map;
+	for_each_node_mask(node, sp_stat.prefetch_nodes)
+		sp_stat.last_free[node] = 0;
+}
+
+static void clear_current_prefetch_free(void)
+{
+	int node;
+
+	sp_stat.prefetch_nodes = node_online_map;
+	for_each_node_mask(node, sp_stat.prefetch_nodes)
+		sp_stat.current_free[node] = 0;
 }
 
 /*
@@ -279,13 +250,14 @@ static int prefetch_suitable(void)
 	struct page_state ps;
 	unsigned long limit;
 	struct zone *z;
-	int ret = 0;
+	int node, ret = 0;
 
 	/* Purposefully racy and might return false positive which is ok */
 	if (__test_and_clear_bit(0, &swapped.busy))
 		goto out;
 
-	temp_free = 0;
+	clear_current_prefetch_free();
+
 	/*
 	 * Have some hysteresis between where page reclaiming and prefetching
 	 * will occur to prevent ping-ponging between them.
@@ -293,40 +265,72 @@ static int prefetch_suitable(void)
 	for_each_zone(z) {
 		unsigned long free;
 
-		if (z->present_pages == 0)
+		if (!populated_zone(z))
 			continue;
+		node = z->zone_pgdat->node_id;
+
 		free = z->free_pages;
-		if (z->pages_high * 3 > free)
-			goto out;
-		temp_free += free;
+		if (z->pages_high * 3 + z->lowmem_reserve[zone_idx(z)] > free) {
+			node_clear(node, sp_stat.prefetch_nodes);
+			continue;
+		}
+		sp_stat.current_free[node] += free;
 	}
 
 	/*
-	 * We check to see that pages are not being allocated elsewhere
-	 * at any significant rate implying any degree of memory pressure
-	 * (eg during file reads)
+	 * We iterate over each node testing to see if it is suitable for
+	 * prefetching and clear the nodemask if it is not.
 	 */
-	if (last_free) {
-		if (temp_free + SWAP_CLUSTER_MAX < last_free) {
-			last_free = temp_free;
-			goto out;
-		}
-	} else
-		last_free = temp_free;
+	for_each_node_mask(node, sp_stat.prefetch_nodes) {
+		/*
+		 * We check to see that pages are not being allocated
+		 * elsewhere at any significant rate implying any
+		 * degree of memory pressure (eg during file reads)
+		 */
+		if (sp_stat.last_free[node]) {
+			if (sp_stat.current_free[node] + SWAP_CLUSTER_MAX <
+				sp_stat.last_free[node]) {
+					sp_stat.last_free[node] =
+						sp_stat.current_free[node];
+					node_clear(node,
+						sp_stat.prefetch_nodes);
+					continue;
+			}
+		} else
+			sp_stat.last_free[node] = sp_stat.current_free[node];
 
-	get_page_state(&ps);
+		/*
+		 * get_page_state is super expensive so we only perform it
+		 * every SWAP_CLUSTER_MAX prefetched_pages
+		 */
+		if (sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)
+			continue;
 
-	/* We shouldn't prefetch when we are doing writeback */
-	if (ps.nr_writeback)
-		goto out;
+		get_page_state_node(&ps, node);
 
-	/*
-	 * >2/3 of the ram is mapped, swapcache or dirty, we need some free
-	 * for pagecache
-	 */
-	limit = ps.nr_mapped + ps.nr_slab + ps.nr_dirty + ps.nr_unstable +
-		total_swapcache_pages;
-	if (limit > mapped_limit)
+		/* We shouldn't prefetch when we are doing writeback */
+		if (ps.nr_writeback) {
+			node_clear(node, sp_stat.prefetch_nodes);
+			continue;
+		}
+
+		/*
+		 * >2/3 of the ram on this node is mapped, slab, swapcache or
+		 * dirty, we need to leave some free for pagecache.
+		 * Note that currently nr_slab is inaccurate on numa because
+		 * nr_slab is incremented on the node doing the accounting
+		 * even if the slab is being allocated on a remote node. This
+		 * would be expensive to fix and not of great significance.
+		 */
+		limit = ps.nr_mapped + ps.nr_slab + ps.nr_dirty +
+			ps.nr_unstable + total_swapcache_pages;
+		if (limit > sp_stat.prefetch_watermark[node]) {
+			node_clear(node, sp_stat.prefetch_nodes);
+			continue;
+		}
+	}
+
+	if (nodes_empty(sp_stat.prefetch_nodes))
 		goto out;
 
 	/* Survived all that? Hooray we can prefetch! */
@@ -336,51 +340,83 @@ out:
 }
 
 /*
+ * Get next swapped entry when iterating; swapped.lock should be held and the
+ * entry must exist. NOTE(review): this steps ->next->next; confirm intended.
+ */
+static inline struct swapped_entry *next_swapped_entry
+	(struct swapped_entry *entry)
+{
+	return list_entry(entry->swapped_list.next->next,
+		struct swapped_entry, swapped_list);
+}
+
+/*
  * trickle_swap is the main function that initiates the swap prefetching. It
  * first checks to see if the busy flag is set, and does not prefetch if it
  * is, as the flag implied we are low on memory or swapping in currently.
- * Otherwise it runs till prefetch_pages() are prefetched.
+ * Otherwise it runs until prefetch_suitable fails which occurs when the
+ * vm is busy, we prefetch to the watermark, or the list is empty.
  */
 static enum trickle_return trickle_swap(void)
 {
 	enum trickle_return ret = TRICKLE_DELAY;
 	struct swapped_entry *entry;
-	int pages = 0;
 
-	while (pages < prefetch_pages()) {
-		enum trickle_return got_page;
+	if (!swap_prefetch)
+		return ret;
+
+	entry = NULL;
+
+	for ( ; ; ) {
+		swp_entry_t swp_entry;
+		int node;
 
 		if (!prefetch_suitable())
-			goto out;
-		/* Lock is held? We must be busy elsewhere */
-		if (unlikely(!spin_trylock(&swapped.lock)))
-			goto out;
+			break;
+
+		spin_lock(&swapped.lock);
 		if (list_empty(&swapped.list)) {
-			spin_unlock(&swapped.lock);
 			ret = TRICKLE_FAILED;
-			goto out;
+			spin_unlock(&swapped.lock);
+			break;
 		}
-		entry = list_entry(swapped.list.next,
-			struct swapped_entry, swapped_list);
-		spin_unlock(&swapped.lock);
 
-		got_page = trickle_swap_cache_async(entry->swp_entry);
-		switch (got_page) {
-		case TRICKLE_FAILED:
-			break;
-		case TRICKLE_SUCCESS:
-			last_free--;
-			pages++;
+		if (!entry) {
+			/*
+			 * This sets the entry for the first iteration. It
+			 * also is a safeguard against the entry disappearing
+			 * while the lock is not held.
+			 */
+			entry = list_entry(swapped.list.next,
+				struct swapped_entry, swapped_list);
+		} else if (entry->swapped_list.next == swapped.list.next) {
+			/* Have we iterated over all entries? */
+			spin_unlock(&swapped.lock);
 			break;
-		case TRICKLE_DELAY:
-			goto out;
 		}
+
+		node = get_swap_entry_node(entry);
+		if (!node_isset(node, sp_stat.prefetch_nodes)) {
+			/*
+			 * We found an entry that belongs to a node that is
+			 * not suitable for prefetching so skip it.
+			 */
+			entry = next_swapped_entry(entry);
+			spin_unlock(&swapped.lock);
+			continue;
+		}
+		swp_entry = entry->swp_entry;
+		entry = next_swapped_entry(entry);
+		spin_unlock(&swapped.lock);
+
+		if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY)
+			break;
 	}
-	ret = TRICKLE_SUCCESS;
 
-out:
-	if (pages)
+	if (sp_stat.prefetched_pages) {
 		lru_add_drain();
+		sp_stat.prefetched_pages = 0;
+	}
 	return ret;
 }
 
@@ -391,35 +427,51 @@ static int kprefetchd(void *__unused)
 	sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE);
 
 	do {
-		enum trickle_return prefetched;
-
 		try_to_freeze();
 
 		/*
 		 * TRICKLE_FAILED implies no entries left - we do not schedule
 		 * a wakeup, and further delay the next one.
 		 */
-		prefetched = trickle_swap();
-		switch (prefetched) {
-		case TRICKLE_SUCCESS:
-			last_free = temp_free;
-			schedule_timeout_interruptible(PREFETCH_INTERVAL);
-			break;
-		case TRICKLE_DELAY:
-			last_free = 0;
-			schedule_timeout_interruptible(PREFETCH_DELAY);
-			break;
-		case TRICKLE_FAILED:
-			last_free = 0;
-			schedule_timeout_interruptible(MAX_SCHEDULE_TIMEOUT);
-			schedule_timeout_interruptible(PREFETCH_DELAY);
-			break;
+		if (trickle_swap() == TRICKLE_FAILED) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
 		}
+		clear_last_prefetch_free();
+		schedule_timeout_interruptible(PREFETCH_DELAY);
 	} while (!kthread_should_stop());
 
 	return 0;
 }
 
+/*
+ * Create the kmem cache for swapped entries and set the prefetch limits
+ */
+void __init prepare_swap_prefetch(void)
+{
+	pg_data_t *pgdat;
+	int node;
+
+	swapped.cache = kmem_cache_create("swapped_entry",
+		sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL);
+
+	/*
+	 * Set max number of entries to 2/3 the size of physical ram, as we
+	 * only ever prefetch to consume 2/3 of the ram.
+	 */
+	swapped.maxcount = nr_free_pagecache_pages() / 3 * 2;
+
+	for_each_pgdat(pgdat) {
+		unsigned long present;
+
+		present = pgdat->node_present_pages;
+		if (!present)
+			continue;
+		node = pgdat->node_id;
+		sp_stat.prefetch_watermark[node] += present / 3 * 2;
+	}
+}
+
 static int __init kprefetchd_init(void)
 {
 	kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd");
Index: linux-2.6.15-ck4/mm/swap_state.c
===================================================================
--- linux-2.6.15-ck4.orig/mm/swap_state.c	2006-02-11 11:23:50.000000000 +1100
+++ linux-2.6.15-ck4/mm/swap_state.c	2006-02-12 02:20:31.000000000 +1100
@@ -148,7 +148,7 @@ int add_to_swap(struct page * page)
 	int err;
 
 	/* Swap prefetching is delayed if we're swapping pages */
-	delay_prefetch();
+	delay_swap_prefetch();
 
 	if (!PageLocked(page))
 		BUG();
@@ -325,7 +325,7 @@ struct page *read_swap_cache_async(swp_e
 	int err;
 
 	/* Swap prefetching is delayed if we're already reading from swap */
-	delay_prefetch();
+	delay_swap_prefetch();
 
 	do {
 		/*
Index: linux-2.6.15-ck4/mm/vmscan.c
===================================================================
--- linux-2.6.15-ck4.orig/mm/vmscan.c	2006-02-11 11:23:51.000000000 +1100
+++ linux-2.6.15-ck4/mm/vmscan.c	2006-02-12 02:25:44.000000000 +1100
@@ -536,7 +536,7 @@ static int shrink_list(struct list_head 
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
 			swp_entry_t swap = { .val = page_private(page) };
-			add_to_swapped_list(swap.val);
+			add_to_swapped_list(page);
 			__delete_from_swap_cache(page);
 			write_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);
@@ -994,7 +994,7 @@ int try_to_free_pages(struct zone **zone
 	if (p)
 		scan_priority = sc_priority(p);
 
-	delay_prefetch();
+	delay_swap_prefetch();
 
 	sc.gfp_mask = gfp_mask;
 	sc.may_writepage = 0;
@@ -1371,7 +1371,7 @@ int shrink_all_memory(int nr_pages)
 		.reclaimed_slab = 0,
 	};
 
-	delay_prefetch();
+	delay_swap_prefetch();
 
 	current->reclaim_state = &reclaim_state;
 	for_each_pgdat(pgdat) {
Index: linux-2.6.15-ck4/include/linux/mm_inline.h
===================================================================
--- linux-2.6.15-ck4.orig/include/linux/mm_inline.h	2004-03-11 21:29:27.000000000 +1100
+++ linux-2.6.15-ck4/include/linux/mm_inline.h	2006-02-12 02:20:31.000000000 +1100
@@ -14,6 +14,13 @@ add_page_to_inactive_list(struct zone *z
 }
 
 static inline void
+add_page_to_inactive_list_tail(struct zone *zone, struct page *page)
+{
+	list_add_tail(&page->lru, &zone->inactive_list);
+	zone->nr_inactive++;
+}
+
+static inline void
 del_page_from_active_list(struct zone *zone, struct page *page)
 {
 	list_del(&page->lru);
