Documentation/sysctl/vm.txt | 10 include/linux/mm_inline.h | 7 include/linux/swap.h | 50 +++- include/linux/sysctl.h | 1 init/Kconfig | 5 kernel/sysctl.c | 4 mm/page_alloc.c | 12 - mm/swap.c | 41 +++ mm/swap_prefetch.c | 472 ++++++++++++++++++++++++-------------------- mm/swap_state.c | 4 mm/vmscan.c | 6 11 files changed, 363 insertions(+), 249 deletions(-) Index: linux-2.6.15-ck4/Documentation/sysctl/vm.txt =================================================================== --- linux-2.6.15-ck4.orig/Documentation/sysctl/vm.txt 2006-02-11 11:23:50.000000000 +1100 +++ linux-2.6.15-ck4/Documentation/sysctl/vm.txt 2006-02-12 02:20:31.000000000 +1100 @@ -108,9 +108,9 @@ a number of reserved free pages based pr swap_prefetch -This is the amount of data prefetched per prefetching interval when -swap prefetching is compiled in. The value means multiples of 128K, -except when laptop_mode is enabled and then it is ten times larger. -Setting it to 0 disables prefetching entirely. +This enables or disables the swap prefetching feature. When the virtual +memory subsystem has been extremely idle for at least 5 seconds it will start +copying back pages from swap into the swapcache and keep a copy in swap. In +practice it can take many minutes before the vm is idle enough. -The default value is dependant on ramsize. +The default value is 1. 
Index: linux-2.6.15-ck4/include/linux/swap.h =================================================================== --- linux-2.6.15-ck4.orig/include/linux/swap.h 2006-02-11 11:23:51.000000000 +1100 +++ linux-2.6.15-ck4/include/linux/swap.h 2006-02-12 02:23:15.000000000 +1100 @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -164,6 +165,7 @@ extern unsigned int nr_free_pagecache_pa /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); extern void FASTCALL(lru_cache_add_active(struct page *)); +extern void FASTCALL(lru_cache_add_tail(struct page *)); extern void FASTCALL(activate_page(struct page *)); extern void FASTCALL(mark_page_accessed(struct page *)); extern void lru_add_drain(void); @@ -185,32 +187,52 @@ extern int shmem_unuse(swp_entry_t entry extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); #ifdef CONFIG_SWAP_PREFETCH -/* only used by prefetch externally */ -/* mm/swap_prefetch.c */ -extern void prepare_prefetch(void); -extern void add_to_swapped_list(unsigned long index); -extern void remove_from_swapped_list(unsigned long index); -extern void delay_prefetch(void); -/* linux/mm/page_alloc.c */ -extern struct page * -buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags); -extern void zone_statistics(struct zonelist *zonelist, struct zone *z); +/* mm/swap_prefetch.c */ extern int swap_prefetch; +struct swapped_entry { + swp_entry_t swp_entry; /* The actual swap entry */ + struct list_head swapped_list; /* Linked list of entries */ +#if MAX_NUMNODES > 1 + int node; /* Node id */ +#endif +} __attribute__((packed)); + +static inline void store_swap_entry_node(struct swapped_entry *entry, + struct page *page) +{ +#if MAX_NUMNODES > 1 + entry->node = page_to_nid(page); +#endif +} + +static inline int get_swap_entry_node(struct swapped_entry *entry) +{ +#if MAX_NUMNODES > 1 + return entry->node; +#else + return 0; +#endif +} + +extern void add_to_swapped_list(struct page *page); +extern 
void remove_from_swapped_list(const unsigned long index); +extern void delay_swap_prefetch(void); +extern void prepare_swap_prefetch(void); #else /* CONFIG_SWAP_PREFETCH */ -static inline void add_to_swapped_list(unsigned long index) +static inline void add_to_swapped_list(struct page *__unused) { } -static inline void prepare_prefetch(void) +static inline void prepare_swap_prefetch(void) { } -static inline void remove_from_swapped_list(unsigned long index) +static inline void remove_from_swapped_list(unsigned long __unused) { } -static inline void delay_prefetch(void) +static inline void delay_swap_prefetch(void) { } Index: linux-2.6.15-ck4/include/linux/sysctl.h =================================================================== --- linux-2.6.15-ck4.orig/include/linux/sysctl.h 2006-02-11 11:23:51.000000000 +1100 +++ linux-2.6.15-ck4/include/linux/sysctl.h 2006-02-12 02:23:42.000000000 +1100 @@ -185,6 +185,7 @@ enum VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ VM_SWAP_PREFETCH=29, /* int: amount to swap prefetch */ VM_HARDMAPLIMIT=30, /* Make mapped a hard limit */ + VM_SWAP_PREFETCH=31, /* swap prefetch */ }; Index: linux-2.6.15-ck4/init/Kconfig =================================================================== --- linux-2.6.15-ck4.orig/init/Kconfig 2006-02-11 11:23:50.000000000 +1100 +++ linux-2.6.15-ck4/init/Kconfig 2006-02-12 02:20:31.000000000 +1100 @@ -120,9 +120,10 @@ config SWAP_PREFETCH computer after leaving it idle for a while, applications will come to life faster. Note that your swap usage will appear to increase but these are cached pages, can be dropped freely by the vm, and it - should stabilise around 50% swap usage. + should stabilise around 50% swap usage maximum. - Desktop users will most likely want to say Y. + Workstations and multiuser workstation servers will most likely want + to say Y. 
config SYSVIPC bool "System V IPC" Index: linux-2.6.15-ck4/kernel/sysctl.c =================================================================== --- linux-2.6.15-ck4.orig/kernel/sysctl.c 2006-02-11 11:23:51.000000000 +1100 +++ linux-2.6.15-ck4/kernel/sysctl.c 2006-02-12 02:20:31.000000000 +1100 @@ -882,6 +882,7 @@ static ctl_table vm_table[] = { .proc_handler = &proc_dointvec_jiffies, .strategy = &sysctl_jiffies, }, +#endif #ifdef CONFIG_SWAP_PREFETCH { .ctl_name = VM_SWAP_PREFETCH, @@ -890,11 +891,8 @@ static ctl_table vm_table[] = { .maxlen = sizeof(swap_prefetch), .mode = 0644, .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, - .extra1 = &zero, }, #endif -#endif { .ctl_name = 0 } }; Index: linux-2.6.15-ck4/mm/page_alloc.c =================================================================== --- linux-2.6.15-ck4.orig/mm/page_alloc.c 2006-02-11 11:23:51.000000000 +1100 +++ linux-2.6.15-ck4/mm/page_alloc.c 2006-02-12 02:24:30.000000000 +1100 @@ -727,7 +727,7 @@ static inline void prep_zero_page(struct * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. 
*/ -struct page * +static struct page * buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; @@ -797,7 +797,7 @@ int zone_watermark_ok(struct zone *z, in min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) - goto out_failed; + return 0; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ free_pages -= z->free_area[o].nr_free << o; @@ -806,15 +806,9 @@ int zone_watermark_ok(struct zone *z, in min >>= 1; if (free_pages <= min) - goto out_failed; + return 0; } - return 1; -out_failed: - /* Swap prefetching is delayed if any watermark is low */ - delay_prefetch(); - - return 0; } /* Index: linux-2.6.15-ck4/mm/swap.c =================================================================== --- linux-2.6.15-ck4.orig/mm/swap.c 2006-02-11 11:23:50.000000000 +1100 +++ linux-2.6.15-ck4/mm/swap.c 2006-02-12 02:20:31.000000000 +1100 @@ -338,6 +338,45 @@ void __pagevec_lru_add_active(struct pag pagevec_reinit(pvec); } +static inline void __pagevec_lru_add_tail(struct pagevec *pvec) +{ + int i; + struct zone *zone = NULL; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + if (TestSetPageLRU(page)) + BUG(); + add_page_to_inactive_list_tail(zone, page); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + +/* + * Function used uniquely to put pages back to the lru at the end of the + * inactive list currently only used by swap prefetch. 
+ */ +void fastcall lru_cache_add_tail(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + __pagevec_lru_add_tail(pvec); + put_cpu_var(lru_add_pvecs); +} + /* * Try to drop buffers from the pages in a pagevec */ @@ -480,7 +519,7 @@ void __init swap_setup(void) * _really_ don't want to cluster much more */ - prepare_prefetch(); + prepare_swap_prefetch(); hotcpu_notifier(cpu_swap_callback, 0); } Index: linux-2.6.15-ck4/mm/swap_prefetch.c =================================================================== --- linux-2.6.15-ck4.orig/mm/swap_prefetch.c 2006-02-11 11:23:50.000000000 +1100 +++ linux-2.6.15-ck4/mm/swap_prefetch.c 2006-02-12 02:25:06.000000000 +1100 @@ -1,7 +1,7 @@ /* * linux/mm/swap_prefetch.c * - * Copyright (C) 2005 Con Kolivas + * Copyright (C) 2005-2006 Con Kolivas * * Written by Con Kolivas * @@ -18,13 +18,15 @@ #include #include -/* Time to delay prefetching if vm is busy or prefetching unsuccessful */ +/* + * Time to delay prefetching if vm is busy or prefetching unsuccessful. 
There + * needs to be at least this duration of idle time meaning in practice it can + * be much longer + */ #define PREFETCH_DELAY (HZ * 5) -/* Time between attempting prefetching when vm is idle */ -#define PREFETCH_INTERVAL (HZ) -/* sysctl - how many SWAP_CLUSTER_MAX pages to prefetch at a time */ -int swap_prefetch __read_mostly; +/* sysctl - enable/disable swap prefetching */ +int swap_prefetch __read_mostly = 1; struct swapped_root { unsigned long busy; /* vm busy */ @@ -33,73 +35,51 @@ struct swapped_root { struct radix_tree_root swap_tree; /* Lookup tree of pages */ unsigned int count; /* Number of entries */ unsigned int maxcount; /* Maximum entries allowed */ - kmem_cache_t *cache; -}; - -struct swapped_entry { - swp_entry_t swp_entry; - struct list_head swapped_list; + kmem_cache_t *cache; /* Of struct swapped_entry */ }; static struct swapped_root swapped = { - .busy = 0, + .busy = 0, /* Any vm activity */ .lock = SPIN_LOCK_UNLOCKED, .list = LIST_HEAD_INIT(swapped.list), .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), - .count = 0, + .count = 0, /* Number of swapped entries */ }; static task_t *kprefetchd_task; -/* Max mapped we will prefetch to */ -static unsigned long mapped_limit __read_mostly; -/* Last total free pages */ -static unsigned long last_free = 0; -static unsigned long temp_free = 0; - -/* - * Create kmem cache for swapped entries - */ -void __init prepare_prefetch(void) -{ - long mem = nr_free_pagecache_pages(); - - swapped.cache = kmem_cache_create("swapped_entry", - sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); - - /* Set max number of entries to size of physical ram */ - swapped.maxcount = mem; - /* Set maximum amount of mapped pages to prefetch to 2/3 ram */ - mapped_limit = mem / 3 * 2; - - /* Set initial swap_prefetch value according to memory size */ - swap_prefetch = mem / 10000 ? : 1; -} - /* * We check to see no part of the vm is busy. If it is this will interrupt * trickle_swap and wait another PREFETCH_DELAY. 
Purposefully racy. */ -inline void delay_prefetch(void) +inline void delay_swap_prefetch(void) { - __set_bit(0, &swapped.busy); + if (!test_bit(0, &swapped.busy)) + __set_bit(0, &swapped.busy); } /* - * Accounting is sloppy on purpose. As adding and removing entries from the - * list happens during swapping in and out we don't want to be spinning on - * locks. It is cheaper to just miss adding an entry since having a reference - * to every entry is not critical. + * Drop behind accounting which keeps a list of the most recently used swap + * entries. */ -void add_to_swapped_list(unsigned long index) +void add_to_swapped_list(struct page *page) { struct swapped_entry *entry; - int error; + unsigned long index; + int wakeup; - if (unlikely(!spin_trylock(&swapped.lock))) - goto out; + if (!swap_prefetch) + return; + wakeup = 0; + + spin_lock(&swapped.lock); if (swapped.count >= swapped.maxcount) { + /* + * We limit the number of entries to 2/3 of physical ram. + * Once the number of entries exceeds this we start removing + * the least recently used entries. + */ entry = list_entry(swapped.list.next, struct swapped_entry, swapped_list); radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); @@ -112,42 +92,49 @@ void add_to_swapped_list(unsigned long i goto out_locked; } + index = page_private(page); entry->swp_entry.val = index; + /* + * On numa we need to store the node id to ensure that we prefetch to + * the same node it came from. 
+ */ + store_swap_entry_node(entry, page); - error = radix_tree_preload(GFP_ATOMIC); - if (likely(!error)) { - error = radix_tree_insert(&swapped.swap_tree, index, entry); - if (likely(!error)) { - /* - * If this is the first entry, kprefetchd needs to be - * (re)started - */ - if (list_empty(&swapped.list)) - wake_up_process(kprefetchd_task); - list_add(&entry->swapped_list, &swapped.list); - swapped.count++; - } - radix_tree_preload_end(); - } else - kmem_cache_free(swapped.cache, entry); + if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { + /* + * If this is the first entry, kprefetchd needs to be + * (re)started. + */ + if (list_empty(&swapped.list)) + wakeup = 1; + list_add(&entry->swapped_list, &swapped.list); + swapped.count++; + } out_locked: spin_unlock(&swapped.lock); -out: + + /* Do the wakeup outside the lock to shorten lock hold time. */ + if (wakeup) + wake_up_process(kprefetchd_task); + return; } /* - * Cheaper to not spin on the lock and remove the entry lazily via - * add_to_swap_cache when we hit it in trickle_swap_cache_async + * Removes entries from the swapped_list. The radix tree allows us to quickly + * look up the entry from the index without having to iterate over the whole + * list. 
*/ -void remove_from_swapped_list(unsigned long index) +void remove_from_swapped_list(const unsigned long index) { struct swapped_entry *entry; unsigned long flags; - if (unlikely(!spin_trylock_irqsave(&swapped.lock, flags))) + if (list_empty(&swapped.list)) return; + + spin_lock_irqsave(&swapped.lock, flags); entry = radix_tree_delete(&swapped.swap_tree, index); if (likely(entry)) { list_del_init(&entry->swapped_list); @@ -157,59 +144,6 @@ void remove_from_swapped_list(unsigned l spin_unlock_irqrestore(&swapped.lock, flags); } -static inline int high_zone(struct zone *zone) -{ - if (zone == NULL) - return 0; - return is_highmem(zone); -} - -/* - * Find the zone with the most free pages, recheck the watermarks and - * then directly allocate the ram. We don't want prefetch to use - * __alloc_pages and go calling on reclaim. - */ -static struct page *prefetch_get_page(void) -{ - struct zone *zone = NULL, *z; - struct page *page = NULL; - long most_free = 0; - - for_each_zone(z) { - long free; - - if (z->present_pages == 0) - continue; - - /* We don't prefetch into DMA */ - if (zone_idx(z) == ZONE_DMA) - continue; - - free = z->free_pages; - /* Select the zone with the most free ram preferring high */ - if ((free > most_free && (!high_zone(zone) || high_zone(z))) || - (!high_zone(zone) && high_zone(z))) { - most_free = free; - zone = z; - } - } - - if (zone == NULL) - goto out; - - page = buffered_rmqueue(zone, 0, GFP_HIGHUSER); - if (likely(page)) { - struct zonelist *zonelist; - - zonelist = NODE_DATA(numa_node_id())->node_zonelists + - (GFP_HIGHUSER & GFP_ZONEMASK); - - zone_statistics(zonelist, zone); - } -out: - return page; -} - enum trickle_return { TRICKLE_SUCCESS, TRICKLE_FAILED, @@ -217,43 +151,68 @@ enum trickle_return { }; /* + * prefetch_stats stores the free ram data of each node and this is used to + * determine if a node is suitable for prefetching into. 
+ */ +struct prefetch_stats{ + unsigned long last_free[MAX_NUMNODES]; + /* Free ram after a cycle of prefetching */ + unsigned long current_free[MAX_NUMNODES]; + /* Free ram on this cycle of checking prefetch_suitable */ + unsigned long prefetch_watermark[MAX_NUMNODES]; + /* Maximum amount we will prefetch to */ + nodemask_t prefetch_nodes; + /* Which nodes are currently suited to prefetching */ + unsigned long prefetched_pages; + /* Total pages we've prefetched on this wakeup of kprefetchd */ +}; + +static struct prefetch_stats sp_stat; + +/* * This tries to read a swp_entry_t into swap cache for swap prefetching. * If it returns TRICKLE_DELAY we should delay further prefetching. */ -static enum trickle_return trickle_swap_cache_async(swp_entry_t entry) +static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, + const int node) { enum trickle_return ret = TRICKLE_FAILED; - struct page *page = NULL; + struct page *page; - if (unlikely(!read_trylock(&swapper_space.tree_lock))) { - ret = TRICKLE_DELAY; - goto out; - } + read_lock_irq(&swapper_space.tree_lock); /* Entry may already exist */ page = radix_tree_lookup(&swapper_space.page_tree, entry.val); - read_unlock(&swapper_space.tree_lock); + read_unlock_irq(&swapper_space.tree_lock); if (page) { remove_from_swapped_list(entry.val); goto out; } - /* Get a new page to read from swap */ - page = prefetch_get_page(); + /* + * Get a new page to read from swap. We have already checked the + * watermarks so __alloc_pages will not call on reclaim. 
+ */ + page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); if (unlikely(!page)) { ret = TRICKLE_DELAY; goto out; } - if (add_to_swap_cache(page, entry)) + if (add_to_swap_cache(page, entry)) { /* Failed to add to swap cache */ goto out_release; + } - lru_cache_add(page); + /* Add them to the tail of the inactive list to preserve LRU order */ + lru_cache_add_tail(page); if (unlikely(swap_readpage(NULL, page))) { ret = TRICKLE_DELAY; goto out_release; } + sp_stat.prefetched_pages++; + sp_stat.last_free[node]--; + ret = TRICKLE_SUCCESS; out_release: page_cache_release(page); @@ -261,14 +220,26 @@ out: return ret; } -/* - * How many pages to prefetch at a time. We prefetch SWAP_CLUSTER_MAX * - * swap_prefetch per PREFETCH_INTERVAL, but prefetch ten times as much at a - * time in laptop_mode to minimise the time we keep the disk spinning. - */ -static inline unsigned long prefetch_pages(void) +static void clear_last_prefetch_free(void) { - return (SWAP_CLUSTER_MAX * swap_prefetch * (1 + 9 * !!laptop_mode)); + int node; + + /* + * Reset the nodes suitable for prefetching to all nodes. We could + * update the data to take into account memory hotplug if desired. + */ + sp_stat.prefetch_nodes = node_online_map; + for_each_node_mask(node, sp_stat.prefetch_nodes) + sp_stat.last_free[node] = 0; +} + +static void clear_current_prefetch_free(void) +{ + int node; + + sp_stat.prefetch_nodes = node_online_map; + for_each_node_mask(node, sp_stat.prefetch_nodes) + sp_stat.current_free[node] = 0; } /* @@ -279,13 +250,14 @@ static int prefetch_suitable(void) struct page_state ps; unsigned long limit; struct zone *z; - int ret = 0; + int node, ret = 0; /* Purposefully racy and might return false positive which is ok */ if (__test_and_clear_bit(0, &swapped.busy)) goto out; - temp_free = 0; + clear_current_prefetch_free(); + /* * Have some hysteresis between where page reclaiming and prefetching * will occur to prevent ping-ponging between them. 
@@ -293,40 +265,72 @@ static int prefetch_suitable(void) for_each_zone(z) { unsigned long free; - if (z->present_pages == 0) + if (!populated_zone(z)) continue; + node = z->zone_pgdat->node_id; + free = z->free_pages; - if (z->pages_high * 3 > free) - goto out; - temp_free += free; + if (z->pages_high * 3 + z->lowmem_reserve[zone_idx(z)] > free) { + node_clear(node, sp_stat.prefetch_nodes); + continue; + } + sp_stat.current_free[node] += free; } /* - * We check to see that pages are not being allocated elsewhere - * at any significant rate implying any degree of memory pressure - * (eg during file reads) + * We iterate over each node testing to see if it is suitable for + * prefetching and clear the nodemask if it is not. */ - if (last_free) { - if (temp_free + SWAP_CLUSTER_MAX < last_free) { - last_free = temp_free; - goto out; - } - } else - last_free = temp_free; + for_each_node_mask(node, sp_stat.prefetch_nodes) { + /* + * We check to see that pages are not being allocated + * elsewhere at any significant rate implying any + * degree of memory pressure (eg during file reads) + */ + if (sp_stat.last_free[node]) { + if (sp_stat.current_free[node] + SWAP_CLUSTER_MAX < + sp_stat.last_free[node]) { + sp_stat.last_free[node] = + sp_stat.current_free[node]; + node_clear(node, + sp_stat.prefetch_nodes); + continue; + } + } else + sp_stat.last_free[node] = sp_stat.current_free[node]; - get_page_state(&ps); + /* + * get_page_state is super expensive so we only perform it + * every SWAP_CLUSTER_MAX prefetched_pages + */ + if (sp_stat.prefetched_pages % SWAP_CLUSTER_MAX) + continue; - /* We shouldn't prefetch when we are doing writeback */ - if (ps.nr_writeback) - goto out; + get_page_state_node(&ps, node); - /* - * >2/3 of the ram is mapped, swapcache or dirty, we need some free - * for pagecache - */ - limit = ps.nr_mapped + ps.nr_slab + ps.nr_dirty + ps.nr_unstable + - total_swapcache_pages; - if (limit > mapped_limit) + /* We shouldn't prefetch when we are doing 
writeback */ + if (ps.nr_writeback) { + node_clear(node, sp_stat.prefetch_nodes); + continue; + } + + /* + * >2/3 of the ram on this node is mapped, slab, swapcache or + * dirty, we need to leave some free for pagecache. + * Note that currently nr_slab is inaccurate on numa because + * nr_slab is incremented on the node doing the accounting + * even if the slab is being allocated on a remote node. This + * would be expensive to fix and not of great significance. + */ + limit = ps.nr_mapped + ps.nr_slab + ps.nr_dirty + + ps.nr_unstable + total_swapcache_pages; + if (limit > sp_stat.prefetch_watermark[node]) { + node_clear(node, sp_stat.prefetch_nodes); + continue; + } + } + + if (nodes_empty(sp_stat.prefetch_nodes)) goto out; /* Survived all that? Hooray we can prefetch! */ @@ -336,51 +340,83 @@ out: } /* + * Get next swapped entry when iterating over all entries. swapped.lock + * should be held and we should already ensure that entry exists. + */ +static inline struct swapped_entry *next_swapped_entry + (struct swapped_entry *entry) +{ + return list_entry(entry->swapped_list.next->next, + struct swapped_entry, swapped_list); +} + +/* * trickle_swap is the main function that initiates the swap prefetching. It * first checks to see if the busy flag is set, and does not prefetch if it * is, as the flag implied we are low on memory or swapping in currently. - * Otherwise it runs till prefetch_pages() are prefetched. + * Otherwise it runs until prefetch_suitable fails which occurs when the + * vm is busy, we prefetch to the watermark, or the list is empty. */ static enum trickle_return trickle_swap(void) { enum trickle_return ret = TRICKLE_DELAY; struct swapped_entry *entry; - int pages = 0; - while (pages < prefetch_pages()) { - enum trickle_return got_page; + if (!swap_prefetch) + return ret; + + entry = NULL; + + for ( ; ; ) { + swp_entry_t swp_entry; + int node; if (!prefetch_suitable()) - goto out; - /* Lock is held? 
We must be busy elsewhere */ - if (unlikely(!spin_trylock(&swapped.lock))) - goto out; + break; + + spin_lock(&swapped.lock); if (list_empty(&swapped.list)) { - spin_unlock(&swapped.lock); ret = TRICKLE_FAILED; - goto out; + spin_unlock(&swapped.lock); + break; } - entry = list_entry(swapped.list.next, - struct swapped_entry, swapped_list); - spin_unlock(&swapped.lock); - got_page = trickle_swap_cache_async(entry->swp_entry); - switch (got_page) { - case TRICKLE_FAILED: - break; - case TRICKLE_SUCCESS: - last_free--; - pages++; + if (!entry) { + /* + * This sets the entry for the first iteration. It + * also is a safeguard against the entry disappearing + * while the lock is not held. + */ + entry = list_entry(swapped.list.next, + struct swapped_entry, swapped_list); + } else if (entry->swapped_list.next == swapped.list.next) { + /* Have we iterated over all entries? */ + spin_unlock(&swapped.lock); break; - case TRICKLE_DELAY: - goto out; } + + node = get_swap_entry_node(entry); + if (!node_isset(node, sp_stat.prefetch_nodes)) { + /* + * We found an entry that belongs to a node that is + * not suitable for prefetching so skip it. + */ + entry = next_swapped_entry(entry); + spin_unlock(&swapped.lock); + continue; + } + swp_entry = entry->swp_entry; + entry = next_swapped_entry(entry); + spin_unlock(&swapped.lock); + + if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) + break; } - ret = TRICKLE_SUCCESS; -out: - if (pages) + if (sp_stat.prefetched_pages) { lru_add_drain(); + sp_stat.prefetched_pages = 0; + } return ret; } @@ -391,35 +427,51 @@ static int kprefetchd(void *__unused) sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE); do { - enum trickle_return prefetched; - try_to_freeze(); /* * TRICKLE_FAILED implies no entries left - we do not schedule * a wakeup, and further delay the next one. 
*/ - prefetched = trickle_swap(); - switch (prefetched) { - case TRICKLE_SUCCESS: - last_free = temp_free; - schedule_timeout_interruptible(PREFETCH_INTERVAL); - break; - case TRICKLE_DELAY: - last_free = 0; - schedule_timeout_interruptible(PREFETCH_DELAY); - break; - case TRICKLE_FAILED: - last_free = 0; - schedule_timeout_interruptible(MAX_SCHEDULE_TIMEOUT); - schedule_timeout_interruptible(PREFETCH_DELAY); - break; + if (trickle_swap() == TRICKLE_FAILED) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); } + clear_last_prefetch_free(); + schedule_timeout_interruptible(PREFETCH_DELAY); } while (!kthread_should_stop()); return 0; } +/* + * Create kmem cache for swapped entries + */ +void __init prepare_swap_prefetch(void) +{ + pg_data_t *pgdat; + int node; + + swapped.cache = kmem_cache_create("swapped_entry", + sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); + + /* + * Set max number of entries to 2/3 the size of physical ram as we + * only ever prefetch to consume 2/3 of the ram. 
+ */ + swapped.maxcount = nr_free_pagecache_pages() / 3 * 2; + + for_each_pgdat(pgdat) { + unsigned long present; + + present = pgdat->node_present_pages; + if (!present) + continue; + node = pgdat->node_id; + sp_stat.prefetch_watermark[node] += present / 3 * 2; + } +} + static int __init kprefetchd_init(void) { kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); Index: linux-2.6.15-ck4/mm/swap_state.c =================================================================== --- linux-2.6.15-ck4.orig/mm/swap_state.c 2006-02-11 11:23:50.000000000 +1100 +++ linux-2.6.15-ck4/mm/swap_state.c 2006-02-12 02:20:31.000000000 +1100 @@ -148,7 +148,7 @@ int add_to_swap(struct page * page) int err; /* Swap prefetching is delayed if we're swapping pages */ - delay_prefetch(); + delay_swap_prefetch(); if (!PageLocked(page)) BUG(); @@ -325,7 +325,7 @@ struct page *read_swap_cache_async(swp_e int err; /* Swap prefetching is delayed if we're already reading from swap */ - delay_prefetch(); + delay_swap_prefetch(); do { /* Index: linux-2.6.15-ck4/mm/vmscan.c =================================================================== --- linux-2.6.15-ck4.orig/mm/vmscan.c 2006-02-11 11:23:51.000000000 +1100 +++ linux-2.6.15-ck4/mm/vmscan.c 2006-02-12 02:25:44.000000000 +1100 @@ -536,7 +536,7 @@ static int shrink_list(struct list_head #ifdef CONFIG_SWAP if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; - add_to_swapped_list(swap.val); + add_to_swapped_list(page); __delete_from_swap_cache(page); write_unlock_irq(&mapping->tree_lock); swap_free(swap); @@ -994,7 +994,7 @@ int try_to_free_pages(struct zone **zone if (p) scan_priority = sc_priority(p); - delay_prefetch(); + delay_swap_prefetch(); sc.gfp_mask = gfp_mask; sc.may_writepage = 0; @@ -1371,7 +1371,7 @@ int shrink_all_memory(int nr_pages) .reclaimed_slab = 0, }; - delay_prefetch(); + delay_swap_prefetch(); current->reclaim_state = &reclaim_state; for_each_pgdat(pgdat) { Index: 
linux-2.6.15-ck4/include/linux/mm_inline.h =================================================================== --- linux-2.6.15-ck4.orig/include/linux/mm_inline.h 2004-03-11 21:29:27.000000000 +1100 +++ linux-2.6.15-ck4/include/linux/mm_inline.h 2006-02-12 02:20:31.000000000 +1100 @@ -14,6 +14,13 @@ add_page_to_inactive_list(struct zone *z } static inline void +add_page_to_inactive_list_tail(struct zone *zone, struct page *page) +{ + list_add_tail(&page->lru, &zone->inactive_list); + zone->nr_inactive++; +} + +static inline void del_page_from_active_list(struct zone *zone, struct page *page) { list_del(&page->lru);