Index: linux-2.6.21-sp/include/linux/sched.h =================================================================== --- linux-2.6.21-sp.orig/include/linux/sched.h 2007-05-01 14:52:06.000000000 +1000 +++ linux-2.6.21-sp/include/linux/sched.h 2007-05-14 10:57:50.000000000 +1000 @@ -740,6 +740,22 @@ extern unsigned int max_cache_size; #endif /* CONFIG_SMP */ +/* + * A runqueue laden with a single nice 0 task scores a weighted_cpuload of + * SCHED_LOAD_SCALE. This function returns 1 if any online cpu is laden with + * a task of nice 0, or enough lower priority tasks to bring its + * weighted_cpuload up to SCHED_LOAD_SCALE, and 0 otherwise. + */ +static inline int above_background_load(void) +{ + unsigned long cpu; + + for_each_online_cpu(cpu) { + if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) + return 1; + } + return 0; +} struct io_context; /* See blkdev.h */ struct cpuset; Index: linux-2.6.21-sp/Documentation/sysctl/vm.txt =================================================================== --- linux-2.6.21-sp.orig/Documentation/sysctl/vm.txt 2007-02-14 09:09:32.000000000 +1100 +++ linux-2.6.21-sp/Documentation/sysctl/vm.txt 2007-05-14 10:59:14.000000000 +1000 @@ -31,6 +31,7 @@ Currently, these files are in /proc/sys/ - min_unmapped_ratio - min_slab_ratio - panic_on_oom +- swap_prefetch ============================================================== @@ -205,3 +206,16 @@ rather than killing rogue processes, set The default value is 0. +============================================================== + +swap_prefetch + +This enables or disables the swap prefetching feature. When the virtual +memory subsystem has been extremely idle for at least 5 seconds it will start +copying back pages from swap into the swapcache and keep a copy in swap. In +practice it can take many minutes before the vm is idle enough. A value of 0 +disables swap prefetching, 1 enables it unless laptop_mode is enabled, and 2 +enables it even in the presence of laptop_mode. + +The default value is 1. 
+ Index: linux-2.6.21-sp/include/linux/swap.h =================================================================== --- linux-2.6.21-sp.orig/include/linux/swap.h 2007-05-01 14:52:06.000000000 +1000 +++ linux-2.6.21-sp/include/linux/swap.h 2007-05-14 10:58:04.000000000 +1000 @@ -180,6 +180,7 @@ extern unsigned int nr_free_pagecache_pa /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); extern void FASTCALL(lru_cache_add_active(struct page *)); +extern void FASTCALL(lru_cache_add_tail(struct page *)); extern void FASTCALL(activate_page(struct page *)); extern void FASTCALL(mark_page_accessed(struct page *)); extern void lru_add_drain(void); @@ -237,6 +238,7 @@ extern void free_pages_and_swap_cache(st extern struct page * lookup_swap_cache(swp_entry_t); extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, unsigned long addr); +extern int add_to_swap_cache(struct page *page, swp_entry_t entry); /* linux/mm/swapfile.c */ extern long total_swap_pages; extern unsigned int nr_swapfiles; Index: linux-2.6.21-sp/init/Kconfig =================================================================== --- linux-2.6.21-sp.orig/init/Kconfig 2007-05-01 14:52:06.000000000 +1000 +++ linux-2.6.21-sp/init/Kconfig 2007-05-14 10:58:04.000000000 +1000 @@ -101,6 +101,28 @@ config SWAP used to provide more virtual memory than the actual RAM present in your computer. If unsure say Y. +config SWAP_PREFETCH + bool "Support for prefetching swapped memory" + depends on SWAP + default y + ---help--- + This option will allow the kernel to prefetch swapped memory pages + when idle. The pages will be kept on both swap and in swap_cache + thus avoiding the need for further I/O if either ram or swap space + is required. + + What this will do on workstations is slowly bring back applications + that have swapped out after memory intensive workloads back into + physical ram if you have free ram at a later stage and the machine + is relatively idle. 
This means that when you come back to your + computer after leaving it idle for a while, applications will come + to life faster. Note that your swap usage will appear to increase + but these are cached pages, can be dropped freely by the vm, and it + should stabilise around 50% swap usage maximum. + + Workstations and multiuser workstation servers will most likely want + to say Y. + config SYSVIPC bool "System V IPC" ---help--- Index: linux-2.6.21-sp/kernel/sysctl.c =================================================================== --- linux-2.6.21-sp.orig/kernel/sysctl.c 2007-05-01 14:52:06.000000000 +1000 +++ linux-2.6.21-sp/kernel/sysctl.c 2007-05-14 10:58:04.000000000 +1000 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -859,6 +860,16 @@ static ctl_table vm_table[] = { .extra1 = &zero, }, #endif +#ifdef CONFIG_SWAP_PREFETCH + { + .ctl_name = CTL_UNNUMBERED, + .procname = "swap_prefetch", + .data = &swap_prefetch, + .maxlen = sizeof(swap_prefetch), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; Index: linux-2.6.21-sp/mm/Makefile =================================================================== --- linux-2.6.21-sp.orig/mm/Makefile 2006-11-30 08:57:37.000000000 +1100 +++ linux-2.6.21-sp/mm/Makefile 2007-05-14 10:58:04.000000000 +1000 @@ -17,6 +17,7 @@ ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) obj-y += bounce.o endif obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o Index: linux-2.6.21-sp/mm/swap.c =================================================================== --- linux-2.6.21-sp.orig/mm/swap.c 2007-02-14 09:09:44.000000000 +1100 +++ linux-2.6.21-sp/mm/swap.c 2007-05-14 10:58:04.000000000 +1000 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -176,6 +177,7 @@ 
EXPORT_SYMBOL(mark_page_accessed); */ static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; +static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, }; void fastcall lru_cache_add(struct page *page) { @@ -197,6 +199,31 @@ void fastcall lru_cache_add_active(struc put_cpu_var(lru_add_active_pvecs); } +static void __pagevec_lru_add_tail(struct pagevec *pvec) +{ + int i; + struct zone *zone = NULL; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + BUG_ON(PageLRU(page)); + SetPageLRU(page); + add_page_to_inactive_list_tail(zone, page); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + static void __lru_add_drain(int cpu) { struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); @@ -207,6 +234,9 @@ static void __lru_add_drain(int cpu) pvec = &per_cpu(lru_add_active_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add_active(pvec); + pvec = &per_cpu(lru_add_tail_pvecs, cpu); + if (pagevec_count(pvec)) + __pagevec_lru_add_tail(pvec); } void lru_add_drain(void) @@ -403,6 +433,21 @@ void __pagevec_lru_add_active(struct pag } /* + * Function used uniquely to put pages back to the lru at the end of the + * inactive list to preserve the lru order. Currently only used by swap + * prefetch. 
+ */ +void fastcall lru_cache_add_tail(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + __pagevec_lru_add_tail(pvec); + put_cpu_var(lru_add_tail_pvecs); +} + +/* * Try to drop buffers from the pages in a pagevec */ void pagevec_strip(struct pagevec *pvec) @@ -514,6 +559,9 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ + + prepare_swap_prefetch(); + #ifdef CONFIG_HOTPLUG_CPU hotcpu_notifier(cpu_swap_callback, 0); #endif Index: linux-2.6.21-sp/mm/swap_prefetch.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.21-sp/mm/swap_prefetch.c 2007-05-14 10:59:31.000000000 +1000 @@ -0,0 +1,562 @@ +/* + * linux/mm/swap_prefetch.c + * + * Copyright (C) 2005-2006 Con Kolivas + * + * Written by Con Kolivas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Time to delay prefetching if vm is busy or prefetching unsuccessful. 
There + * needs to be at least this duration of idle time meaning in practice it can + * be much longer + */ +#define PREFETCH_DELAY (HZ * 5) +#define DISABLED_PREFETCH_DELAY (HZ * 60) + +/* sysctl - enable/disable swap prefetching */ +int swap_prefetch __read_mostly = 1; + +struct swapped_root { + unsigned long busy; /* vm busy */ + spinlock_t lock; /* protects all data */ + struct list_head list; /* MRU list of swapped pages */ + struct radix_tree_root swap_tree; /* Lookup tree of pages */ + unsigned int count; /* Number of entries */ + unsigned int maxcount; /* Maximum entries allowed */ + struct kmem_cache *cache; /* Of struct swapped_entry */ +}; + +static struct swapped_root swapped = { + .lock = SPIN_LOCK_UNLOCKED, + .list = LIST_HEAD_INIT(swapped.list), + .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), +}; + +static struct task_struct *kprefetchd_task; + +/* + * We check to see no part of the vm is busy. If it is this will interrupt + * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. + */ +inline void delay_swap_prefetch(void) +{ + if (!test_bit(0, &swapped.busy)) + __set_bit(0, &swapped.busy); +} + +/* + * If laptop_mode is enabled don't prefetch to avoid hard drives + * doing unnecessary spin-ups unless swap_prefetch is explicitly + * set to a higher value. + */ +static inline int prefetch_enabled(void) +{ + if (swap_prefetch <= laptop_mode) + return 0; + return 1; +} + +static int wakeup_kprefetchd; + +/* + * Drop behind accounting which keeps a list of the most recently used swap + * entries. Entries are removed lazily by kprefetchd. + */ +void add_to_swapped_list(struct page *page) +{ + struct swapped_entry *entry; + unsigned long index, flags; + + if (!prefetch_enabled()) + goto out; + + spin_lock_irqsave(&swapped.lock, flags); + if (swapped.count >= swapped.maxcount) { + /* + * We limit the number of entries to 2/3 of physical ram. + * Once the number of entries exceeds this we start removing + * the least recently used entries. 
+ */ + entry = list_entry(swapped.list.next, + struct swapped_entry, swapped_list); + radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); + list_del(&entry->swapped_list); + swapped.count--; + } else { + entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); + if (unlikely(!entry)) + /* bad, can't allocate more mem */ + goto out_locked; + } + + index = page_private(page); + entry->swp_entry.val = index; + /* + * On numa we need to store the node id to ensure that we prefetch to + * the same node it came from. + */ + store_swap_entry_node(entry, page); + + if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { + list_add(&entry->swapped_list, &swapped.list); + swapped.count++; + } else + kmem_cache_free(swapped.cache, entry); + +out_locked: + spin_unlock_irqrestore(&swapped.lock, flags); +out: + if (wakeup_kprefetchd) + wake_up_process(kprefetchd_task); + return; +} + +/* + * Removes entries from the swapped_list. The radix tree allows us to quickly + * look up the entry from the index without having to iterate over the whole + * list. 
+ */ +void remove_from_swapped_list(const unsigned long index) +{ + struct swapped_entry *entry; + unsigned long flags; + + if (list_empty(&swapped.list)) + return; + + spin_lock_irqsave(&swapped.lock, flags); + entry = radix_tree_delete(&swapped.swap_tree, index); + if (likely(entry)) { + list_del(&entry->swapped_list); + swapped.count--; + kmem_cache_free(swapped.cache, entry); + } + spin_unlock_irqrestore(&swapped.lock, flags); +} + +enum trickle_return { + TRICKLE_SUCCESS, + TRICKLE_FAILED, + TRICKLE_DELAY, +}; + +struct node_stats { + /* Free ram after a cycle of prefetching */ + unsigned long last_free; + /* Free ram on this cycle of checking prefetch_suitable */ + unsigned long current_free; + /* Maximum amount we will prefetch to */ + unsigned long prefetch_watermark; + /* The amount of free ram before we start prefetching */ + unsigned long highfree[MAX_NR_ZONES]; + /* The amount of free ram where we will stop prefetching */ + unsigned long lowfree[MAX_NR_ZONES]; + /* highfree or lowfree depending on whether we've hit a watermark */ + unsigned long *pointfree[MAX_NR_ZONES]; +}; + +/* + * prefetch_stats stores the free ram data of each node and this is used to + * determine if a node is suitable for prefetching into. + */ +struct prefetch_stats { + /* Which nodes are currently suited to prefetching */ + nodemask_t prefetch_nodes; + /* Total pages we've prefetched on this wakeup of kprefetchd */ + unsigned long prefetched_pages; + struct node_stats node[MAX_NUMNODES]; +}; + +static struct prefetch_stats sp_stat; + +/* + * This tries to read a swp_entry_t into swap cache for swap prefetching. + * If it returns TRICKLE_DELAY we should delay further prefetching. 
+ */ +static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, + const int node) +{ + enum trickle_return ret = TRICKLE_FAILED; + unsigned long flags; + struct page *page; + + read_lock_irqsave(&swapper_space.tree_lock, flags); + /* Entry may already exist */ + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + read_unlock_irqrestore(&swapper_space.tree_lock, flags); + if (page) + goto out; + + /* + * Get a new page to read from swap. We have already checked the + * watermarks so __alloc_pages will not call on reclaim. + */ + page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); + if (unlikely(!page)) { + ret = TRICKLE_DELAY; + goto out; + } + + if (add_to_swap_cache(page, entry)) { + /* Failed to add to swap cache */ + goto out_release; + } + + /* Add them to the tail of the inactive list to preserve LRU order */ + lru_cache_add_tail(page); + if (unlikely(swap_readpage(NULL, page))) + goto out_release; + + sp_stat.prefetched_pages++; + sp_stat.node[node].last_free--; + + ret = TRICKLE_SUCCESS; +out_release: + page_cache_release(page); +out: + /* + * All entries are removed here lazily. This avoids the cost of + * remove_from_swapped_list during normal swapin. Thus there are + * usually many stale entries. + */ + remove_from_swapped_list(entry.val); + return ret; +} + +static void clear_last_prefetch_free(void) +{ + int node; + + /* + * Reset the nodes suitable for prefetching to all nodes. We could + * update the data to take into account memory hotplug if desired.. 
+ */ + sp_stat.prefetch_nodes = node_online_map; + for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + + ns->last_free = 0; + } +} + +static void clear_current_prefetch_free(void) +{ + int node; + + sp_stat.prefetch_nodes = node_online_map; + for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + + ns->current_free = 0; + } +} + +/* + * This updates the high and low watermarks of amount of free ram in each + * node used to start and stop prefetching. We prefetch from pages_high * 4 + * down to pages_high * 3. + */ +static void examine_free_limits(void) +{ + struct zone *z; + + for_each_zone(z) { + struct node_stats *ns; + int idx; + + if (!populated_zone(z)) + continue; + + ns = &sp_stat.node[z->zone_pgdat->node_id]; + idx = zone_idx(z); + ns->lowfree[idx] = z->pages_high * 3; + ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; + + if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) { + /* + * We've gotten above the high watermark of free pages + * so we can start prefetching till we get to the low + * watermark. + */ + ns->pointfree[idx] = &ns->lowfree[idx]; + } + } +} + +/* + * We want to be absolutely certain it's ok to start prefetching. + */ +static int prefetch_suitable(void) +{ + unsigned long limit; + struct zone *z; + int node, ret = 0, test_pagestate = 0; + + /* Purposefully racy */ + if (test_bit(0, &swapped.busy)) { + __clear_bit(0, &swapped.busy); + goto out; + } + + /* + * node_page_state and above_background_load are expensive so we only + * perform them every SWAP_CLUSTER_MAX prefetched_pages. + * We test to see if we're above_background_load as disk activity + * even at low priority can cause interrupt induced scheduling + * latencies. 
+ */ + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) { + if (above_background_load()) + goto out; + test_pagestate = 1; + } + + clear_current_prefetch_free(); + + /* + * Have some hysteresis between where page reclaiming and prefetching + * will occur to prevent ping-ponging between them. + */ + for_each_zone(z) { + struct node_stats *ns; + unsigned long free; + int idx; + + if (!populated_zone(z)) + continue; + + node = z->zone_pgdat->node_id; + ns = &sp_stat.node[node]; + idx = zone_idx(z); + + free = zone_page_state(z, NR_FREE_PAGES); + if (free < *ns->pointfree[idx]) { + /* + * Free pages have dropped below the low watermark so + * we won't start prefetching again till we hit the + * high watermark of free pages. + */ + ns->pointfree[idx] = &ns->highfree[idx]; + node_clear(node, sp_stat.prefetch_nodes); + continue; + } + ns->current_free += free; + } + + /* + * We iterate over each node testing to see if it is suitable for + * prefetching and clear the nodemask if it is not. + */ + for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + + /* + * We check to see that pages are not being allocated + * elsewhere at any significant rate implying any + * degree of memory pressure (eg during file reads) + */ + if (ns->last_free) { + if (ns->current_free + SWAP_CLUSTER_MAX < + ns->last_free) { + ns->last_free = ns->current_free; + node_clear(node, + sp_stat.prefetch_nodes); + continue; + } + } else + ns->last_free = ns->current_free; + + if (!test_pagestate) + continue; + + /* We shouldn't prefetch when we are doing writeback */ + if (node_page_state(node, NR_WRITEBACK)) { + node_clear(node, sp_stat.prefetch_nodes); + continue; + } + + /* + * >2/3 of the ram on this node is mapped, slab, swapcache or + * dirty, we need to leave some free for pagecache. 
+ */ + limit = node_page_state(node, NR_FILE_PAGES); + limit += node_page_state(node, NR_SLAB_RECLAIMABLE); + limit += node_page_state(node, NR_SLAB_UNRECLAIMABLE); + limit += node_page_state(node, NR_FILE_DIRTY); + limit += node_page_state(node, NR_UNSTABLE_NFS); + limit += total_swapcache_pages; + if (limit > ns->prefetch_watermark) { + node_clear(node, sp_stat.prefetch_nodes); + continue; + } + } + + if (nodes_empty(sp_stat.prefetch_nodes)) + goto out; + + /* Survived all that? Hooray we can prefetch! */ + ret = 1; +out: + return ret; +} + +/* + * trickle_swap is the main function that initiates the swap prefetching. It + * first checks to see if the busy flag is set, and does not prefetch if it + * is, as the flag implied we are low on memory or swapping in currently. + * Otherwise it runs until prefetch_suitable fails which occurs when the + * vm is busy, we prefetch to the watermark, or the list is empty or we have + * iterated over all entries + */ +static enum trickle_return trickle_swap(void) +{ + enum trickle_return ret = TRICKLE_DELAY; + struct swapped_entry *pos, *n; + unsigned long flags; + + if (!prefetch_enabled()) + return ret; + + examine_free_limits(); + if (!prefetch_suitable()) + return ret; + if (list_empty(&swapped.list)) + return TRICKLE_FAILED; + + spin_lock_irqsave(&swapped.lock, flags); + list_for_each_entry_safe_reverse(pos, n, &swapped.list, swapped_list) { + swp_entry_t swp_entry; + int node; + + spin_unlock_irqrestore(&swapped.lock, flags); + /* Yield to anything else running */ + if (cond_resched() || !prefetch_suitable()) + goto out_unlocked; + + spin_lock_irqsave(&swapped.lock, flags); + if (unlikely(!pos)) + continue; + node = get_swap_entry_node(pos); + if (!node_isset(node, sp_stat.prefetch_nodes)) { + /* + * We found an entry that belongs to a node that is + * not suitable for prefetching so skip it. 
+ */ + continue; + } + swp_entry = pos->swp_entry; + spin_unlock_irqrestore(&swapped.lock, flags); + + if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) + goto out_unlocked; + spin_lock_irqsave(&swapped.lock, flags); + } + spin_unlock_irqrestore(&swapped.lock, flags); + +out_unlocked: + if (sp_stat.prefetched_pages) { + lru_add_drain(); + sp_stat.prefetched_pages = 0; + } + return ret; +} + +static int kprefetchd(void *__unused) +{ + struct sched_param param = { .sched_priority = 0 }; + + sched_setscheduler(current, SCHED_BATCH, &param); + set_user_nice(current, 19); + /* Set our own ioprio to lowest best-effort if supported by i/o scheduler */ + sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NR - 1)); + + /* kprefetchd has nothing to do until it is woken up the first time */ + wakeup_kprefetchd = 1; + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + while (!kthread_should_stop()) { + try_to_freeze(); + + /* + * TRICKLE_FAILED implies no entries left - we do not schedule + * a wakeup, and further delay the next one. + */ + if (trickle_swap() == TRICKLE_FAILED) { + wakeup_kprefetchd = 1; + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } else + wakeup_kprefetchd = 0; + clear_last_prefetch_free(); + if (!prefetch_enabled()) + schedule_timeout_interruptible(DISABLED_PREFETCH_DELAY); + else + schedule_timeout_interruptible(PREFETCH_DELAY); + } + return 0; +} + +/* + * Create kmem cache for swapped entries + */ +void __init prepare_swap_prefetch(void) +{ + struct zone *zone; + + swapped.cache = kmem_cache_create("swapped_entry", + sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); + + /* + * Set max number of entries to 2/3 the size of physical ram as we + * only ever prefetch to consume 2/3 of the ram. 
+ */ + swapped.maxcount = nr_free_pagecache_pages() / 3 * 2; + + for_each_zone(zone) { + unsigned long present; + struct node_stats *ns; + int idx; + + present = zone->present_pages; + if (!present) + continue; + + ns = &sp_stat.node[zone->zone_pgdat->node_id]; + ns->prefetch_watermark += present / 3 * 2; + idx = zone_idx(zone); + ns->pointfree[idx] = &ns->highfree[idx]; + } +} + +static int __init kprefetchd_init(void) +{ + kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); + + return 0; +} + +static void __exit kprefetchd_exit(void) +{ + kthread_stop(kprefetchd_task); +} + +module_init(kprefetchd_init); +module_exit(kprefetchd_exit); Index: linux-2.6.21-sp/mm/swap_state.c =================================================================== --- linux-2.6.21-sp.orig/mm/swap_state.c 2006-11-30 08:57:37.000000000 +1100 +++ linux-2.6.21-sp/mm/swap_state.c 2007-05-14 10:59:14.000000000 +1000 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -95,7 +96,7 @@ static int __add_to_swap_cache(struct pa return error; } -static int add_to_swap_cache(struct page *page, swp_entry_t entry) +int add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; @@ -148,6 +149,9 @@ int add_to_swap(struct page * page, gfp_ swp_entry_t entry; int err; + /* Swap prefetching is delayed if we're swapping pages */ + delay_swap_prefetch(); + BUG_ON(!PageLocked(page)); for (;;) { @@ -320,6 +324,9 @@ struct page *read_swap_cache_async(swp_e struct page *found_page, *new_page = NULL; int err; + /* Swap prefetching is delayed if we're already reading from swap */ + delay_swap_prefetch(); + do { /* * First check the swap cache. 
Since this is normally Index: linux-2.6.21-sp/mm/vmscan.c =================================================================== --- linux-2.6.21-sp.orig/mm/vmscan.c 2007-05-01 14:52:06.000000000 +1000 +++ linux-2.6.21-sp/mm/vmscan.c 2007-05-14 10:59:14.000000000 +1000 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1032,6 +1033,8 @@ unsigned long try_to_free_pages(struct z .swappiness = vm_swappiness, }; + delay_swap_prefetch(); + count_vm_event(ALLOCSTALL); for (i = 0; zones[i] != NULL; i++) { @@ -1381,6 +1384,8 @@ static unsigned long shrink_all_zones(un struct zone *zone; unsigned long nr_to_scan, ret = 0; + delay_swap_prefetch(); + for_each_zone(zone) { if (!populated_zone(zone)) Index: linux-2.6.21-sp/include/linux/mm_inline.h =================================================================== --- linux-2.6.21-sp.orig/include/linux/mm_inline.h 2007-05-01 14:52:06.000000000 +1000 +++ linux-2.6.21-sp/include/linux/mm_inline.h 2007-05-14 10:58:04.000000000 +1000 @@ -13,6 +13,13 @@ add_page_to_inactive_list(struct zone *z } static inline void +add_page_to_inactive_list_tail(struct zone *zone, struct page *page) +{ + list_add_tail(&page->lru, &zone->inactive_list); + __inc_zone_state(zone, NR_INACTIVE); +} + +static inline void del_page_from_active_list(struct zone *zone, struct page *page) { list_del(&page->lru); Index: linux-2.6.21-sp/include/linux/swap-prefetch.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.21-sp/include/linux/swap-prefetch.h 2007-05-14 10:58:04.000000000 +1000 @@ -0,0 +1,55 @@ +#ifndef SWAP_PREFETCH_H_INCLUDED +#define SWAP_PREFETCH_H_INCLUDED + +#ifdef CONFIG_SWAP_PREFETCH +/* mm/swap_prefetch.c */ +extern int swap_prefetch; +struct swapped_entry { + swp_entry_t swp_entry; /* The actual swap entry */ + struct list_head swapped_list; /* Linked list of entries */ +#if MAX_NUMNODES > 1 + int node; /* Node id */ +#endif +} 
__attribute__((packed)); + +static inline void store_swap_entry_node(struct swapped_entry *entry, + struct page *page) +{ +#if MAX_NUMNODES > 1 + entry->node = page_to_nid(page); +#endif +} + +static inline int get_swap_entry_node(struct swapped_entry *entry) +{ +#if MAX_NUMNODES > 1 + return entry->node; +#else + return 0; +#endif +} + +extern void add_to_swapped_list(struct page *page); +extern void remove_from_swapped_list(const unsigned long index); +extern void delay_swap_prefetch(void); +extern void prepare_swap_prefetch(void); + +#else /* CONFIG_SWAP_PREFETCH */ +static inline void add_to_swapped_list(struct page *__unused) +{ +} + +static inline void prepare_swap_prefetch(void) +{ +} + +static inline void remove_from_swapped_list(const unsigned long __unused) +{ +} + +static inline void delay_swap_prefetch(void) +{ +} +#endif /* CONFIG_SWAP_PREFETCH */ + +#endif /* SWAP_PREFETCH_H_INCLUDED */ Index: linux-2.6.21-sp/mm/page_io.c =================================================================== --- linux-2.6.21-sp.orig/mm/page_io.c 2007-02-14 09:09:44.000000000 +1100 +++ linux-2.6.21-sp/mm/page_io.c 2007-05-14 10:59:14.000000000 +1000 @@ -17,6 +17,7 @@ #include #include #include +#include #include static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, @@ -118,6 +119,7 @@ int swap_writepage(struct page *page, st ret = -ENOMEM; goto out; } + add_to_swapped_list(page); if (wbc->sync_mode == WB_SYNC_ALL) rw |= (1 << BIO_RW_SYNC); count_vm_event(PSWPOUT);