Swap prefetch update for linux-2.6.13-ck7.

What this patch does (as visible in the hunks below):

  * Adds an integer tunable "swap_prefetch" (sysctl id VM_SWAP_PREFETCH=30,
    default 2). trickle_swap() prefetches until
    SWAP_CLUSTER_MAX * swap_prefetch pages have been read in one cycle.
  * Moves the lookup radix tree of swapped entries out of swapper_space and
    into the private "swapped" root in mm/swap_prefetch.c; the root's
    maxcount becomes nr_free_pagecache_pages().
  * prefetch_suitable() now sums free pages over all zones (temp_free) and
    compares against the previous cycle (last_free); if free memory dropped
    by more than SWAP_CLUSTER_MAX * (swap_prefetch + 1) pages it refuses to
    prefetch.
  * trickle_swap_cache_async() returns 1/0/-1 through a single exit path,
    checks the return value of swap_readpage(), and calls
    page_cache_release() on every path after the page was obtained.
  * kprefetchd calls sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE).
  * CONFIG_SWAP_PREFETCH no longer depends on EXPERIMENTAL.

Review fix folded in:

  * The sysctl entry supplied .extra1 = &zero but used proc_dointvec, which
    does not look at extra1/extra2, so negative values could be written via
    /proc/sys/vm/swap_prefetch. The handler is now proc_dointvec_minmax.

NOTE(review): the header names on the "#include" lines of the first
mm/swap_prefetch.c hunk were already missing in the copy this was prepared
from; restore them from the original patch before applying.
NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy and checked against the hunk line counts; verify
context whitespace against the target tree.
NOTE(review): this version drops the lru_cache_add_active() call after
add_to_swap_cache(); confirm prefetched pages still reach an LRU list.

Index: linux-2.6.13-ck7/include/linux/swap.h
===================================================================
--- linux-2.6.13-ck7.orig/include/linux/swap.h	2005-10-05 21:54:27.000000000 +1000
+++ linux-2.6.13-ck7/include/linux/swap.h	2005-10-05 21:54:28.000000000 +1000
@@ -197,6 +197,7 @@ extern void delay_prefetch(void);
 extern struct page * buffered_rmqueue(struct zone *zone, int order,
 			unsigned int __nocast gfp_flags);
 extern void zone_statistics(struct zonelist *zonelist, struct zone *z);
+extern int swap_prefetch;
 
 #else	/* CONFIG_SWAP_PREFETCH */
 static inline void add_to_swapped_list(unsigned long index)
Index: linux-2.6.13-ck7/include/linux/sysctl.h
===================================================================
--- linux-2.6.13-ck7.orig/include/linux/sysctl.h	2005-10-05 21:54:18.000000000 +1000
+++ linux-2.6.13-ck7/include/linux/sysctl.h	2005-10-05 21:54:28.000000000 +1000
@@ -184,6 +184,7 @@ enum
 	VM_LEGACY_VA_LAYOUT=27,	/* legacy/compatibility virtual address space layout */
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
 	VM_HARDMAPLIMIT=29,	/* Make mapped a hard limit */
+	VM_SWAP_PREFETCH=30,	/* int: amount to swap prefetch */
 };
 
 
Index: linux-2.6.13-ck7/init/Kconfig
===================================================================
--- linux-2.6.13-ck7.orig/init/Kconfig	2005-10-05 21:54:27.000000000 +1000
+++ linux-2.6.13-ck7/init/Kconfig	2005-10-05 21:54:28.000000000 +1000
@@ -89,7 +89,7 @@ config SWAP
 
 config SWAP_PREFETCH
 	bool "Support for prefetching swapped memory"
-	depends on SWAP && EXPERIMENTAL
+	depends on SWAP
 	default n
 	---help---
 	  This option will allow the kernel to prefetch swapped memory pages
Index: linux-2.6.13-ck7/kernel/sysctl.c
===================================================================
--- linux-2.6.13-ck7.orig/kernel/sysctl.c	2005-10-05 21:54:18.000000000 +1000
+++ linux-2.6.13-ck7/kernel/sysctl.c	2005-10-05 21:54:28.000000000 +1000
@@ -884,6 +884,19 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec_jiffies,
 		.strategy	= &sysctl_jiffies,
 	},
+#ifdef CONFIG_SWAP_PREFETCH
+	{
+		.ctl_name	= VM_SWAP_PREFETCH,
+		.procname	= "swap_prefetch",
+		.data		= &swap_prefetch,
+		.maxlen		= sizeof(swap_prefetch),
+		.mode		= 0644,
+		/* _minmax variant: plain proc_dointvec ignores extra1 */
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+#endif
 #endif
 	{ .ctl_name = 0 }
 };
Index: linux-2.6.13-ck7/mm/swap_prefetch.c
===================================================================
--- linux-2.6.13-ck7.orig/mm/swap_prefetch.c	2005-10-05 21:54:27.000000000 +1000
+++ linux-2.6.13-ck7/mm/swap_prefetch.c	2005-10-05 21:54:28.000000000 +1000
@@ -13,18 +13,23 @@
 #include
 #include
 #include
+#include
+#include
 
 /* Time to delay prefetching if vm is busy or prefetching unsuccessful */
 #define PREFETCH_DELAY	(HZ * 5)
 /* Time between attempting prefetching when vm is idle */
 #define PREFETCH_INTERVAL (HZ)
 
+int swap_prefetch = 2;	/* sysctl - if/how much to prefetch at a time */
+
 struct swapped_root_t {
-	unsigned long busy;
-	spinlock_t lock;
-	struct list_head list;
-	unsigned int count;
-	unsigned int maxcount;
+	unsigned long		busy;		/* vm busy */
+	spinlock_t		lock;		/* protects all data */
+	struct list_head	list;		/* MRU list of swapped pages */
+	struct radix_tree_root	swap_tree;	/* Lookup tree of pages */
+	unsigned int		count;		/* Number of entries */
+	unsigned int		maxcount;	/* Maximum entries allowed */
 	kmem_cache_t *cache;
 };
 
@@ -33,37 +38,39 @@ struct swapped_entry_t {
 	struct list_head swapped_list;
 };
 
-static struct swapped_root_t swapped_root = {
-	.list = LIST_HEAD_INIT(swapped_root.list),
-	.count = 0,
+static struct swapped_root_t swapped = {
+	.busy		= 0,
+	.list		= LIST_HEAD_INIT(swapped.list),
+	.swap_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
+	.count		= 0,
 };
 
 static struct timer_list prefetch_timer;
 
 static DECLARE_WAIT_QUEUE_HEAD(kprefetchd_wait);
 
-static unsigned long mapped_limit;
+static unsigned long mapped_limit;	/* Max mapped we will prefetch to */
+static unsigned long last_free = 0;	/* Last total free pages */
+static unsigned long temp_free = 0;
 
 /*
  * Create kmem cache for swapped entries
  */
-void prepare_prefetch(void)
+void __init prepare_prefetch(void)
 {
 	long total_memory = nr_free_pagecache_pages();
-	long se_size = sizeof(struct swapped_entry_t);
 
-	swapped_root.cache = kmem_cache_create("swapped_entry", se_size,
-		0, 0, NULL, NULL);
-	if (unlikely(!swapped_root.cache))
+	swapped.cache = kmem_cache_create("swapped_entry",
+		sizeof(struct swapped_entry_t), 0, 0, NULL, NULL);
+	if (unlikely(!swapped.cache))
 		panic("prepare_prefetch(): cannot create swapped_entry SLAB cache");
 
-	/* Set max count of swapped entries to 5% ram */
-	swapped_root.maxcount = (total_memory / 20) * (PAGE_SIZE / se_size);
+	/* Set max number of entries to size of physical ram */
+	swapped.maxcount = total_memory;
 	/* Set maximum amount of mapped pages to prefetch to 2/3 ram */
 	mapped_limit = total_memory / 3 * 2;
 
-	spin_lock_init(&swapped_root.lock);
-	swapped_root.busy = 0;
+	spin_lock_init(&swapped.lock);
 }
 
 static inline void delay_prefetch_timer(void)
@@ -82,7 +89,7 @@ static inline void reset_prefetch_timer(
  */
 void delay_prefetch(void)
 {
-	__set_bit(0, &swapped_root.busy);
+	__set_bit(0, &swapped.busy);
 }
 
 /*
@@ -94,21 +101,19 @@ void delay_prefetch(void)
 void add_to_swapped_list(unsigned long index)
 {
 	struct swapped_entry_t *entry;
-	struct address_space *mapping = &swapper_space;
-	unsigned long flags;
 	int error;
 
-	if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags)))
+	if (unlikely(!spin_trylock(&swapped.lock)))
 		goto out;
 
-	if (swapped_root.count >= swapped_root.maxcount) {
-		entry = list_entry(swapped_root.list.next,
+	if (swapped.count >= swapped.maxcount) {
+		entry = list_entry(swapped.list.next,
 			struct swapped_entry_t, swapped_list);
-		radix_tree_delete(&mapping->swap_tree, entry->swp_entry.val);
+		radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val);
 		list_del(&entry->swapped_list);
-		swapped_root.count--;
+		swapped.count--;
 	} else {
-		entry = kmem_cache_alloc(swapped_root.cache, GFP_ATOMIC);
+		entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC);
 		if (unlikely(!entry))
 			/* bad, can't allocate more mem */
 			goto out_locked;
@@ -118,23 +123,23 @@ void add_to_swapped_list(unsigned long i
 
 	error = radix_tree_preload(GFP_ATOMIC);
 	if (likely(!error)) {
-		error = radix_tree_insert(&mapping->swap_tree, index, entry);
+		error = radix_tree_insert(&swapped.swap_tree, index, entry);
 		if (likely(!error)) {
 			/*
 			 * If this is the first entry the timer needs to be
 			 * (re)started
 			 */
-			if (list_empty(&swapped_root.list))
+			if (list_empty(&swapped.list))
 				delay_prefetch_timer();
-			list_add(&entry->swapped_list, &swapped_root.list);
-			swapped_root.count++;
+			list_add(&entry->swapped_list, &swapped.list);
+			swapped.count++;
 		}
 		radix_tree_preload_end();
 	} else
-		kmem_cache_free(swapped_root.cache, entry);
+		kmem_cache_free(swapped.cache, entry);
 
 out_locked:
-	spin_unlock_irqrestore(&swapped_root.lock, flags);
+	spin_unlock(&swapped.lock);
 out:
 	return;
 }
@@ -145,19 +150,18 @@ out:
  */
 void remove_from_swapped_list(unsigned long index)
 {
-	struct address_space *mapping = &swapper_space;
 	struct swapped_entry_t *entry;
 	unsigned long flags;
 
-	if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags)))
+	if (unlikely(!spin_trylock_irqsave(&swapped.lock, flags)))
 		return;
-	entry = radix_tree_delete(&mapping->swap_tree, index);
-	if (entry) {
+	entry = radix_tree_delete(&swapped.swap_tree, index);
+	if (likely(entry)) {
 		list_del_init(&entry->swapped_list);
-		swapped_root.count--;
-		kmem_cache_free(swapped_root.cache, entry);
+		swapped.count--;
+		kmem_cache_free(swapped.cache, entry);
 	}
-	spin_unlock_irqrestore(&swapped_root.lock, flags);
+	spin_unlock_irqrestore(&swapped.lock, flags);
 }
 
 /*
@@ -165,7 +169,7 @@ void remove_from_swapped_list(unsigned l
  * then directly allocate the ram. We don't want prefetch to use
  * __alloc_pages and go calling on reclaim.
  */
-static struct page * prefetch_get_page(void)
+static struct page *prefetch_get_page(void)
 {
 	struct zone *zone = NULL, *z;
 	struct page *page = NULL;
@@ -179,18 +183,10 @@ static struct page * prefetch_get_page(v
 
 		free = z->free_pages;
 
-		/* Check yet again we are above watermarks, by now likely */
-		if (unlikely(free < z->pages_high * 3))
-			goto out;
-
 		/* We don't prefetch into DMA */
 		if (zone_idx(z) == ZONE_DMA)
 			continue;
 
-		/* Reasonably stressed zone, bypass it */
-		if (z->prev_priority < DEF_PRIORITY / 2)
-			continue;
-
 		/* Select the zone with the most free ram */
 		if (free > most_free) {
 			most_free = free;
@@ -207,7 +203,7 @@ static struct page * prefetch_get_page(v
 
 		zonelist = NODE_DATA(numa_node_id())->node_zonelists +
 			(GFP_HIGHUSER & GFP_ZONEMASK);
-;
+
 		zone_statistics(zonelist, zone);
 	}
 out:
@@ -216,21 +212,21 @@ out:
 
 /*
  * This tries to read a swp_entry_t into swap cache for swap prefetching.
+ * Returns 1 on success, 0 on failure, -1 on failure and we should delay
+ * further prefetching.
  */
 static int trickle_swap_cache_async(swp_entry_t entry)
 {
 	struct page *page = NULL;
-	struct address_space *mapping = &swapper_space;
-	unsigned long flags;
+	int ret = 0;
 
-	/* Entry may already exist */
-	local_irq_save(flags);
-	if (unlikely(!read_trylock(&mapping->tree_lock))) {
-		local_irq_restore(flags);
-		goto out_delay;
+	if (unlikely(!read_trylock(&swapper_space.tree_lock))) {
+		ret = -1;
+		goto out;
 	}
-	page = radix_tree_lookup(&mapping->page_tree, entry.val);
-	read_unlock_irqrestore(&mapping->tree_lock, flags);
+	/* Entry may already exist */
+	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+	read_unlock(&swapper_space.tree_lock);
 	if (page) {
 		remove_from_swapped_list(entry.val);
 		goto out;
@@ -238,22 +234,26 @@ static int trickle_swap_cache_async(swp_
 
 	/* Get a new page to read from swap */
 	page = prefetch_get_page();
-	if (unlikely(!page))
-		goto out_delay;
+	if (unlikely(!page)) {
+		ret = -1;
+		goto out;
+	}
 
-	if (add_to_swap_cache(page, entry)) {
+	if (add_to_swap_cache(page, entry))
 		/* Failed to add to swap cache */
-		page_cache_release(page);
-		goto out;
+		goto out_release;
+
+	if (unlikely(swap_readpage(NULL, page))) {
+		ret = -1;
+		goto out_release;
 	}
 
-	lru_cache_add_active(page);
-	swap_readpage(NULL, page);
-	return 1;
-out_delay:
-	return -1;
+	ret = 1;
+
+out_release:
+	page_cache_release(page);
 out:
-	return 0;
+	return ret;
 }
 
 /*
@@ -264,97 +264,113 @@ static int prefetch_suitable(void)
 	struct page_state ps;
 	unsigned long pending_writes, limit;
 	struct zone *z;
+	int ret = 0;
 
 	/* Purposefully racy and might return false positive which is ok */
-	if (__test_and_clear_bit(0, &swapped_root.busy))
-		goto out_delay;
+	if (__test_and_clear_bit(0, &swapped.busy))
+		goto out;
 
+	temp_free = 0;
 	/*
 	 * Have some hysteresis between where page reclaiming and prefetching
 	 * will occur to prevent ping-ponging between them.
 	 */
 	for_each_zone(z) {
+		unsigned long free;
+
 		if (z->present_pages == 0)
 			continue;
-		if (z->pages_high * 3 > z->free_pages)
-			goto out_delay;
+		free = z->free_pages;
+		if (z->pages_high * 3 > free)
+			goto out;
+		temp_free += free;
 	}
 
+	/*
+	 * We check to see that pages are not being allocated elsewhere
+	 * at any significant rate implying any degree of memory pressure
+	 * (eg during file reads)
+	 */
+	if (last_free) {
+		if (temp_free + SWAP_CLUSTER_MAX * (swap_prefetch + 1) <
+			last_free) {
+				last_free = temp_free;
+				goto out;
+		}
+	} else
+		last_free = temp_free;
+
 	get_page_state(&ps);
 
 	/* We shouldn't prefetch when we are doing writeback */
 	if (ps.nr_writeback)
-		goto out_delay;
+		goto out;
 
 	/* Delay prefetching if we have significant amounts of dirty data */
 	pending_writes = ps.nr_dirty + ps.nr_unstable;
 	if (pending_writes > SWAP_CLUSTER_MAX)
-		goto out_delay;
+		goto out;
 
 	/* >2/3 of the ram is mapped, we need some free for pagecache */
 	limit = ps.nr_mapped + ps.nr_slab + pending_writes;
 	if (limit > mapped_limit)
-		goto out_delay;
+		goto out;
 
 	/*
 	 * Add swapcache to limit as well, but check this last since it needs
 	 * locking
 	 */
 	if (unlikely(!read_trylock(&swapper_space.tree_lock)))
-		goto out_delay;
+		goto out;
 	limit += total_swapcache_pages;
 	read_unlock(&swapper_space.tree_lock);
 	if (limit > mapped_limit)
-		goto out_delay;
+		goto out;
 
 	/* Survived all that? Hooray we can prefetch! */
-	return 1;
-
-out_delay:
-	return 0;
+	ret = 1;
+out:
+	return ret;
 }
 
 /*
  * trickle_swap is the main function that initiates the swap prefetching. It
  * first checks to see if the busy flag is set, and does not prefetch if it
  * is, as the flag implied we are low on memory or swapping in currently.
- * Otherwise it runs till SWAP_CLUSTER_MAX is prefetched. This function
- * returns 1 if it succeeds in a cycle of prefetching, 0 if it is interrupted
- * or -1 if there is nothing left to prefetch.
+ * Otherwise it runs till SWAP_CLUSTER_MAX * swap_prefetch is prefetched.
+ * This function returns 1 if it succeeds in a cycle of prefetching, 0 if it
+ * is interrupted or -1 if there is nothing left to prefetch.
  */
 static int trickle_swap(void)
 {
 	int ret = 0, pages = 0;
 	struct swapped_entry_t *entry;
 
-	while (pages < SWAP_CLUSTER_MAX) {
+	while (pages < SWAP_CLUSTER_MAX * swap_prefetch) {
 		int got_page;
 
 		if (!prefetch_suitable())
 			goto out;
 
 		/* Lock is held? We must be busy elsewhere */
-		if (unlikely(!spin_trylock(&swapped_root.lock)))
+		if (unlikely(!spin_trylock(&swapped.lock)))
 			goto out;
-		if (list_empty(&swapped_root.list)) {
+		if (list_empty(&swapped.list)) {
 			ret = -1;
 			goto out_unlock;
 		}
-		entry = list_entry(swapped_root.list.next,
+		entry = list_entry(swapped.list.next,
 			struct swapped_entry_t, swapped_list);
-		spin_unlock(&swapped_root.lock);
+		spin_unlock(&swapped.lock);
 		got_page = trickle_swap_cache_async(entry->swp_entry);
-		if (unlikely(got_page == -1)) {
-			ret = -1;
-			goto out_unlock;
-		}
+		if (unlikely(got_page == -1))
+			goto out;
 		pages += got_page;
 	}
-	ret = 1;
-	goto out;
+	return 1;
 
 out_unlock:
-	spin_unlock(&swapped_root.lock);
+	spin_unlock(&swapped.lock);
 out:
 	return ret;
 }
@@ -365,6 +381,8 @@ static int kprefetchd(void *data)
 
 	daemonize("kprefetchd");
 	set_user_nice(current, 19);
+	/* Set ioprio to lowest if supported by i/o scheduler */
+	sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE);
 
 	for ( ; ; ) {
 		int prefetched;
@@ -375,10 +393,15 @@ static int kprefetchd(void *data)
 		finish_wait(&kprefetchd_wait, &wait);
 
 		/* If trickle_swap() returns -1 the timer is not reset */
-		if (!(prefetched = trickle_swap()))
-			delay_prefetch_timer();
-		else if (prefetched == 1)
+		prefetched = trickle_swap();
+		if (prefetched == 1) {
+			last_free = temp_free;
 			reset_prefetch_timer();
+		} else {
+			last_free = 0;
+			if (!prefetched)
+				delay_prefetch_timer();
+		}
 	}
 	return 0;
 }
Index: linux-2.6.13-ck7/mm/swap_state.c
===================================================================
--- linux-2.6.13-ck7.orig/mm/swap_state.c	2005-10-05 21:54:26.000000000 +1000
+++ linux-2.6.13-ck7/mm/swap_state.c	2005-10-05 21:54:28.000000000 +1000
@@ -36,8 +36,6 @@ static struct backing_dev_info swap_back
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
 	.tree_lock	= RW_LOCK_UNLOCKED,
-	.swap_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
-	.swapped_pages	= LIST_HEAD_INIT(swapper_space.swapped_pages),
 	.a_ops		= &swap_aops,
 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
 	.backing_dev_info = &swap_backing_dev_info,