Index: linux-2.6.13-sp/include/linux/fs.h =================================================================== --- linux-2.6.13-sp.orig/include/linux/fs.h 2005-09-05 22:02:01.000000000 +1000 +++ linux-2.6.13-sp/include/linux/fs.h 2005-09-05 22:02:44.000000000 +1000 @@ -340,6 +340,8 @@ struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ rwlock_t tree_lock; /* and rwlock protecting it */ + struct radix_tree_root swap_tree; /* radix tree of swapped pages */ + struct list_head swapped_pages; /* list of swapped pages */ unsigned int i_mmap_writable;/* count VM_SHARED mappings */ struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ Index: linux-2.6.13-sp/include/linux/swap.h =================================================================== --- linux-2.6.13-sp.orig/include/linux/swap.h 2005-09-05 22:02:01.000000000 +1000 +++ linux-2.6.13-sp/include/linux/swap.h 2005-09-07 20:47:57.000000000 +1000 @@ -185,6 +185,32 @@ extern int shmem_unuse(swp_entry_t entry extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); +#ifdef CONFIG_SWAP_PREFETCH +/* mm/swap_prefetch.c */ +extern void prepare_prefetch(void); +extern void add_to_swapped_list(unsigned long index); +extern void remove_from_swapped_list(unsigned long index); +extern void delay_prefetch(void); + +#else /* CONFIG_SWAP_PREFETCH */ +static inline void add_to_swapped_list(unsigned long index) +{ +} + +static inline void prepare_prefetch(void) +{ +} + +static inline void remove_from_swapped_list(unsigned long index) +{ +} + +static inline void delay_prefetch(void) +{ +} + +#endif /* CONFIG_SWAP_PREFETCH */ + #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ extern int swap_readpage(struct file *, struct page *); @@ -206,6 +232,7 @@ extern void free_pages_and_swap_cache(st extern struct page * lookup_swap_cache(swp_entry_t); extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, unsigned long addr); +extern int add_to_swap_cache(struct page *page, swp_entry_t entry); /* linux/mm/swapfile.c */ extern long total_swap_pages; extern unsigned int nr_swapfiles; Index: linux-2.6.13-sp/init/Kconfig =================================================================== --- linux-2.6.13-sp.orig/init/Kconfig 2005-09-05 22:02:01.000000000 +1000 +++ linux-2.6.13-sp/init/Kconfig 2005-09-05 22:02:44.000000000 +1000 @@ -87,6 +87,17 @@ config SWAP used to provide more virtual memory than the actual RAM present in your computer. If unsure say Y. +config SWAP_PREFETCH + bool "Support for prefetching swapped memory (EXPERIMENTAL)" + depends on SWAP && EXPERIMENTAL + default n + ---help--- + This option will allow the kernel to prefetch swapped memory pages + when idle. The pages will be kept on both swap and in swap_cache + thus avoiding the need for further I/O if either ram or swap space + is required. This is desirable on workstations. + Desktop users will most likely want to say Y. + config SYSVIPC bool "System V IPC" depends on MMU Index: linux-2.6.13-sp/mm/Makefile =================================================================== --- linux-2.6.13-sp.orig/mm/Makefile 2005-09-05 22:02:01.000000000 +1000 +++ linux-2.6.13-sp/mm/Makefile 2005-09-05 22:02:44.000000000 +1000 @@ -13,6 +13,7 @@ obj-y := bootmem.o filemap.o mempool.o prio_tree.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o Index: linux-2.6.13-sp/mm/page_alloc.c =================================================================== --- linux-2.6.13-sp.orig/mm/page_alloc.c 2005-08-29 13:31:26.000000000 +1000 +++ linux-2.6.13-sp/mm/page_alloc.c 2005-09-05 22:02:44.000000000 +1000 @@ -745,7 +745,7 @@ int zone_watermark_ok(struct zone *z, in min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) - return 0; + goto out_failed; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ free_pages -= z->free_area[o].nr_free << o; @@ -754,9 +754,15 @@ int zone_watermark_ok(struct zone *z, in min >>= 1; if (free_pages <= min) - return 0; + goto out_failed; } + return 1; +out_failed: + /* Swap prefetching is delayed if any watermark is low */ + delay_prefetch(); + + return 0; } static inline int Index: linux-2.6.13-sp/mm/swap.c =================================================================== --- linux-2.6.13-sp.orig/mm/swap.c 2005-09-05 22:02:01.000000000 +1000 +++ linux-2.6.13-sp/mm/swap.c 2005-09-05 22:02:44.000000000 +1000 @@ -481,5 +481,8 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ + + prepare_prefetch(); + hotcpu_notifier(cpu_swap_callback, 0); } Index: linux-2.6.13-sp/mm/swap_prefetch.c =================================================================== --- linux-2.6.13-sp.orig/mm/swap_prefetch.c 2005-01-12 16:19:45.000000000 +1100 +++ linux-2.6.13-sp/mm/swap_prefetch.c 2005-09-05 22:18:45.000000000 +1000 @@ -0,0 +1,288 @@ +/* + * linux/mm/swap_prefetch.c + * + * Written by Con Kolivas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +/* Time to delay prefetching if vm is busy or prefetching unsuccessful */ +#define PREFETCH_DELAY (HZ * 5) +/* Time between attempting prefetching when vm is idle */ +#define PREFETCH_INTERVAL (HZ) + +struct swapped_root_t { + spinlock_t lock; + unsigned int count; + unsigned int maxcount; + kmem_cache_t *cache; + struct list_head list; + int busy; + spinlock_t busylock; +}; + +struct swapped_entry_t { + swp_entry_t swp_entry; + struct list_head swapped_list; +}; + +static struct swapped_root_t swapped_root = { + .count = 0, + .list = LIST_HEAD_INIT(swapped_root.list), +}; + +static struct timer_list prefetch_timer; + +static DECLARE_WAIT_QUEUE_HEAD(kprefetchd_wait); + +/* + * Create kmem cache for swapped entries + */ +void prepare_prefetch(void) +{ + swapped_root.cache = kmem_cache_create("swapped_entry", + sizeof(struct swapped_entry_t), 0, 0, NULL, NULL); + if (unlikely(!swapped_root.cache)) + panic("prepare_prefetch(): cannot create swapped_entry SLAB cache"); + + /* + * Set max count of swapped entries + */ + swapped_root.maxcount = nr_free_pagecache_pages(); + spin_lock_init(&swapped_root.lock); + spin_lock_init(&swapped_root.busylock); +} + +static inline void delay_prefetch_timer(void) +{ + mod_timer(&prefetch_timer, jiffies + PREFETCH_DELAY); +} + +static inline void reset_prefetch_timer(void) +{ + mod_timer(&prefetch_timer, jiffies + PREFETCH_INTERVAL); +} + +void add_to_swapped_list(unsigned long index) +{ + struct swapped_entry_t *entry; + struct address_space *mapping = &swapper_space; + int error; + + spin_lock(&swapped_root.lock); + + if (swapped_root.count >= swapped_root.maxcount) { + entry = list_entry(swapped_root.list.next, + struct swapped_entry_t, swapped_list); + radix_tree_delete(&mapping->swap_tree, entry->swp_entry.val); + list_del_init(&entry->swapped_list); + swapped_root.count--; + } else { + entry = kmem_cache_alloc(swapped_root.cache, GFP_ATOMIC); + if (!entry) + goto out_locked; + } + + entry->swp_entry.val = index; + + error = radix_tree_preload(GFP_ATOMIC); + if (likely(!error)) { + error = radix_tree_insert(&mapping->swap_tree, index, entry); + if (likely(!error)) { + /* + * If this is the first entry the timer needs to be + * restarted + */ + if (list_empty(&swapped_root.list)) + delay_prefetch_timer(); + list_add(&entry->swapped_list, &swapped_root.list); + swapped_root.count++; + } + radix_tree_preload_end(); + } else + kmem_cache_free(swapped_root.cache, entry); + +out_locked: + spin_unlock(&swapped_root.lock); +} + +void remove_from_swapped_list(unsigned long index) +{ + struct address_space *mapping = &swapper_space; + struct swapped_entry_t *entry; + + spin_lock(&swapped_root.lock); + entry = radix_tree_delete(&mapping->swap_tree, index); + if (entry) { + list_del_init(&entry->swapped_list); + swapped_root.count--; + kmem_cache_free(swapped_root.cache, entry); + } + spin_unlock(&swapped_root.lock); +} + +/* + * This tries to read a swp_entry_t into swap cache for swap prefetching. + */ +static int trickle_swap_cache_async(swp_entry_t entry) +{ + struct page *found_page, *new_page = NULL; + struct address_space *mapping = &swapper_space; + + /* May already exist, check it as cheaply as possible */ + read_lock_irq(&mapping->tree_lock); + found_page = radix_tree_lookup(&mapping->page_tree, entry.val); + read_unlock_irq(&mapping->tree_lock); + if (found_page) { + remove_from_swapped_list(entry.val); + goto out; + } + + /* Get a new page to read from swap */ + new_page = alloc_page_vma(GFP_HIGHUSER, NULL, 0); + if (!new_page) + goto out; /* Out of memory */ + + if (add_to_swap_cache(new_page, entry)) { + /* Failed to add to swap cache */ + page_cache_release(new_page); + goto out; + } + + lru_cache_add_active(new_page); + swap_readpage(NULL, new_page); + return 1; +out: + return 0; +} + +static int test_clear_busy(void) +{ + int ret; + + spin_lock(&swapped_root.busylock); + ret = swapped_root.busy; + swapped_root.busy = 0; + spin_unlock(&swapped_root.busylock); + return ret; +} + +/* + * trickle_swap is the main function that initiates the swap prefetching. It + * first checks to see if the busy flag is set, and does not prefetch if it + * is, as the flag implied we are low on memory or swapping in currently. + * Otherwise it runs till SWAP_CLUSTER_MAX is prefetched. This function + * returns 1 if it succeeds in a cycle of prefetching, 0 if it is interrupted + * or -1 if there is nothing left to prefetch. + */ +static int trickle_swap(void) +{ + int ret = 0, pages = 0; + struct swapped_entry_t *entry; + + while (pages < SWAP_CLUSTER_MAX) { + if (test_clear_busy()) + goto out; + spin_lock(&swapped_root.lock); + if (list_empty(&swapped_root.list)) { + ret = -1; + goto out_unlock; + } + entry = list_entry(swapped_root.list.next, + struct swapped_entry_t, swapped_list); + spin_unlock(&swapped_root.lock); + + if (trickle_swap_cache_async(entry->swp_entry)) + pages++; + } + ret = 1; + goto out; + +out_unlock: + spin_unlock(&swapped_root.lock); +out: + return ret; +} + +static int kprefetchd(void *data) +{ + DEFINE_WAIT(wait); + + daemonize("kprefetchd"); + set_user_nice(current, 19); + + for ( ; ; ) { + int prefetched; + + try_to_freeze(); + prepare_to_wait(&kprefetchd_wait, &wait, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&kprefetchd_wait, &wait); + + if (!(prefetched = trickle_swap())) + delay_prefetch_timer(); + else if (prefetched == 1) + reset_prefetch_timer(); + } + return 0; +} + +static void prefetch_wakeup(unsigned long data) +{ + pg_data_t *pgdat; + int i; + + /* + * Make sure we really have spare ram before doing any prefetching. + * Check for significantly more than pages_high to have some + * hysteresis to prevent prefetch keeping the vm active when nothing + * else is happening. + */ + for_each_pgdat(pgdat) { + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *z = pgdat->node_zones + i; + if (!zone_watermark_ok(z, 0, z->pages_high * 4, + 0, 0, 0)) { + delay_prefetch_timer(); + return; + } + } + } + + if (waitqueue_active(&kprefetchd_wait)) + wake_up_interruptible(&kprefetchd_wait); +} + +/* + * We check to see no part of the vm is busy. If it is this will interrupt + * trickle_swap and wait another PREFETCH_DELAY + */ +void delay_prefetch(void) +{ + spin_lock(&swapped_root.busylock); + swapped_root.busy = 1; + spin_unlock(&swapped_root.busylock); +} + +static int __init kprefetchd_init(void) +{ + /* + * Prepare the prefetch timer. It is inactive until entries are placed + * on the swapped_list + */ + init_timer(&prefetch_timer); + prefetch_timer.data = 0; + prefetch_timer.function = prefetch_wakeup; + + kernel_thread(kprefetchd, NULL, CLONE_KERNEL); + + return 0; +} + +module_init(kprefetchd_init) Index: linux-2.6.13-sp/mm/swap_state.c =================================================================== --- linux-2.6.13-sp.orig/mm/swap_state.c 2005-09-05 22:02:01.000000000 +1000 +++ linux-2.6.13-sp/mm/swap_state.c 2005-09-05 22:02:44.000000000 +1000 @@ -36,6 +36,8 @@ static struct backing_dev_info swap_back struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .tree_lock = RW_LOCK_UNLOCKED, + .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), + .swapped_pages = LIST_HEAD_INIT(swapper_space.swapped_pages), .a_ops = &swap_aops, .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, @@ -80,6 +82,7 @@ static int __add_to_swap_cache(struct pa error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); if (!error) { + remove_from_swapped_list(entry.val); page_cache_get(page); SetPageLocked(page); SetPageSwapCache(page); @@ -93,11 +96,12 @@ static int __add_to_swap_cache(struct pa return error; } -static int add_to_swap_cache(struct page *page, swp_entry_t entry) +int add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; if (!swap_duplicate(entry)) { + remove_from_swapped_list(entry.val); INC_CACHE_INFO(noent_race); return -ENOENT; } @@ -148,6 +152,9 @@ int add_to_swap(struct page * page) if (!PageLocked(page)) BUG(); + /* Swap prefetching is delayed if we're swapping pages */ + delay_prefetch(); + for (;;) { entry = get_swap_page(); if (!entry.val) @@ -325,6 +332,9 @@ struct page *read_swap_cache_async(swp_e struct page *found_page, *new_page = NULL; int err; + /* Swap prefetching is delayed if we're already reading from swap */ + delay_prefetch(); + do { /* * First check the swap cache. Since this is normally Index: linux-2.6.13-sp/mm/vmscan.c =================================================================== --- linux-2.6.13-sp.orig/mm/vmscan.c 2005-09-05 22:02:01.000000000 +1000 +++ linux-2.6.13-sp/mm/vmscan.c 2005-09-05 22:02:44.000000000 +1000 @@ -519,6 +519,7 @@ static int shrink_list(struct list_head #ifdef CONFIG_SWAP if (PageSwapCache(page)) { swp_entry_t swap = { .val = page->private }; + add_to_swapped_list(swap.val); __delete_from_swap_cache(page); write_unlock_irq(&mapping->tree_lock); swap_free(swap);