Add a background scanning timer to restore the watermarks to the pages_lots level and only call on it if kswapd has not been called upon for the last 5 seconds. This allows us to balance all zones to the more generous pages_lots watermark at a time unrelated to page allocation thus leading to lighter levels of vm load when called upon under page allocation. Signed-off-by: Con Kolivas The -ck patches modify mm/vmscan.c and add a timer to wake up kswapd every 5 seconds. This timer is initialized after the creation of the kswapd thread. The kswapd() thread function calls mod_timer at the front of its infinite service loop (to reset the timer to 5 seconds in the future). mod_timer() includes a BUG_ON() to assert that the timer's callback function is set. Since the wakeup timer is initialized after the kswapd thread is created, if kswapd gets scheduled before kswapd_run() has prepared the timer, the BUG_ON() check will throw a stack trace and immediately terminate the kswapd thread. This patch modifies the kswapd_run() function in mm/vmscan.c to initialize the watermark timer before starting the kswapd thread. Signed-off-by: Chase Venters -ck --- --- include/linux/mmzone.h | 6 +++++- mm/vmscan.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) Index: linux-3.1-ck2/include/linux/mmzone.h =================================================================== --- linux-3.1-ck2.orig/include/linux/mmzone.h 2011-10-25 21:07:13.000000000 +1100 +++ linux-3.1-ck2/include/linux/mmzone.h 2011-11-11 13:28:21.939507636 +1100 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -168,12 +169,14 @@ enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, + WMARK_LOTS, NR_WMARK }; #define min_wmark_pages(z) (z->watermark[WMARK_MIN]) #define low_wmark_pages(z) (z->watermark[WMARK_LOW]) #define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) +#define lots_wmark_pages(z) (z->watermark[WMARK_LOTS]) struct per_cpu_pages { int count; /* number of pages in the list */ @@ -345,7 +348,7 @@ struct zone { ZONE_PADDING(_pad1_) /* Fields commonly accessed by the page reclaim scanner */ - spinlock_t lru_lock; + spinlock_t lru_lock; struct zone_lru { struct list_head list; } lru[NR_LRU_LISTS]; @@ -641,6 +644,7 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + struct timer_list watermark_timer; enum zone_type classzone_idx; } pg_data_t; Index: linux-3.1-ck2/mm/vmscan.c =================================================================== --- linux-3.1-ck2.orig/mm/vmscan.c 2011-11-11 13:28:21.763507649 +1100 +++ linux-3.1-ck2/mm/vmscan.c 2011-11-11 13:28:21.940507635 +1100 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -2774,6 +2775,8 @@ static void kswapd_try_to_sleep(pg_data_ finish_wait(&pgdat->kswapd_wait, &wait); } +#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ + /* * The background pageout daemon, started as a kernel thread * from the init process. @@ -2825,6 +2828,9 @@ static int kswapd(void *p) for ( ; ; ) { int ret; + /* kswapd has been busy so delay watermark_timer */ + mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); + /* * If the last balance_pgdat was unsuccessful it's unlikely a * new request of a similar or harder type will succeed soon @@ -3000,20 +3006,57 @@ static int __devinit cpu_callback(struct } /* + * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots + */ +static void watermark_wakeup(unsigned long data) +{ + pg_data_t *pgdat = (pg_data_t *)data; + struct timer_list *wt = &pgdat->watermark_timer; + int i; + + if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) + goto out; + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *z = pgdat->node_zones + i; + + if (!populated_zone(z) || is_highmem(z)) { + /* We are better off leaving highmem full */ + continue; + } + if (!zone_watermark_ok(z, 0, lots_wmark_pages(z), 0, 0)) { + wake_up_interruptible(&pgdat->kswapd_wait); + goto out; + } + } +out: + mod_timer(wt, jiffies + WT_EXPIRY); + return; +} + +/* * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ int kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); + struct timer_list *wt; int ret = 0; if (pgdat->kswapd) return 0; + wt = &pgdat->watermark_timer; + init_timer(wt); + wt->data = (unsigned long)pgdat; + wt->function = watermark_wakeup; + wt->expires = jiffies + WT_EXPIRY; + add_timer(wt); + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ + del_timer(wt); BUG_ON(system_state == SYSTEM_BOOTING); printk("Failed to start kswapd on node %d\n",nid); ret = -1;