Turn the "swappiness" knob into one with well-defined semantics. Rename it "mapped" to correspond directly with the percentage of mapped RAM, or "applications" as users think of it. Currently the swappiness algorithm can easily lead to swapping on simple file copies, because the distress algorithm too easily overrides the swappiness value. Add a "hardmaplimit" tunable, on by default, which allows the VM to override the "mapped" tunable only when distress is at its greatest, to prevent false out-of-memory situations. Signed-off-by: Con Kolivas include/linux/swap.h | 3 ++- include/linux/sysctl.h | 3 ++- kernel/sysctl.c | 16 ++++++++++++---- mm/vmscan.c | 29 +++++++++++++++++------------ 4 files changed, 33 insertions(+), 18 deletions(-) Index: linux-2.6.17-rc5-ck1/include/linux/swap.h =================================================================== --- linux-2.6.17-rc5-ck1.orig/include/linux/swap.h 2006-05-25 13:03:03.000000000 +1000 +++ linux-2.6.17-rc5-ck1/include/linux/swap.h 2006-05-25 13:03:04.000000000 +1000 @@ -176,7 +176,8 @@ extern void swap_setup(void); /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zone **, gfp_t); extern unsigned long shrink_all_memory(unsigned long nr_pages); -extern int vm_swappiness; +extern int vm_mapped; +extern int vm_hardmaplimit; extern int remove_mapping(struct address_space *mapping, struct page *page); /* possible outcome of pageout() */ Index: linux-2.6.17-rc5-ck1/include/linux/sysctl.h =================================================================== --- linux-2.6.17-rc5-ck1.orig/include/linux/sysctl.h 2006-05-25 13:03:03.000000000 +1000 +++ linux-2.6.17-rc5-ck1/include/linux/sysctl.h 2006-05-25 13:03:04.000000000 +1000 @@ -175,7 +175,7 @@ enum VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ - VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ 
+ VM_MAPPED=19, /* percent mapped min while evicting cache */ VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ @@ -190,6 +190,7 @@ enum VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ VM_SWAP_PREFETCH=33, /* swap prefetch */ + VM_HARDMAPLIMIT=34, /* Make mapped a hard limit */ }; Index: linux-2.6.17-rc5-ck1/kernel/sysctl.c =================================================================== --- linux-2.6.17-rc5-ck1.orig/kernel/sysctl.c 2006-05-25 13:03:03.000000000 +1000 +++ linux-2.6.17-rc5-ck1/kernel/sysctl.c 2006-05-25 13:03:04.000000000 +1000 @@ -791,16 +791,24 @@ static ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = VM_SWAPPINESS, - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), + .ctl_name = VM_MAPPED, + .procname = "mapped", + .data = &vm_mapped, + .maxlen = sizeof(vm_mapped), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = VM_HARDMAPLIMIT, + .procname = "hardmaplimit", + .data = &vm_hardmaplimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_HUGETLB_PAGE { .ctl_name = VM_HUGETLB_PAGES, Index: linux-2.6.17-rc5-ck1/mm/vmscan.c =================================================================== --- linux-2.6.17-rc5-ck1.orig/mm/vmscan.c 2006-05-25 13:03:04.000000000 +1000 +++ linux-2.6.17-rc5-ck1/mm/vmscan.c 2006-05-25 13:03:04.000000000 +1000 @@ -63,7 +63,7 @@ struct scan_control { * whole list at once. */ int swap_cluster_max; - int swappiness; + int mapped; }; /* @@ -108,10 +108,11 @@ struct shrinker { #endif /* - * From 0 .. 100. Higher means more swappy. + * From 0 .. 100. 
Lower means more swappy. */ -int vm_swappiness = 60; -static long total_memory; +int vm_mapped __read_mostly = 66; +int vm_hardmaplimit __read_mostly = 1; +static long total_memory __read_mostly; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -742,10 +743,14 @@ static void shrink_active_list(unsigned * The distress ratio is important - we don't want to start * going oom. * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. + * This distress value is ignored if we apply a hardmaplimit except + * in extreme distress. + * + * A 0% value of vm_mapped overrides this algorithm altogether. */ - swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; + swap_tendency = mapped_ratio * 100 / (sc->mapped + 1); + if (!vm_hardmaplimit || distress == 100) + swap_tendency += distress; /* * Now use this metric to decide whether to start moving mapped @@ -961,7 +966,7 @@ unsigned long try_to_free_pages(struct z .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, .may_swap = 1, - .swappiness = vm_swappiness, + .mapped = vm_mapped, }; delay_swap_prefetch(); @@ -1057,7 +1062,7 @@ static unsigned long balance_pgdat(pg_da .gfp_mask = GFP_KERNEL, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, - .swappiness = vm_swappiness, + .mapped = vm_mapped, }; loop_again: @@ -1346,7 +1351,7 @@ unsigned long shrink_all_memory(unsigned .may_swap = 0, .swap_cluster_max = nr_pages, .may_writepage = 1, - .swappiness = vm_swappiness, + .mapped = vm_mapped, }; current->reclaim_state = &reclaim_state; @@ -1391,7 +1396,7 @@ unsigned long shrink_all_memory(unsigned /* Force reclaiming mapped pages in the passes #3 and #4 */ if (pass > 2) { sc.may_swap = 1; - sc.swappiness = 100; + sc.mapped = 0; } for (prio = DEF_PRIORITY; prio >= 0; prio--) { @@ -1528,7 +1533,7 @@ static int __zone_reclaim(struct zone *z .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, - .swappiness = vm_swappiness, + 
.mapped = vm_mapped, }; disable_swap_token();