aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRohit Seth <rohit.seth@intel.com>2006-01-08 04:00:40 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-01-08 23:12:40 -0500
commit8ad4b1fb8205340dba16b63467bb23efc27264d6 (patch)
tree9f5c237ead93976e5454c5da5d3bba350a2419c5
parent9d0243bca345d5ce25d3f4b74b7facb3a6df1232 (diff)
[PATCH] Make high and batch sizes of per_cpu_pagelists configurable
There has recently been a lot of discussion about the right values for the batch and high water marks of per_cpu_pagelists. This patch makes these two variables configurable through the /proc interface. A new tunable, /proc/sys/vm/percpu_pagelist_fraction, is added. This entry controls the maximum fraction of pages in each zone that may be allocated to each per-cpu page list. The minimum value is 8, which means that no more than 1/8th of the pages in each zone can be allocated to any single per_cpu_pagelist. The batch value of each per-cpu pagelist is also updated as a result: it is set to pcp->high/4, with an upper limit of (PAGE_SHIFT * 8). Signed-off-by: Rohit Seth <rohit.seth@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--Documentation/sysctl/vm.txt17
-rw-r--r--include/linux/mmzone.h2
-rw-r--r--include/linux/sysctl.h1
-rw-r--r--kernel/sysctl.c12
-rw-r--r--mm/page_alloc.c49
5 files changed, 81 insertions, 0 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 89ba1a42a17d..6910c0136f8d 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -103,3 +103,20 @@ This is used to force the Linux VM to keep a minimum number
103of kilobytes free. The VM uses this number to compute a pages_min 103of kilobytes free. The VM uses this number to compute a pages_min
104value for each lowmem zone in the system. Each lowmem zone gets 104value for each lowmem zone in the system. Each lowmem zone gets
105a number of reserved free pages based proportionally on its size. 105a number of reserved free pages based proportionally on its size.
106
107==============================================================
108
109percpu_pagelist_fraction
110
111This is the maximum fraction of pages in each zone (the high mark, pcp->high)
112that may be allocated to each per-cpu page list. The minimum value is 8, which
113means that no more than 1/8th of the pages in each zone can be
114allocated to any single per_cpu_pagelist. This entry only changes the value
115of the hot per-cpu pagelists. A user can specify a number like 100 to allocate
1161/100th of each zone to each per-cpu page list.
117
118The batch value of each per-cpu pagelist is also updated as a result. It is
119set to pcp->high/4, with an upper limit of (PAGE_SHIFT * 8).
120
121The initial value is zero. The kernel does not use this value at boot time to set
122the high water marks for each per-cpu page list.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c34f4a2c62f8..2a89c132ba9c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -437,6 +437,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
437extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; 437extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
438int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, 438int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
439 void __user *, size_t *, loff_t *); 439 void __user *, size_t *, loff_t *);
440int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
441 void __user *, size_t *, loff_t *);
440 442
441#include <linux/topology.h> 443#include <linux/topology.h>
442/* Returns the number of the current Node. */ 444/* Returns the number of the current Node. */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4cd267fe87ec..7f472127b7b5 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -181,6 +181,7 @@ enum
181 VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ 181 VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
182 VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ 182 VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
183 VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ 183 VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */
184 VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
184}; 185};
185 186
186 187
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8dcf6fd5b0f9..03b0598f2369 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -69,6 +69,7 @@ extern int printk_ratelimit_jiffies;
69extern int printk_ratelimit_burst; 69extern int printk_ratelimit_burst;
70extern int pid_max_min, pid_max_max; 70extern int pid_max_min, pid_max_max;
71extern int sysctl_drop_caches; 71extern int sysctl_drop_caches;
72extern int percpu_pagelist_fraction;
72 73
73#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 74#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
74int unknown_nmi_panic; 75int unknown_nmi_panic;
@@ -79,6 +80,7 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
79/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 80/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
80static int maxolduid = 65535; 81static int maxolduid = 65535;
81static int minolduid; 82static int minolduid;
83static int min_percpu_pagelist_fract = 8;
82 84
83static int ngroups_max = NGROUPS_MAX; 85static int ngroups_max = NGROUPS_MAX;
84 86
@@ -794,6 +796,16 @@ static ctl_table vm_table[] = {
794 .strategy = &sysctl_intvec, 796 .strategy = &sysctl_intvec,
795 .extra1 = &zero, 797 .extra1 = &zero,
796 }, 798 },
799 {
800 .ctl_name = VM_PERCPU_PAGELIST_FRACTION,
801 .procname = "percpu_pagelist_fraction",
802 .data = &percpu_pagelist_fraction,
803 .maxlen = sizeof(percpu_pagelist_fraction),
804 .mode = 0644,
805 .proc_handler = &percpu_pagelist_fraction_sysctl_handler,
806 .strategy = &sysctl_intvec,
807 .extra1 = &min_percpu_pagelist_fract,
808 },
797#ifdef CONFIG_MMU 809#ifdef CONFIG_MMU
798 { 810 {
799 .ctl_name = VM_MAX_MAP_COUNT, 811 .ctl_name = VM_MAX_MAP_COUNT,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5eeeadd9f66a..2c46f697e8ff 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly;
53unsigned long totalram_pages __read_mostly; 53unsigned long totalram_pages __read_mostly;
54unsigned long totalhigh_pages __read_mostly; 54unsigned long totalhigh_pages __read_mostly;
55long nr_swap_pages; 55long nr_swap_pages;
56int percpu_pagelist_fraction;
56 57
57static void fastcall free_hot_cold_page(struct page *page, int cold); 58static void fastcall free_hot_cold_page(struct page *page, int cold);
58 59
@@ -1831,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1831 INIT_LIST_HEAD(&pcp->list); 1832 INIT_LIST_HEAD(&pcp->list);
1832} 1833}
1833 1834
1835/*
1836 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
1837 * to the value high for the pageset p.
1838 */
1839
1840static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1841 unsigned long high)
1842{
1843 struct per_cpu_pages *pcp;
1844
1845 pcp = &p->pcp[0]; /* hot list */
1846 pcp->high = high;
1847 pcp->batch = max(1UL, high/4);
1848 if ((high/4) > (PAGE_SHIFT * 8))
1849 pcp->batch = PAGE_SHIFT * 8;
1850}
1851
1852
1834#ifdef CONFIG_NUMA 1853#ifdef CONFIG_NUMA
1835/* 1854/*
1836 * Boot pageset table. One per cpu which is going to be used for all 1855 * Boot pageset table. One per cpu which is going to be used for all
@@ -1868,6 +1887,10 @@ static int __devinit process_zones(int cpu)
1868 goto bad; 1887 goto bad;
1869 1888
1870 setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1889 setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
1890
1891 if (percpu_pagelist_fraction)
1892 setup_pagelist_highmark(zone_pcp(zone, cpu),
1893 (zone->present_pages / percpu_pagelist_fraction));
1871 } 1894 }
1872 1895
1873 return 0; 1896 return 0;
@@ -2567,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2567 return 0; 2590 return 0;
2568} 2591}
2569 2592
2593/*
2594 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
2595 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
2596 * can have before it gets flushed back to buddy allocator.
2597 */
2598
2599int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
2600 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2601{
2602 struct zone *zone;
2603 unsigned int cpu;
2604 int ret;
2605
2606 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2607 if (!write || (ret == -EINVAL))
2608 return ret;
2609 for_each_zone(zone) {
2610 for_each_online_cpu(cpu) {
2611 unsigned long high;
2612 high = zone->present_pages / percpu_pagelist_fraction;
2613 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
2614 }
2615 }
2616 return 0;
2617}
2618
2570__initdata int hashdist = HASHDIST_DEFAULT; 2619__initdata int hashdist = HASHDIST_DEFAULT;
2571 2620
2572#ifdef CONFIG_NUMA 2621#ifdef CONFIG_NUMA