diff options
-rw-r--r-- | Documentation/sysctl/vm.txt | 17 | ||||
-rw-r--r-- | include/linux/mmzone.h | 2 | ||||
-rw-r--r-- | include/linux/sysctl.h | 1 | ||||
-rw-r--r-- | kernel/sysctl.c | 12 | ||||
-rw-r--r-- | mm/page_alloc.c | 49 |
5 files changed, 81 insertions, 0 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 89ba1a42a17d..6910c0136f8d 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
@@ -103,3 +103,20 @@ This is used to force the Linux VM to keep a minimum number | |||
103 | of kilobytes free. The VM uses this number to compute a pages_min | 103 | of kilobytes free. The VM uses this number to compute a pages_min |
104 | value for each lowmem zone in the system. Each lowmem zone gets | 104 | value for each lowmem zone in the system. Each lowmem zone gets |
105 | a number of reserved free pages based proportionally on its size. | 105 | a number of reserved free pages based proportionally on its size. |
106 | |||
107 | ============================================================== | ||
108 | |||
109 | percpu_pagelist_fraction | ||
110 | |||
111 | This is the fraction of pages in each zone that may, at most, be allocated | ||
112 | to each per cpu page list (the high mark, pcp->high). The minimum value is 8, | ||
113 | which means that no more than 1/8th of the pages in each zone can be | ||
114 | allocated to any single per_cpu_pagelist. This entry only changes the value | ||
115 | of the hot per cpu pagelists. For example, writing 100 allocates at most | ||
116 | 1/100th of each zone to each per cpu page list. | ||
117 | |||
118 | The batch value of each per cpu pagelist is also updated as a result. It is | ||
119 | set to pcp->high/4, with a lower limit of 1 and an upper limit of (PAGE_SHIFT * 8). | ||
120 | |||
121 | The initial value is zero. The kernel does not use this value at boot time to | ||
122 | set the high water mark of each per cpu page list. | ||
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c34f4a2c62f8..2a89c132ba9c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -437,6 +437,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, | |||
437 | extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; | 437 | extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; |
438 | int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, | 438 | int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, |
439 | void __user *, size_t *, loff_t *); | 439 | void __user *, size_t *, loff_t *); |
440 | int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *, | ||
441 | void __user *, size_t *, loff_t *); | ||
440 | 442 | ||
441 | #include <linux/topology.h> | 443 | #include <linux/topology.h> |
442 | /* Returns the number of the current Node. */ | 444 | /* Returns the number of the current Node. */ |
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4cd267fe87ec..7f472127b7b5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h | |||
@@ -181,6 +181,7 @@ enum | |||
181 | VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ | 181 | VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ |
182 | VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ | 182 | VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ |
183 | VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ | 183 | VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ |
184 | VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ | ||
184 | }; | 185 | }; |
185 | 186 | ||
186 | 187 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8dcf6fd5b0f9..03b0598f2369 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -69,6 +69,7 @@ extern int printk_ratelimit_jiffies; | |||
69 | extern int printk_ratelimit_burst; | 69 | extern int printk_ratelimit_burst; |
70 | extern int pid_max_min, pid_max_max; | 70 | extern int pid_max_min, pid_max_max; |
71 | extern int sysctl_drop_caches; | 71 | extern int sysctl_drop_caches; |
72 | extern int percpu_pagelist_fraction; | ||
72 | 73 | ||
73 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 74 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
74 | int unknown_nmi_panic; | 75 | int unknown_nmi_panic; |
@@ -79,6 +80,7 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, | |||
79 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | 80 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
80 | static int maxolduid = 65535; | 81 | static int maxolduid = 65535; |
81 | static int minolduid; | 82 | static int minolduid; |
83 | static int min_percpu_pagelist_fract = 8; | ||
82 | 84 | ||
83 | static int ngroups_max = NGROUPS_MAX; | 85 | static int ngroups_max = NGROUPS_MAX; |
84 | 86 | ||
@@ -794,6 +796,16 @@ static ctl_table vm_table[] = { | |||
794 | .strategy = &sysctl_intvec, | 796 | .strategy = &sysctl_intvec, |
795 | .extra1 = &zero, | 797 | .extra1 = &zero, |
796 | }, | 798 | }, |
799 | { | ||
800 | .ctl_name = VM_PERCPU_PAGELIST_FRACTION, | ||
801 | .procname = "percpu_pagelist_fraction", | ||
802 | .data = &percpu_pagelist_fraction, | ||
803 | .maxlen = sizeof(percpu_pagelist_fraction), | ||
804 | .mode = 0644, | ||
805 | .proc_handler = &percpu_pagelist_fraction_sysctl_handler, | ||
806 | .strategy = &sysctl_intvec, | ||
807 | .extra1 = &min_percpu_pagelist_fract, | ||
808 | }, | ||
797 | #ifdef CONFIG_MMU | 809 | #ifdef CONFIG_MMU |
798 | { | 810 | { |
799 | .ctl_name = VM_MAX_MAP_COUNT, | 811 | .ctl_name = VM_MAX_MAP_COUNT, |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5eeeadd9f66a..2c46f697e8ff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly; | |||
53 | unsigned long totalram_pages __read_mostly; | 53 | unsigned long totalram_pages __read_mostly; |
54 | unsigned long totalhigh_pages __read_mostly; | 54 | unsigned long totalhigh_pages __read_mostly; |
55 | long nr_swap_pages; | 55 | long nr_swap_pages; |
56 | int percpu_pagelist_fraction; | ||
56 | 57 | ||
57 | static void fastcall free_hot_cold_page(struct page *page, int cold); | 58 | static void fastcall free_hot_cold_page(struct page *page, int cold); |
58 | 59 | ||
@@ -1831,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
1831 | INIT_LIST_HEAD(&pcp->list); | 1832 | INIT_LIST_HEAD(&pcp->list); |
1832 | } | 1833 | } |
1833 | 1834 | ||
1835 | /* | ||
1836 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist | ||
1837 | * to the value high for the pageset p. | ||
1838 | */ | ||
1839 | |||
1840 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, | ||
1841 | unsigned long high) | ||
1842 | { | ||
1843 | struct per_cpu_pages *pcp; | ||
1844 | |||
1845 | pcp = &p->pcp[0]; /* hot list */ | ||
1846 | pcp->high = high; | ||
1847 | pcp->batch = max(1UL, high/4); | ||
1848 | if ((high/4) > (PAGE_SHIFT * 8)) | ||
1849 | pcp->batch = PAGE_SHIFT * 8; | ||
1850 | } | ||
1851 | |||
1852 | |||
1834 | #ifdef CONFIG_NUMA | 1853 | #ifdef CONFIG_NUMA |
1835 | /* | 1854 | /* |
1836 | * Boot pageset table. One per cpu which is going to be used for all | 1855 | * Boot pageset table. One per cpu which is going to be used for all |
@@ -1868,6 +1887,10 @@ static int __devinit process_zones(int cpu) | |||
1868 | goto bad; | 1887 | goto bad; |
1869 | 1888 | ||
1870 | setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); | 1889 | setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); |
1890 | |||
1891 | if (percpu_pagelist_fraction) | ||
1892 | setup_pagelist_highmark(zone_pcp(zone, cpu), | ||
1893 | (zone->present_pages / percpu_pagelist_fraction)); | ||
1871 | } | 1894 | } |
1872 | 1895 | ||
1873 | return 0; | 1896 | return 0; |
@@ -2567,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | |||
2567 | return 0; | 2590 | return 0; |
2568 | } | 2591 | } |
2569 | 2592 | ||
2593 | /* | ||
2594 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each | ||
2595 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | ||
2596 | * can have before it gets flushed back to buddy allocator. | ||
2597 | */ | ||
2598 | |||
2599 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | ||
2600 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | ||
2601 | { | ||
2602 | struct zone *zone; | ||
2603 | unsigned int cpu; | ||
2604 | int ret; | ||
2605 | |||
2606 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | ||
2607 | if (!write || (ret == -EINVAL)) | ||
2608 | return ret; | ||
2609 | for_each_zone(zone) { | ||
2610 | for_each_online_cpu(cpu) { | ||
2611 | unsigned long high; | ||
2612 | high = zone->present_pages / percpu_pagelist_fraction; | ||
2613 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | ||
2614 | } | ||
2615 | } | ||
2616 | return 0; | ||
2617 | } | ||
2618 | |||
2570 | __initdata int hashdist = HASHDIST_DEFAULT; | 2619 | __initdata int hashdist = HASHDIST_DEFAULT; |
2571 | 2620 | ||
2572 | #ifdef CONFIG_NUMA | 2621 | #ifdef CONFIG_NUMA |