From 6b314d0e11924c803bf8cd944e87fd58cdb5088c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 2 Dec 2009 18:58:05 +0100
Subject: sched: Remove sysctl.sched_features

Since we've had a much saner debugfs interface to this, remove the
sysctl one.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
[ v2: build fix ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sysctl.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel/sysctl.c')
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dbf93a52ee9..e5cc53514caa 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -314,14 +314,6 @@ static struct ctl_table kern_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_features",
-		.data		= &sysctl_sched_features,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_migration_cost",
-- 
cgit v1.2.2


From 1983a922a1bc843806b9a36cf3a370b242783140 Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date: Mon, 30 Nov 2009 12:16:47 +0100
Subject: sched: Make tunable scaling style configurable

As scaling now takes place on all kind of cpu add/remove events a user
that configures values via proc should be able to configure if his set
values are still rescaled or kept whatever happens.

As the comments state that log2 was just a second guess that worked the
interface is not just designed for on/off, but to choose a scaling type.
Currently this allows none, log and linear, but more important it allwos
us to keep the interface even if someone has an even better idea how to
scale the values.

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1259579808-11357-3-git-send-email-ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sysctl.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel/sysctl.c')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e5cc53514caa..d10406e5fdfe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -251,6 +251,8 @@ static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
 #endif
 
 static struct ctl_table kern_table[] = {
@@ -304,6 +306,18 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_tunable_scaling",
+		.data		= &sysctl_sched_tunable_scaling,
+		.maxlen		= sizeof(enum sched_tunable_scaling),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_sched_tunable_scaling,
+		.extra2		= &max_sched_tunable_scaling,
+	},
+
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_shares_thresh",
-- 
cgit v1.2.2


From acb4a848da821a095ae9e4d8b22ae2d9633ba5cd Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date: Mon, 30 Nov 2009 12:16:48 +0100
Subject: sched: Update normalized values on user updates via proc

The normalized values are also recalculated in case the scaling factor
changes.

This patch updates the internally used scheduler tuning values that are
normalized to one cpu in case a user sets new values via sysfs.

Together with patch 2 of this series this allows to let user configured
values scale (or not) to cpu add/remove events taking place later.

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1259579808-11357-4-git-send-email-ehrhardt@linux.vnet.ibm.com>
[ v2: fix warning ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sysctl.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'kernel/sysctl.c')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d10406e5fdfe..b9e5a45f1e28 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -253,6 +253,8 @@ static int min_wakeup_granularity_ns;			/* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+static int min_sched_shares_ratelimit = 100000; /* 100 usec */
+static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
 #endif
 
 static struct ctl_table kern_table[] = {
@@ -271,7 +273,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_min_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &sched_nr_latency_handler,
+		.proc_handler	= &sched_proc_update_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
@@ -282,7 +284,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_latency,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &sched_nr_latency_handler,
+		.proc_handler	= &sched_proc_update_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
@@ -293,7 +295,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_wakeup_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &sched_proc_update_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
@@ -304,7 +306,9 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_shares_ratelimit,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &sched_proc_update_handler,
+		.extra1		= &min_sched_shares_ratelimit,
+		.extra2		= &max_sched_shares_ratelimit,
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -312,7 +316,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_tunable_scaling,
 		.maxlen		= sizeof(enum sched_tunable_scaling),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &sched_proc_update_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &min_sched_tunable_scaling,
 		.extra2		= &max_sched_tunable_scaling,
-- 
cgit v1.2.2


From 06808b0827e1cd14eedc96bac2655d5b37ac246c Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Mon, 14 Dec 2009 17:58:21 -0800
Subject: hugetlb: derive huge pages nodes allowed from task mempolicy

This patch derives a "nodes_allowed" node mask from the numa mempolicy of
the task modifying the number of persistent huge pages to control the
allocation, freeing and adjusting of surplus huge pages when the pool page
count is modified via the new sysctl or sysfs attribute
"nr_hugepages_mempolicy".  The nodes_allowed mask is derived as follows:

* For "default" [NULL] task mempolicy, a NULL nodemask_t pointer
  is produced.  This will cause the hugetlb subsystem to use
  node_online_map as the "nodes_allowed".  This preserves the
  behavior before this patch.
* For "preferred" mempolicy, including explicit local allocation,
  a nodemask with the single preferred node will be produced.
  "local" policy will NOT track any internode migrations of the
  task adjusting nr_hugepages.
* For "bind" and "interleave" policy, the mempolicy's nodemask
  will be used.
* Other than to inform the construction of the nodes_allowed node
  mask, the actual mempolicy mode is ignored.  That is, all modes
  behave like interleave over the resulting nodes_allowed mask
  with no "fallback".

See the updated documentation [next patch] for more information
about the implications of this patch.

Examples:

Starting with:

	Node 0 HugePages_Total:     0
	Node 1 HugePages_Total:     0
	Node 2 HugePages_Total:     0
	Node 3 HugePages_Total:     0

Default behavior [with or without this patch] balances persistent
hugepage allocation across nodes [with sufficient contiguous memory]:

	sysctl vm.nr_hugepages[_mempolicy]=32

yields:

	Node 0 HugePages_Total:     8
	Node 1 HugePages_Total:     8
	Node 2 HugePages_Total:     8
	Node 3 HugePages_Total:     8

Of course, we only have nr_hugepages_mempolicy with the patch,
but with default mempolicy, nr_hugepages_mempolicy behaves the
same as nr_hugepages.

Applying mempolicy--e.g., with numactl [using '-m' a.k.a.
'--membind' because it allows multiple nodes to be specified
and it's easy to type]--we can allocate huge pages on
individual nodes or sets of nodes.  So, starting from the
condition above, with 8 huge pages per node, add 8 more to
node 2 using:

	numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40

This yields:

	Node 0 HugePages_Total:     8
	Node 1 HugePages_Total:     8
	Node 2 HugePages_Total:    16
	Node 3 HugePages_Total:     8

The incremental 8 huge pages were restricted to node 2 by the
specified mempolicy.

Similarly, we can use mempolicy to free persistent huge pages
from specified nodes:

	numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32

yields:

	Node 0 HugePages_Total:     4
	Node 1 HugePages_Total:     4
	Node 2 HugePages_Total:    16
	Node 3 HugePages_Total:     8

The 8 huge pages freed were balanced over nodes 0 and 1.

[rientjes@google.com: accomodate reworked NODEMASK_ALLOC]
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Andi Kleen <andi@firstfloor.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel/sysctl.c')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 554ac4894f0f..60fc93131095 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1051,7 +1051,7 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one_hundred,
 	},
 #ifdef CONFIG_HUGETLB_PAGE
-	 {
+	{
 		.procname	= "nr_hugepages",
 		.data		= NULL,
 		.maxlen		= sizeof(unsigned long),
@@ -1059,7 +1059,18 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= hugetlb_sysctl_handler,
 		.extra1		= (void *)&hugetlb_zero,
 		.extra2		= (void *)&hugetlb_infinity,
-	 },
+	},
+#ifdef CONFIG_NUMA
+	{
+		.procname       = "nr_hugepages_mempolicy",
+		.data           = NULL,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = &hugetlb_mempolicy_sysctl_handler,
+		.extra1		= (void *)&hugetlb_zero,
+		.extra2		= (void *)&hugetlb_infinity,
+	},
+#endif
 	 {
 		.procname	= "hugetlb_shm_group",
 		.data		= &sysctl_hugetlb_shm_group,
-- 
cgit v1.2.2


From 70da2340fbc68e91e701762f785479ab495a0869 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Mon, 14 Dec 2009 17:59:52 -0800
Subject: 'sysctl_max_map_count' should be non-negative

Jan Engelhardt reported we have this problem:

setting max_map_count to a value large enough results in programs dying at
first try.  This is on 2.6.31.6:

15:59 borg:/proc/sys/vm # echo $[1<<31-1] >max_map_count
15:59 borg:/proc/sys/vm # cat max_map_count
1073741824
15:59 borg:/proc/sys/vm # echo $[1<<31] >max_map_count
15:59 borg:/proc/sys/vm # cat max_map_count
Killed

This is because we have a chance to make 'max_map_count' negative.  but
it's meaningless.  Make it only accept non-negative values.

Reported-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: James Morris <jmorris@namei.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/sysctl.c')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 60fc93131095..45e4bef0012a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1131,7 +1131,8 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_max_map_count,
 		.maxlen		= sizeof(sysctl_max_map_count),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
 	},
 #else
 	{
-- 
cgit v1.2.2


From 6e1415467614e854fee660ff6648bd10fa976e95 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 15 Dec 2009 19:27:45 +0000
Subject: NOMMU: Optimise away the {dac_,}mmap_min_addr tests

In NOMMU mode clamp dac_mmap_min_addr to zero to cause the tests on it to be
skipped by the compiler.  We do this as the minimum mmap address doesn't make
any sense in NOMMU mode.

mmap_min_addr and round_hint_to_min() can be discarded entirely in NOMMU mode.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/sysctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sysctl.c')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 45e4bef0012a..856a24eadf7e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1214,6 +1214,7 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= proc_dointvec_jiffies,
 	},
 #endif
+#ifdef CONFIG_MMU
 	{
 		.procname	= "mmap_min_addr",
 		.data		= &dac_mmap_min_addr,
@@ -1221,6 +1222,7 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= mmap_min_addr_handler,
 	},
+#endif
 #ifdef CONFIG_NUMA
 	{
 		.procname	= "numa_zonelist_order",
-- 
cgit v1.2.2


From 3e26120cc7c819c97bc07281ca1fb9017cfe9a39 Mon Sep 17 00:00:00 2001
From: WANG Cong <amwang@redhat.com>
Date: Thu, 17 Dec 2009 15:27:05 -0800
Subject: kernel/sysctl.c: fix the incomplete part of
 sysctl_max_map_count-should-be-non-negative.patch

It is a mistake that we used 'proc_dointvec', it should be
'proc_dointvec_minmax', as in the original patch.

Signed-off-by: WANG Cong <amwang@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sysctl.c')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 45e4bef0012a..6665761c006d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1131,7 +1131,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_max_map_count,
 		.maxlen		= sizeof(sysctl_max_map_count),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
 #else
-- 
cgit v1.2.2