author		Lee Schermerhorn <lee.schermerhorn@hp.com>	2009-12-14 20:58:21 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-15 11:53:12 -0500
commit		06808b0827e1cd14eedc96bac2655d5b37ac246c (patch)
tree		8f7b52a4af1532ed414631f68b99a059e299d83f
parent		c1e6c8d074ea3621106548654cc244d2edc12ead (diff)
hugetlb: derive huge pages nodes allowed from task mempolicy
This patch derives a "nodes_allowed" node mask from the numa mempolicy of
the task modifying the number of persistent huge pages to control the
allocation, freeing and adjusting of surplus huge pages when the pool page
count is modified via the new sysctl or sysfs attribute
"nr_hugepages_mempolicy".  The nodes_allowed mask is derived as follows:

* For "default" [NULL] task mempolicy, a NULL nodemask_t pointer is
  produced.  This will cause the hugetlb subsystem to use node_online_map
  as the "nodes_allowed".  This preserves the behavior before this patch.

* For "preferred" mempolicy, including explicit local allocation, a
  nodemask with the single preferred node will be produced.  "local"
  policy will NOT track any internode migrations of the task adjusting
  nr_hugepages.

* For "bind" and "interleave" policy, the mempolicy's nodemask will be
  used.

* Other than to inform the construction of the nodes_allowed node mask,
  the actual mempolicy mode is ignored.  That is, all modes behave like
  interleave over the resulting nodes_allowed mask with no "fallback".

See the updated documentation [next patch] for more information about the
implications of this patch.

Examples:

Starting with:

	Node 0 HugePages_Total:  0
	Node 1 HugePages_Total:  0
	Node 2 HugePages_Total:  0
	Node 3 HugePages_Total:  0

Default behavior [with or without this patch] balances persistent hugepage
allocation across nodes [with sufficient contiguous memory]:

	sysctl vm.nr_hugepages[_mempolicy]=32

yields:

	Node 0 HugePages_Total:  8
	Node 1 HugePages_Total:  8
	Node 2 HugePages_Total:  8
	Node 3 HugePages_Total:  8

Of course, we only have nr_hugepages_mempolicy with the patch, but with
default mempolicy, nr_hugepages_mempolicy behaves the same as nr_hugepages.

Applying mempolicy--e.g., with numactl [using '-m' a.k.a. '--membind'
because it allows multiple nodes to be specified and it's easy to
type]--we can allocate huge pages on individual nodes or sets of nodes.
So, starting from the condition above, with 8 huge pages per node, add 8
more to node 2 using:

	numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40

This yields:

	Node 0 HugePages_Total:   8
	Node 1 HugePages_Total:   8
	Node 2 HugePages_Total:  16
	Node 3 HugePages_Total:   8

The incremental 8 huge pages were restricted to node 2 by the specified
mempolicy.

Similarly, we can use mempolicy to free persistent huge pages from
specified nodes:

	numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32

yields:

	Node 0 HugePages_Total:   4
	Node 1 HugePages_Total:   4
	Node 2 HugePages_Total:  16
	Node 3 HugePages_Total:   8

The 8 huge pages freed were balanced over nodes 0 and 1.

[rientjes@google.com: accommodate reworked NODEMASK_ALLOC]
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Andi Kleen <andi@firstfloor.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
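For reference, the per-node totals quoted above can be read back from the
long-standing per-node meminfo files in sysfs. A minimal verification
sketch, not part of the patch, assuming a 4-node NUMA system with the
numactl utility installed:

	# constrain the additional huge pages to node 2, as in the example
	numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40
	# read back the per-node totals shown in the example output
	grep HugePages_Total /sys/devices/system/node/node*/meminfo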
-rw-r--r--	include/linux/hugetlb.h		 6
-rw-r--r--	include/linux/mempolicy.h	 3
-rw-r--r--	kernel/sysctl.c			15
-rw-r--r--	mm/hugetlb.c			97
-rw-r--r--	mm/mempolicy.c			47
5 files changed, 153 insertions, 15 deletions
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 41a59afc70fa..78b4bc64c006 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -23,6 +23,12 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+
+#ifdef CONFIG_NUMA
+int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
+			void __user *, size_t *, loff_t *);
+#endif
+
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
			struct page **, struct vm_area_struct **,
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 085c903fe0f1..1cc966cd3e5f 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -201,6 +201,7 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p);
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
				unsigned long addr, gfp_t gfp_flags,
				struct mempolicy **mpol, nodemask_t **nodemask);
+extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
 extern unsigned slab_node(struct mempolicy *policy);
 
 extern enum zone_type policy_zone;
@@ -328,6 +329,8 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
	return node_zonelist(0, gfp_flags);
 }
 
+static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; }
+
 static inline int do_migrate_pages(struct mm_struct *mm,
			const nodemask_t *from_nodes,
			const nodemask_t *to_nodes, int flags)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 554ac4894f0f..60fc93131095 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1051,7 +1051,7 @@ static struct ctl_table vm_table[] = {
		.extra2		= &one_hundred,
	},
 #ifdef CONFIG_HUGETLB_PAGE
-	 {
+	{
		.procname	= "nr_hugepages",
		.data		= NULL,
		.maxlen		= sizeof(unsigned long),
@@ -1059,7 +1059,18 @@ static struct ctl_table vm_table[] = {
		.proc_handler	= hugetlb_sysctl_handler,
		.extra1		= (void *)&hugetlb_zero,
		.extra2		= (void *)&hugetlb_infinity,
	},
+#ifdef CONFIG_NUMA
+	{
+		.procname	= "nr_hugepages_mempolicy",
+		.data		= NULL,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &hugetlb_mempolicy_sysctl_handler,
+		.extra1		= (void *)&hugetlb_zero,
+		.extra2		= (void *)&hugetlb_infinity,
+	},
+#endif
	{
		.procname	= "hugetlb_shm_group",
		.data		= &sysctl_hugetlb_shm_group,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 324d1abae876..1125d818ea06 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1330,29 +1330,71 @@ static struct hstate *kobj_to_hstate(struct kobject *kobj)
	return NULL;
 }
 
-static ssize_t nr_hugepages_show(struct kobject *kobj,
+static ssize_t nr_hugepages_show_common(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
 {
	struct hstate *h = kobj_to_hstate(kobj);
	return sprintf(buf, "%lu\n", h->nr_huge_pages);
 }
-static ssize_t nr_hugepages_store(struct kobject *kobj,
-		struct kobj_attribute *attr, const char *buf, size_t count)
+static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
+			struct kobject *kobj, struct kobj_attribute *attr,
+			const char *buf, size_t len)
 {
	int err;
-	unsigned long input;
+	unsigned long count;
	struct hstate *h = kobj_to_hstate(kobj);
+	NODEMASK_ALLOC(nodemask_t, nodes_allowed);
 
-	err = strict_strtoul(buf, 10, &input);
+	err = strict_strtoul(buf, 10, &count);
	if (err)
		return 0;
 
-	h->max_huge_pages = set_max_huge_pages(h, input, &node_online_map);
+	if (!(obey_mempolicy && init_nodemask_of_mempolicy(nodes_allowed))) {
+		NODEMASK_FREE(nodes_allowed);
+		nodes_allowed = &node_online_map;
+	}
+	h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
 
-	return count;
+	if (nodes_allowed != &node_online_map)
+		NODEMASK_FREE(nodes_allowed);
+
+	return len;
+}
+
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t len)
+{
+	return nr_hugepages_store_common(false, kobj, attr, buf, len);
 }
 HSTATE_ATTR(nr_hugepages);
 
+#ifdef CONFIG_NUMA
+
+/*
+ * hstate attribute for optionally mempolicy-based constraint on persistent
+ * huge page alloc/free.
+ */
+static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t len)
+{
+	return nr_hugepages_store_common(true, kobj, attr, buf, len);
+}
+HSTATE_ATTR(nr_hugepages_mempolicy);
+#endif
+
+
 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
 {
@@ -1408,6 +1450,9 @@ static struct attribute *hstate_attrs[] = {
	&free_hugepages_attr.attr,
	&resv_hugepages_attr.attr,
	&surplus_hugepages_attr.attr,
+#ifdef CONFIG_NUMA
+	&nr_hugepages_mempolicy_attr.attr,
+#endif
	NULL,
 };
 
@@ -1574,9 +1619,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 }
 
 #ifdef CONFIG_SYSCTL
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-			void __user *buffer,
-			size_t *length, loff_t *ppos)
+static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
+			struct ctl_table *table, int write,
+			void __user *buffer, size_t *length, loff_t *ppos)
 {
	struct hstate *h = &default_hstate;
	unsigned long tmp;
@@ -1588,13 +1633,39 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
	table->maxlen = sizeof(unsigned long);
	proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
-	if (write)
-		h->max_huge_pages = set_max_huge_pages(h, tmp,
-							&node_online_map);
+	if (write) {
+		NODEMASK_ALLOC(nodemask_t, nodes_allowed);
+		if (!(obey_mempolicy &&
+		      init_nodemask_of_mempolicy(nodes_allowed))) {
+			NODEMASK_FREE(nodes_allowed);
+			nodes_allowed = &node_states[N_HIGH_MEMORY];
+		}
+		h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
+
+		if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+			NODEMASK_FREE(nodes_allowed);
+	}
 
	return 0;
 }
 
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+
+	return hugetlb_sysctl_handler_common(false, table, write,
+						buffer, length, ppos);
+}
+
+#ifdef CONFIG_NUMA
+int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	return hugetlb_sysctl_handler_common(true, table, write,
+						buffer, length, ppos);
+}
+#endif /* CONFIG_NUMA */
+
 int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			void __user *buffer,
			size_t *length, loff_t *ppos)
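The mm/hugetlb.c changes above also wire a per-hstate "nr_hugepages_mempolicy"
attribute into the hugepage sysfs directory. A minimal usage sketch, not part
of the patch, assuming the common 2048 kB huge page size (the hstate directory
name varies with the architecture's supported huge page sizes):

	# write the new attribute under a bind-to-node-1 mempolicy; the
	# additional persistent huge pages should be allocated on node 1
	numactl -m 1 sh -c \
	    'echo 16 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages_mempolicy'
	cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages_mempolicy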
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f89eabbaf3e..f11fdad06204 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1568,6 +1568,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
	}
	return zl;
 }
+
+/*
+ * init_nodemask_of_mempolicy
+ *
+ * If the current task's mempolicy is "default" [NULL], return 'false'
+ * to indicate default policy.  Otherwise, extract the policy nodemask
+ * for 'bind' or 'interleave' policy into the argument nodemask, or
+ * initialize the argument nodemask to contain the single node for
+ * 'preferred' or 'local' policy and return 'true' to indicate presence
+ * of non-default mempolicy.
+ *
+ * We don't bother with reference counting the mempolicy [mpol_get/put]
+ * because the current task is examining its own mempolicy and a task's
+ * mempolicy is only ever changed by the task itself.
+ *
+ * N.B., it is the caller's responsibility to free a returned nodemask.
+ */
+bool init_nodemask_of_mempolicy(nodemask_t *mask)
+{
+	struct mempolicy *mempolicy;
+	int nid;
+
+	if (!(mask && current->mempolicy))
+		return false;
+
+	mempolicy = current->mempolicy;
+	switch (mempolicy->mode) {
+	case MPOL_PREFERRED:
+		if (mempolicy->flags & MPOL_F_LOCAL)
+			nid = numa_node_id();
+		else
+			nid = mempolicy->v.preferred_node;
+		init_nodemask_of_node(mask, nid);
+		break;
+
+	case MPOL_BIND:
+		/* Fall through */
+	case MPOL_INTERLEAVE:
+		*mask = mempolicy->v.nodes;
+		break;
+
+	default:
+		BUG();
+	}
+
+	return true;
+}
 #endif
 
 /* Allocate a page in interleaved policy.