-rw-r--r--  Documentation/cpusets.txt | 12
-rw-r--r--  include/linux/cpuset.h    |  5
-rw-r--r--  kernel/cpuset.c           | 80
-rw-r--r--  mm/page_alloc.c           | 16
-rw-r--r--  mm/vmscan.c               |  8
5 files changed, 101 insertions, 20 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index ad944c060312..47f4114fbf54 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -60,6 +60,18 @@ all of the cpus in the system. This removes any overhead due to
 load balancing code trying to pull tasks outside of the cpu exclusive
 cpuset only to be prevented by the tasks' cpus_allowed mask.
 
+A cpuset that is mem_exclusive restricts kernel allocations for
+page, buffer and other data commonly shared by the kernel across
+multiple users.  All cpusets, whether mem_exclusive or not, restrict
+allocations of memory for user space.  This enables configuring a
+system so that several independent jobs can share common kernel
+data, such as file system pages, while isolating each job's user
+allocation in its own cpuset.  To do this, construct a large
+mem_exclusive cpuset to hold all the jobs, and construct child,
+non-mem_exclusive cpusets for each individual job.  Only a small
+amount of typical kernel memory, such as requests from interrupt
+handlers, is allowed to be taken outside even a mem_exclusive cpuset.
+
 User level code may create and destroy cpusets by name in the cpuset
 virtual file system, manage the attributes and permissions of these
 cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
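
For illustration only (not part of this patch): a minimal userspace sketch, in C, of the setup the new documentation paragraph describes, assuming the cpuset filesystem is already mounted at /dev/cpuset. The /dev/cpuset/jobs and job1 names and the cpu/node ranges are made-up placeholders, not anything the patch prescribes.

/*
 * Hypothetical example: build one large mem_exclusive cpuset ("jobs")
 * and a non-mem_exclusive child per job ("job1"), then attach the
 * current task to the child before exec'ing the job.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror(path);
	close(fd);
	return 0;
}

int main(void)
{
	char pid[32];

	/* Parent cpuset: kernel allocations for all jobs are walled in here. */
	mkdir("/dev/cpuset/jobs", 0755);
	write_str("/dev/cpuset/jobs/cpus", "0-3");
	write_str("/dev/cpuset/jobs/mems", "0-1");
	write_str("/dev/cpuset/jobs/mem_exclusive", "1");

	/* Child cpuset: one job's user-space memory is confined to node 0. */
	mkdir("/dev/cpuset/jobs/job1", 0755);
	write_str("/dev/cpuset/jobs/job1/cpus", "0-1");
	write_str("/dev/cpuset/jobs/job1/mems", "0");

	/* Attach ourselves to the job cpuset. */
	snprintf(pid, sizeof(pid), "%d", (int)getpid());
	return write_str("/dev/cpuset/jobs/job1/tasks", pid) ? 1 : 0;
}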
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 3438233305a3..1fe1c3ebad30 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -23,7 +23,7 @@ void cpuset_init_current_mems_allowed(void);
 void cpuset_update_current_mems_allowed(void);
 void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
-int cpuset_zone_allowed(struct zone *z);
+extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask);
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
@@ -48,7 +48,8 @@ static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 1;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z)
+static inline int cpuset_zone_allowed(struct zone *z,
+					unsigned int __nocast gfp_mask)
 {
 	return 1;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..214806deca99 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1611,17 +1611,81 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 0;
 }
 
+/*
+ * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
+ * ancestor to the specified cpuset.  Call while holding cpuset_sem.
+ * If no ancestor is mem_exclusive (an unusual configuration), then
+ * returns the root cpuset.
+ */
+static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+{
+	while (!is_mem_exclusive(cs) && cs->parent)
+		cs = cs->parent;
+	return cs;
+}
+
 /**
- * cpuset_zone_allowed - is zone z allowed in current->mems_allowed
- * @z: zone in question
+ * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
  *
- * Is zone z allowed in current->mems_allowed, or is
- * the CPU in interrupt context? (zone is always allowed in this case)
- */
-int cpuset_zone_allowed(struct zone *z)
+ * If we're in interrupt, yes, we can always allocate.  If zone
+ * z's node is in our task's mems_allowed, yes.  If it's not a
+ * __GFP_HARDWALL request and this zone's node is in the nearest
+ * mem_exclusive cpuset ancestor to this task's cpuset, yes.
+ * Otherwise, no.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current task's cpuset.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest mem_exclusive ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires cpuset_sem.  The __alloc_pages()
+ * routine only calls here with the __GFP_HARDWALL bit _not_ set if
+ * it's a GFP_KERNEL allocation, and all nodes in the current task's
+ * mems_allowed came up empty on the first pass over the zonelist.
+ * So only GFP_KERNEL allocations, if all nodes in the cpuset are
+ * short of memory, might require taking the cpuset_sem semaphore.
+ *
+ * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
+ * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
+ * hardwall cpusets - no allocation on a node outside the cpuset is
+ * allowed (unless in interrupt, of course).
+ *
+ * The second loop doesn't even call here for GFP_ATOMIC requests
+ * (when the __alloc_pages() local variable 'wait' is not set).  That
+ * check and the checks below have the combined effect in the second
+ * loop of the __alloc_pages() routine that:
+ *	in_interrupt - any node ok (current task context irrelevant)
+ *	GFP_ATOMIC   - any node ok
+ *	GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
+ *	GFP_USER     - only nodes in current task's mems_allowed ok.
+ **/
+
+int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
 {
-	return in_interrupt() ||
-		node_isset(z->zone_pgdat->node_id, current->mems_allowed);
+	int node;			/* node that zone z is on */
+	const struct cpuset *cs;	/* current cpuset ancestors */
+	int allowed = 1;		/* is allocation in zone z allowed? */
+
+	if (in_interrupt())
+		return 1;
+	node = z->zone_pgdat->node_id;
+	if (node_isset(node, current->mems_allowed))
+		return 1;
+	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
+		return 0;
+
+	/* Not hardwall and node outside mems_allowed: scan up cpusets */
+	down(&cpuset_sem);
+	cs = current->cpuset;
+	if (!cs)
+		goto done;		/* current task exiting */
+	cs = nearest_exclusive_ancestor(cs);
+	allowed = node_isset(node, cs->mems_allowed);
+done:
+	up(&cpuset_sem);
+	return allowed;
 }
 
 /*
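
A standalone sketch (not kernel code) of the decision order the new cpuset_zone_allowed() encodes: interrupt context first, then the task's own mems_allowed, then, only for non-__GFP_HARDWALL requests, the mems_allowed of the nearest mem_exclusive ancestor. The struct and function names below are simplified stand-ins invented for the example.

/*
 * Userspace model of the hardwall/softwall check.  A node bitmap is
 * modelled as bits of an unsigned long; MODEL_GFP_HARDWALL stands in
 * for __GFP_HARDWALL.
 */
#include <stdbool.h>
#include <stdio.h>

struct model_cpuset {
	unsigned long mems_allowed;	/* bit n set => node n allowed */
	bool mem_exclusive;
	struct model_cpuset *parent;
};

#define MODEL_GFP_HARDWALL 0x1

static const struct model_cpuset *
nearest_exclusive_ancestor(const struct model_cpuset *cs)
{
	while (!cs->mem_exclusive && cs->parent)
		cs = cs->parent;
	return cs;
}

static int model_zone_allowed(int node, unsigned int gfp_mask,
			      bool in_interrupt,
			      const struct model_cpuset *task_cs)
{
	if (in_interrupt)
		return 1;
	if (task_cs->mems_allowed & (1UL << node))
		return 1;
	if (gfp_mask & MODEL_GFP_HARDWALL)	/* hardwall: stop here */
		return 0;
	/* softwall: fall back to the enclosing mem_exclusive cpuset */
	return !!(nearest_exclusive_ancestor(task_cs)->mems_allowed &
		  (1UL << node));
}

int main(void)
{
	struct model_cpuset root = { 0x3, true, NULL };		/* nodes 0-1 */
	struct model_cpuset job  = { 0x1, false, &root };	/* node 0 only */

	/* "GFP_USER"-style request on node 1: denied (hardwall). */
	printf("%d\n", model_zone_allowed(1, MODEL_GFP_HARDWALL, false, &job));
	/* "GFP_KERNEL"-style request on node 1: allowed via ancestor. */
	printf("%d\n", model_zone_allowed(1, 0, false, &job));
	return 0;
}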
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 14d7032c1d12..3974fd81d27c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -806,11 +806,14 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
 	classzone_idx = zone_idx(zones[0]);
 
 restart:
-	/* Go through the zonelist once, looking for a zone with enough free */
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 			continue;
 
 		/*
@@ -845,6 +848,7 @@ zone_reclaim_retry:
 	 *
 	 * This is the last chance, in general, before the goto nopage.
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,
@@ -852,7 +856,7 @@ zone_reclaim_retry:
 				       gfp_mask & __GFP_HIGH))
 			continue;
 
-		if (wait && !cpuset_zone_allowed(z))
+		if (wait && !cpuset_zone_allowed(z, gfp_mask))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -867,7 +871,7 @@ zone_reclaim_retry:
 	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 		/* go through the zonelist yet again, ignoring mins */
 		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!cpuset_zone_allowed(z))
+			if (!cpuset_zone_allowed(z, gfp_mask))
 				continue;
 			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page)
@@ -903,7 +907,7 @@ rebalance:
 				       gfp_mask & __GFP_HIGH))
 			continue;
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, gfp_mask))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -922,7 +926,7 @@ rebalance:
 					classzone_idx, 0, 0))
 			continue;
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
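
A simplified sketch (not the kernel's __alloc_pages()) of the two-pass pattern the hunks above set up: the first zonelist walk passes __GFP_HARDWALL so every request is first tried strictly inside the task's own cpuset, while the fallback walks pass the caller's gfp_mask, letting GFP_KERNEL-style (!__GFP_HARDWALL) requests spill out to the enclosing mem_exclusive cpuset. The types, the try_zone() helper, and the __GFP_HARDWALL value here are placeholders.

/* Placeholder declarations for the sketch. */
struct zone;
struct page;

extern int cpuset_zone_allowed(struct zone *z, unsigned int gfp_mask);
extern struct page *try_zone(struct zone *z, unsigned int gfp_mask);

#define __GFP_HARDWALL 0x40000u		/* placeholder value */

struct page *alloc_sketch(struct zone **zones, unsigned int gfp_mask)
{
	struct page *page;
	int i;

	/* Pass 1: hardwall -- never leave the task's own cpuset. */
	for (i = 0; zones[i]; i++) {
		if (!cpuset_zone_allowed(zones[i], __GFP_HARDWALL))
			continue;
		page = try_zone(zones[i], gfp_mask);
		if (page)
			return page;
	}

	/*
	 * Pass 2: caller's mask -- GFP_KERNEL may escape to the enclosing
	 * mem_exclusive cpuset; GFP_USER (__GFP_HARDWALL) still may not.
	 */
	for (i = 0; zones[i]; i++) {
		if (!cpuset_zone_allowed(zones[i], gfp_mask))
			continue;
		page = try_zone(zones[i], gfp_mask);
		if (page)
			return page;
	}
	return NULL;	/* the real code would go on to reclaim / nopage */
}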
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0095533cdde9..a740778f688d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -894,7 +894,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 		if (zone->present_pages == 0)
 			continue;
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = sc->priority;
@@ -940,7 +940,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = DEF_PRIORITY;
@@ -986,7 +986,7 @@ out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->prev_priority = zone->temp_priority;
@@ -1256,7 +1256,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	if (!cpuset_zone_allowed(zone))
+	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 		return;
 	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
 		return;