 Documentation/cpusets.txt |  12
 include/linux/cpuset.h    |   5
 kernel/cpuset.c           |  80
 mm/page_alloc.c           |  16
 mm/vmscan.c               |   8
 5 files changed, 101 insertions(+), 20 deletions(-)
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index ad944c060312..47f4114fbf54 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -60,6 +60,18 @@ all of the cpus in the system. This removes any overhead due to
 load balancing code trying to pull tasks outside of the cpu exclusive
 cpuset only to be prevented by the tasks' cpus_allowed mask.
 
+A cpuset that is mem_exclusive restricts kernel allocations for
+page, buffer and other data commonly shared by the kernel across
+multiple users.  All cpusets, whether mem_exclusive or not, restrict
+allocations of memory for user space.  This enables configuring a
+system so that several independent jobs can share common kernel
+data, such as file system pages, while isolating each job's user
+allocation in its own cpuset.  To do this, construct a large
+mem_exclusive cpuset to hold all the jobs, and construct child,
+non-mem_exclusive cpusets for each individual job.  Only a small
+amount of typical kernel memory, such as requests from interrupt
+handlers, is allowed to be taken outside even a mem_exclusive cpuset.
+
 User level code may create and destroy cpusets by name in the cpuset
 virtual file system, manage the attributes and permissions of these
 cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
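
The layout described in the new documentation text can be set up from user
space through the cpuset virtual file system.  Below is a hedged sketch of
that setup in C; the mount point /dev/cpuset, the cpuset names ("jobs",
"job1") and the CPU/node ranges are assumptions for illustration, not part
of this patch:

    /*
     * Sketch only: builds a large mem_exclusive parent cpuset and one
     * non-mem_exclusive child, using the cpuset virtual file system.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <unistd.h>

    static int write_str(const char *path, const char *val)
    {
            int fd = open(path, O_WRONLY);

            if (fd < 0)
                    return -1;
            if (write(fd, val, strlen(val)) < 0) {
                    close(fd);
                    return -1;
            }
            return close(fd);
    }

    int main(void)
    {
            char pid[16];

            /* Large mem_exclusive parent cpuset that holds all the jobs. */
            mkdir("/dev/cpuset/jobs", 0755);
            write_str("/dev/cpuset/jobs/cpus", "0-3");
            write_str("/dev/cpuset/jobs/mems", "0-3");
            write_str("/dev/cpuset/jobs/mem_exclusive", "1");

            /* Non-mem_exclusive child cpuset for one individual job. */
            mkdir("/dev/cpuset/jobs/job1", 0755);
            write_str("/dev/cpuset/jobs/job1/cpus", "0-1");
            write_str("/dev/cpuset/jobs/job1/mems", "0-1");

            /* Attach the current process (e.g. the job launcher) to the child. */
            snprintf(pid, sizeof(pid), "%d", (int)getpid());
            write_str("/dev/cpuset/jobs/job1/tasks", pid);

            return 0;
    }

With this layout, the job's GFP_USER allocations stay hardwalled inside
"job1", while its GFP_KERNEL allocations may spill to any node of the
enclosing mem_exclusive "jobs" cpuset.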
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 3438233305a3..1fe1c3ebad30 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -23,7 +23,7 @@ void cpuset_init_current_mems_allowed(void);
 void cpuset_update_current_mems_allowed(void);
 void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
-int cpuset_zone_allowed(struct zone *z);
+extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask);
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
@@ -48,7 +48,8 @@ static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 1;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z)
+static inline int cpuset_zone_allowed(struct zone *z,
+		unsigned int __nocast gfp_mask)
 {
 	return 1;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..214806deca99 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1611,17 +1611,81 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 0;
 }
 
+/*
+ * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
+ * ancestor to the specified cpuset.  Call while holding cpuset_sem.
+ * If no ancestor is mem_exclusive (an unusual configuration), then
+ * returns the root cpuset.
+ */
+static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+{
+	while (!is_mem_exclusive(cs) && cs->parent)
+		cs = cs->parent;
+	return cs;
+}
+
 /**
- * cpuset_zone_allowed - is zone z allowed in current->mems_allowed
- * @z: zone in question
+ * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
  *
- * Is zone z allowed in current->mems_allowed, or is
- * the CPU in interrupt context? (zone is always allowed in this case)
- */
-int cpuset_zone_allowed(struct zone *z)
+ * If we're in interrupt, yes, we can always allocate.  If zone
+ * z's node is in our task's mems_allowed, yes.  If it's not a
+ * __GFP_HARDWALL request and this zone's node is in the nearest
+ * mem_exclusive cpuset ancestor of this task's cpuset, yes.
+ * Otherwise, no.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current task's cpuset.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest mem_exclusive ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires cpuset_sem.  The __alloc_pages()
+ * routine only calls here with the __GFP_HARDWALL bit _not_ set if
+ * it's a GFP_KERNEL allocation, and all nodes in the current task's
+ * mems_allowed came up empty on the first pass over the zonelist.
+ * So only GFP_KERNEL allocations, if all nodes in the cpuset are
+ * short of memory, might require taking the cpuset_sem semaphore.
+ *
+ * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
+ * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
+ * hardwall cpusets - no allocation on a node outside the cpuset is
+ * allowed (unless in interrupt, of course).
+ *
+ * The second loop doesn't even call here for GFP_ATOMIC requests
+ * (if the __alloc_pages() local variable 'wait' is set).  That check
+ * and the checks below have the combined effect in the second loop of
+ * the __alloc_pages() routine that:
+ *	in_interrupt - any node ok (current task context irrelevant)
+ *	GFP_ATOMIC   - any node ok
+ *	GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
+ *	GFP_USER     - only nodes in the current task's mems_allowed ok.
+ **/
+
+int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
 {
-	return in_interrupt() ||
-		node_isset(z->zone_pgdat->node_id, current->mems_allowed);
+	int node;			/* node that zone z is on */
+	const struct cpuset *cs;	/* current cpuset ancestors */
+	int allowed = 1;		/* is allocation in zone z allowed? */
+
+	if (in_interrupt())
+		return 1;
+	node = z->zone_pgdat->node_id;
+	if (node_isset(node, current->mems_allowed))
+		return 1;
+	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
+		return 0;
+
+	/* Not hardwall and node outside mems_allowed: scan up cpusets */
+	down(&cpuset_sem);
+	cs = current->cpuset;
+	if (!cs)
+		goto done;		/* current task exiting */
+	cs = nearest_exclusive_ancestor(cs);
+	allowed = node_isset(node, cs->mems_allowed);
+done:
+	up(&cpuset_sem);
+	return allowed;
 }
 
 /*
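
The comment block added above ends with a small policy table (in_interrupt /
GFP_ATOMIC / GFP_KERNEL / GFP_USER).  The same decision order can be restated
as a stand-alone sketch for readability; the toy_* names and the plain bitmask
standing in for mems_allowed are assumptions of this sketch, not the kernel's
real types:

    /*
     * Illustrative model of the decision order in cpuset_zone_allowed().
     */
    struct toy_cpuset {
            struct toy_cpuset *parent;
            int mem_exclusive;
            unsigned long mems_allowed;     /* bit n set => node n allowed */
    };

    static int toy_zone_allowed(int in_irq, int node,
                                unsigned long task_mems_allowed,
                                int hardwall,   /* __GFP_HARDWALL-style bit */
                                const struct toy_cpuset *cs)
    {
            if (in_irq)
                    return 1;       /* interrupt: any node is ok */
            if (task_mems_allowed & (1UL << node))
                    return 1;       /* node is in the task's own cpuset */
            if (hardwall)
                    return 0;       /* GFP_USER-style request: stop here */

            /* GFP_KERNEL-style: escape to the nearest mem_exclusive ancestor */
            while (cs && !cs->mem_exclusive && cs->parent)
                    cs = cs->parent;
            return cs && (cs->mems_allowed & (1UL << node));
    }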
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 14d7032c1d12..3974fd81d27c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -806,11 +806,14 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
 	classzone_idx = zone_idx(zones[0]);
 
 restart:
-	/* Go through the zonelist once, looking for a zone with enough free */
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 			continue;
 
 		/*
@@ -845,6 +848,7 @@ zone_reclaim_retry:
 	 *
 	 * This is the last chance, in general, before the goto nopage.
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,
@@ -852,7 +856,7 @@ zone_reclaim_retry:
 		       gfp_mask & __GFP_HIGH))
 			continue;
 
-		if (wait && !cpuset_zone_allowed(z))
+		if (wait && !cpuset_zone_allowed(z, gfp_mask))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -867,7 +871,7 @@ zone_reclaim_retry:
 	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 		/* go through the zonelist yet again, ignoring mins */
 		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!cpuset_zone_allowed(z))
+			if (!cpuset_zone_allowed(z, gfp_mask))
 				continue;
 			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page)
@@ -903,7 +907,7 @@ rebalance:
 				gfp_mask & __GFP_HIGH))
 			continue;
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, gfp_mask))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -922,7 +926,7 @@ rebalance:
 				classzone_idx, 0, 0))
 			continue;
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
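
Taken together, the call sites above give __alloc_pages() a two-sided policy:
the first pass over the zonelist is always hardwalled, while later passes pass
the caller's gfp_mask so that GFP_KERNEL can spill to the nearest
mem_exclusive ancestor and GFP_ATOMIC (!wait) skips the cpuset check entirely.
A much-simplified sketch of that scanning pattern follows; the toy_* names and
flags are stand-ins, and the real allocator has additional passes, watermark
checks and reclaim:

    #include <stddef.h>

    #define TOY_GFP_HARDWALL 0x1u
    #define TOY_GFP_WAIT     0x2u

    struct toy_zone { int node; int free_pages; };

    /* Stand-in for cpuset_zone_allowed(): nodes 0-1 are the task's cpuset,
     * nodes 0-3 belong to the enclosing mem_exclusive ancestor. */
    static int toy_zone_allowed(const struct toy_zone *z, unsigned int gfp_mask)
    {
            if (gfp_mask & TOY_GFP_HARDWALL)
                    return z->node <= 1;
            return z->node <= 3;
    }

    /* Stand-in for buffered_rmqueue(): "allocate" if the zone has pages. */
    static struct toy_zone *toy_try_alloc(struct toy_zone *z)
    {
            return z->free_pages > 0 ? z : NULL;
    }

    struct toy_zone *toy_alloc(struct toy_zone **zones, unsigned int gfp_mask)
    {
            struct toy_zone *z, *page;
            int i;

            /* Pass 1: hardwall - only nodes in the current task's cpuset. */
            for (i = 0; (z = zones[i]) != NULL; i++) {
                    if (!toy_zone_allowed(z, TOY_GFP_HARDWALL))
                            continue;
                    if ((page = toy_try_alloc(z)) != NULL)
                            return page;
            }

            /* Pass 2: softwall - atomic (!wait) requests skip the cpuset
             * check; sleeping requests may spill to the mem_exclusive
             * ancestor's nodes. */
            for (i = 0; (z = zones[i]) != NULL; i++) {
                    if ((gfp_mask & TOY_GFP_WAIT) &&
                        !toy_zone_allowed(z, gfp_mask))
                            continue;
                    if ((page = toy_try_alloc(z)) != NULL)
                            return page;
            }
            return NULL;
    }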
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0095533cdde9..a740778f688d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -894,7 +894,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 		if (zone->present_pages == 0)
 			continue;
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = sc->priority;
@@ -940,7 +940,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = DEF_PRIORITY;
@@ -986,7 +986,7 @@ out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->prev_priority = zone->temp_priority;
@@ -1256,7 +1256,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	if (!cpuset_zone_allowed(zone))
+	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 		return;
 	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
 		return;