author	Martin Bligh <mbligh@mbligh.org>	2006-10-28 13:38:24 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-10-28 14:30:50 -0400
commit	3bb1a852ab6c9cdf211a2f4a2f502340c8c38eca (patch)
tree	d08aa652e8eb40c47d5bc37fa1a240b4fb7db029 /mm
parent	2ae88149a27cadf2840e0ab8155bef13be285c03 (diff)
[PATCH] vmscan: Fix temp_priority race
The temp_priority field in zone is racy, as we can walk through a reclaim
path, and just before we copy it into prev_priority, it can be overwritten
(say with DEF_PRIORITY) by another reclaimer.

The same bug is contained in both try_to_free_pages and balance_pgdat, but
it is fixed slightly differently.  In balance_pgdat, we keep a separate
priority record per zone in a local array.  In try_to_free_pages there is
no need to do this, as the priority level is the same for all zones that we
reclaim from.

Impact of this bug is that temp_priority is copied into prev_priority, and
setting this artificially high causes reclaimers to set distress
artificially low.  They then fail to reclaim mapped pages, when they are,
in fact, under severe memory pressure (their priority may be as low as 0).
This causes the OOM killer to fire incorrectly.

From: Andrew Morton <akpm@osdl.org>

__zone_reclaim() isn't modifying zone->prev_priority.  But
zone->prev_priority is used in the decision whether or not to bring mapped
pages onto the inactive list.  Hence there's a risk here that
__zone_reclaim() will fail because zone->prev_priority is large (ie: low
urgency) and lots of mapped pages end up stuck on the active list.

Fix that up by decreasing (ie making more urgent) zone->prev_priority as
__zone_reclaim() scans the zone's pages.

This bug perhaps explains why ZONE_RECLAIM_PRIORITY was created.  It
should be possible to remove that now, and to just start out at
DEF_PRIORITY?

Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
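[Editor's note: to make the race concrete, here is a minimal user-space
sketch.  It is illustrative only, not kernel code; the struct, the
DEF_PRIORITY value, and the explicit two-reclaimer interleaving are
assumptions standing in for the real reclaim paths.]

/*
 * Illustrative sketch only -- not kernel code.  The race is written out
 * sequentially to show one losing schedule: reclaimer A scans all the way
 * down to priority 0 (severe pressure), reclaimer B then starts a fresh
 * scan and resets temp_priority to DEF_PRIORITY, and finally A copies the
 * clobbered value into prev_priority.  Later scans read the artificially
 * relaxed prev_priority, compute a low "distress", and skip reclaiming
 * mapped pages despite real memory pressure.
 */
#include <stdio.h>

#define DEF_PRIORITY 12	/* default scan priority, as in the kernel */

struct zone {
	int temp_priority;
	int prev_priority;
};

int main(void)
{
	struct zone z = { DEF_PRIORITY, DEF_PRIORITY };

	z.temp_priority = 0;			/* A: scanned down to priority 0 */
	z.temp_priority = DEF_PRIORITY;		/* B: begins a new scan -- the race */
	z.prev_priority = z.temp_priority;	/* A: copies the clobbered value */

	/* Prints 12 (DEF_PRIORITY), not the 0 that reclaimer A reached. */
	printf("prev_priority = %d\n", z.prev_priority);
	return 0;
}

The patch below closes this window by removing the shared temp_priority
scratch field entirely: try_to_free_pages writes its final priority
directly, balance_pgdat keeps a private temp_priority[] array on its own
stack, and note_zone_scanning_priority() only ever ratchets prev_priority
toward more urgency.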
Diffstat (limited to 'mm')
-rw-r--r--	mm/page_alloc.c	 2
-rw-r--r--	mm/vmscan.c	55
-rw-r--r--	mm/vmstat.c	 2
3 files changed, 42 insertions, 17 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f5fc45472d5c..ecf853b5e30e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2407,7 +2407,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 	zone->zone_pgdat = pgdat;
 	zone->free_pages = 0;
 
-	zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+	zone->prev_priority = DEF_PRIORITY;
 
 	zone_pcp_init(zone);
 	INIT_LIST_HEAD(&zone->active_list);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f05527bf792b..b32560ead5c0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -723,6 +723,20 @@ done:
 	return nr_reclaimed;
 }
 
+/*
+ * We are about to scan this zone at a certain priority level.  If that priority
+ * level is smaller (ie: more urgent) than the previous priority, then note
+ * that priority level within the zone.  This is done so that when the next
+ * process comes in to scan this zone, it will immediately start out at this
+ * priority level rather than having to build up its own scanning priority.
+ * Here, this priority affects only the reclaim-mapped threshold.
+ */
+static inline void note_zone_scanning_priority(struct zone *zone, int priority)
+{
+	if (priority < zone->prev_priority)
+		zone->prev_priority = priority;
+}
+
 static inline int zone_is_near_oom(struct zone *zone)
 {
 	return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
@@ -972,9 +986,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
-		zone->temp_priority = priority;
-		if (zone->prev_priority > priority)
-			zone->prev_priority = priority;
+		note_zone_scanning_priority(zone, priority);
 
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
@@ -1024,7 +1036,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
-		zone->temp_priority = DEF_PRIORITY;
 		lru_pages += zone->nr_active + zone->nr_inactive;
 	}
 
@@ -1065,13 +1076,22 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	if (!sc.all_unreclaimable)
 		ret = 1;
 out:
+	/*
+	 * Now that we've scanned all the zones at this priority level, note
+	 * that level within the zone so that the next thread which performs
+	 * scanning of this zone will immediately start out at this priority
+	 * level.  This affects only the decision whether or not to bring
+	 * mapped pages onto the inactive list.
+	 */
+	if (priority < 0)
+		priority = 0;
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
 
 		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
-		zone->prev_priority = zone->temp_priority;
+		zone->prev_priority = priority;
 	}
 	return ret;
 }
@@ -1111,6 +1131,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
 	};
+	/*
+	 * temp_priority is used to remember the scanning priority at which
+	 * this zone was successfully refilled to free_pages == pages_high.
+	 */
+	int temp_priority[MAX_NR_ZONES];
 
 loop_again:
 	total_scanned = 0;
@@ -1118,11 +1143,8 @@ loop_again:
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
-	for (i = 0; i < pgdat->nr_zones; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		zone->temp_priority = DEF_PRIORITY;
-	}
+	for (i = 0; i < pgdat->nr_zones; i++)
+		temp_priority[i] = DEF_PRIORITY;
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
@@ -1183,10 +1205,9 @@ scan:
 			if (!zone_watermark_ok(zone, order, zone->pages_high,
 					       end_zone, 0))
 				all_zones_ok = 0;
-			zone->temp_priority = priority;
-			if (zone->prev_priority > priority)
-				zone->prev_priority = priority;
+			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
+			note_zone_scanning_priority(zone, priority);
 			nr_reclaimed += shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -1226,10 +1247,15 @@ scan:
 			break;
 	}
 out:
+	/*
+	 * Note within each zone the priority level at which this zone was
+	 * brought into a happy state.  So that the next thread which scans this
+	 * zone will start out at that priority level.
+	 */
 	for (i = 0; i < pgdat->nr_zones; i++) {
 		struct zone *zone = pgdat->node_zones + i;
 
-		zone->prev_priority = zone->temp_priority;
+		zone->prev_priority = temp_priority[i];
 	}
 	if (!all_zones_ok) {
 		cond_resched();
@@ -1614,6 +1640,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 */
 	priority = ZONE_RECLAIM_PRIORITY;
 	do {
+		note_zone_scanning_priority(zone, priority);
 		nr_reclaimed += shrink_zone(priority, zone, &sc);
 		priority--;
 	} while (priority >= 0 && nr_reclaimed < nr_pages);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 45b124e012f5..8614e8f6743b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -587,11 +587,9 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 	seq_printf(m,
 		   "\n  all_unreclaimable: %u"
 		   "\n  prev_priority:     %i"
-		   "\n  temp_priority:     %i"
 		   "\n  start_pfn:         %lu",
 		   zone->all_unreclaimable,
 		   zone->prev_priority,
-		   zone->temp_priority,
 		   zone->zone_start_pfn);
 	spin_unlock_irqrestore(&zone->lock, flags);
 	seq_putc(m, '\n');