diff options
Diffstat (limited to 'include/linux/mmzone.h')
-rw-r--r-- | include/linux/mmzone.h | 99 |
1 files changed, 88 insertions, 11 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 59855b8718a0..e339a7345f25 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -218,13 +218,9 @@ struct zone { | |||
218 | * under - it drives the swappiness decision: whether to unmap mapped | 218 | * under - it drives the swappiness decision: whether to unmap mapped |
219 | * pages. | 219 | * pages. |
220 | * | 220 | * |
221 | * temp_priority is used to remember the scanning priority at which | 221 | * Access to this field is quite racy even on uniprocessor. But |
222 | * this zone was successfully refilled to free_pages == pages_high. | ||
223 | * | ||
224 | * Access to both these fields is quite racy even on uniprocessor. But | ||
225 | * it is expected to average out OK. | 222 | * it is expected to average out OK. |
226 | */ | 223 | */ |
227 | int temp_priority; | ||
228 | int prev_priority; | 224 | int prev_priority; |
229 | 225 | ||
230 | 226 | ||
@@ -282,7 +278,7 @@ struct zone { | |||
282 | /* | 278 | /* |
283 | * rarely used fields: | 279 | * rarely used fields: |
284 | */ | 280 | */ |
285 | char *name; | 281 | const char *name; |
286 | } ____cacheline_internodealigned_in_smp; | 282 | } ____cacheline_internodealigned_in_smp; |
287 | 283 | ||
288 | /* | 284 | /* |
@@ -292,19 +288,94 @@ struct zone { | |||
292 | */ | 288 | */ |
293 | #define DEF_PRIORITY 12 | 289 | #define DEF_PRIORITY 12 |
294 | 290 | ||
291 | /* Maximum number of zones on a zonelist */ | ||
292 | #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) | ||
293 | |||
294 | #ifdef CONFIG_NUMA | ||
295 | /* | ||
296 | * We cache key information from each zonelist for smaller cache | ||
297 | * footprint when scanning for free pages in get_page_from_freelist(). | ||
298 | * | ||
299 | * 1) The BITMAP fullzones tracks which zones in a zonelist have come | ||
300 | * up short of free memory since the last time (last_full_zap) | |
301 | * we zero'd fullzones. | ||
302 | * 2) The array z_to_n[] maps each zone in the zonelist to its node | ||
303 | * id, so that we can efficiently evaluate whether that node is | ||
304 | * set in the current task's mems_allowed. | |
305 | * | ||
306 | * Both fullzones and z_to_n[] are one-to-one with the zonelist, | ||
307 | * indexed by a zone's offset in the zonelist zones[] array. | |
308 | * | ||
309 | * The get_page_from_freelist() routine does two scans. During the | ||
310 | * first scan, we skip zones whose corresponding bit in 'fullzones' | ||
311 | * is set or whose corresponding node in current->mems_allowed (which | ||
312 | * comes from cpusets) is not set. During the second scan, we bypass | ||
313 | * this zonelist_cache, to ensure we look methodically at each zone. | ||
314 | * | ||
315 | * Once per second, we zero out (zap) fullzones, forcing us to | ||
316 | * reconsider nodes that might have regained more free memory. | ||
317 | * The field last_full_zap is the time we last zapped fullzones. | ||
318 | * | ||
319 | * This mechanism reduces the amount of time we waste repeatedly | ||
320 | * reexamining zones for free memory when they just came up low on | |
321 | * memory a moment ago. | |
322 | * | ||
323 | * The zonelist_cache struct members logically belong in struct | ||
324 | * zonelist. However, the mempolicy zonelists constructed for | ||
325 | * MPOL_BIND are intentionally variable length (and usually much | ||
326 | * shorter). A general purpose mechanism for handling structs with | ||
327 | * multiple variable length members is more mechanism than we want | ||
328 | * here. We resort to some special case hackery instead. | ||
329 | * | ||
330 | * The MPOL_BIND zonelists don't need this zonelist_cache (in good | ||
331 | * part because they are shorter), so we put the fixed length stuff | ||
332 | * at the front of the zonelist struct, ending in a variable length | ||
333 | * zones[], as is needed by MPOL_BIND. | ||
334 | * | ||
335 | * Then we put the optional zonelist cache on the end of the zonelist | ||
336 | * struct. This optional stuff is found by a 'zlcache_ptr' pointer in | ||
337 | * the fixed length portion at the front of the struct. This pointer | ||
338 | * both enables us to find the zonelist cache, and in the case of | ||
339 | * MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL) | ||
340 | * to know that the zonelist cache is not there. | ||
341 | * | ||
342 | * The end result is that struct zonelists come in two flavors: | ||
343 | * 1) The full, fixed length version, shown below, and | ||
344 | * 2) The custom zonelists for MPOL_BIND. | ||
345 | * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache. | ||
346 | * | ||
347 | * Even though there may be multiple CPU cores on a node modifying | ||
348 | * fullzones or last_full_zap in the same zonelist_cache at the same | ||
349 | * time, we don't lock it. This is just hint data - if it is wrong now | ||
350 | * and then, the allocator will still function, perhaps a bit slower. | ||
351 | */ | ||
352 | |||
353 | |||
354 | struct zonelist_cache { | ||
355 | unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */ | ||
356 | DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */ | ||
357 | unsigned long last_full_zap; /* when last zap'd (jiffies) */ | ||
358 | }; | ||
359 | #else | ||
360 | struct zonelist_cache; | ||
361 | #endif | ||
362 | |||
295 | /* | 363 | /* |
296 | * One allocation request operates on a zonelist. A zonelist | 364 | * One allocation request operates on a zonelist. A zonelist |
297 | * is a list of zones, the first one is the 'goal' of the | 365 | * is a list of zones, the first one is the 'goal' of the |
298 | * allocation, the other zones are fallback zones, in decreasing | 366 | * allocation, the other zones are fallback zones, in decreasing |
299 | * priority. | 367 | * priority. |
300 | * | 368 | * |
301 | * Right now a zonelist takes up less than a cacheline. We never | 369 | * If zlcache_ptr is not NULL, then it is just the address of zlcache, |
302 | * modify it apart from boot-up, and only a few indices are used, | 370 | * as explained above. If zlcache_ptr is NULL, there is no zlcache. |
303 | * so despite the zonelist table being relatively big, the cache | ||
304 | * footprint of this construct is very small. | ||
305 | */ | 371 | */ |
372 | |||
306 | struct zonelist { | 373 | struct zonelist { |
307 | struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited | 374 | struct zonelist_cache *zlcache_ptr; // NULL or &zlcache |
375 | struct zone *zones[MAX_ZONES_PER_ZONELIST + 1]; // NULL delimited | ||
376 | #ifdef CONFIG_NUMA | ||
377 | struct zonelist_cache zlcache; // optional ... | ||
378 | #endif | ||
308 | }; | 379 | }; |
309 | 380 | ||
310 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 381 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
@@ -674,6 +745,12 @@ void sparse_init(void); | |||
674 | #define sparse_index_init(_sec, _nid) do {} while (0) | 745 | #define sparse_index_init(_sec, _nid) do {} while (0) |
675 | #endif /* CONFIG_SPARSEMEM */ | 746 | #endif /* CONFIG_SPARSEMEM */ |
676 | 747 | ||
748 | #ifdef CONFIG_NODES_SPAN_OTHER_NODES | ||
749 | #define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) | ||
750 | #else | ||
751 | #define early_pfn_in_nid(pfn, nid) (1) | ||
752 | #endif | ||
753 | |||
677 | #ifndef early_pfn_valid | 754 | #ifndef early_pfn_valid |
678 | #define early_pfn_valid(pfn) (1) | 755 | #define early_pfn_valid(pfn) (1) |
679 | #endif | 756 | #endif |