Diffstat (limited to 'include/linux/mmzone.h')
-rw-r--r--   include/linux/mmzone.h | 85
1 file changed, 80 insertions(+), 5 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e06683e2bea3..09bf9d8d7b72 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -288,19 +288,94 @@ struct zone {
  */
 #define DEF_PRIORITY 12
 
+/* Maximum number of zones on a zonelist */
+#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
+
+#ifdef CONFIG_NUMA
+/*
+ * We cache key information from each zonelist for smaller cache
+ * footprint when scanning for free pages in get_page_from_freelist().
+ *
+ * 1) The BITMAP fullzones tracks which zones in a zonelist have come
+ *    up short of free memory since the last time (last_full_zap)
+ *    we zero'd fullzones.
+ * 2) The array z_to_n[] maps each zone in the zonelist to its node
+ *    id, so that we can efficiently evaluate whether that node is
+ *    set in the current task's mems_allowed.
+ *
+ * Both fullzones and z_to_n[] are one-to-one with the zonelist,
+ * indexed by a zone's offset in the zonelist zones[] array.
+ *
+ * The get_page_from_freelist() routine does two scans. During the
+ * first scan, we skip zones whose corresponding bit in 'fullzones'
+ * is set or whose corresponding node in current->mems_allowed (which
+ * comes from cpusets) is not set. During the second scan, we bypass
+ * this zonelist_cache, to ensure we look methodically at each zone.
+ *
+ * Once per second, we zero out (zap) fullzones, forcing us to
+ * reconsider nodes that might have regained more free memory.
+ * The field last_full_zap is the time we last zapped fullzones.
+ *
+ * This mechanism reduces the amount of time we waste repeatedly
+ * re-examining zones for free memory when they only recently came
+ * up low on memory.
+ *
+ * The zonelist_cache struct members logically belong in struct
+ * zonelist. However, the mempolicy zonelists constructed for
+ * MPOL_BIND are intentionally variable length (and usually much
+ * shorter). A general purpose mechanism for handling structs with
+ * multiple variable length members is more mechanism than we want
+ * here. We resort to some special case hackery instead.
+ *
+ * The MPOL_BIND zonelists don't need this zonelist_cache (in good
+ * part because they are shorter), so we put the fixed length stuff
+ * at the front of the zonelist struct, ending in a variable length
+ * zones[], as is needed by MPOL_BIND.
+ *
+ * Then we put the optional zonelist cache on the end of the zonelist
+ * struct. This optional stuff is found by a 'zlcache_ptr' pointer in
+ * the fixed length portion at the front of the struct. This pointer
+ * both enables us to find the zonelist cache, and in the case of
+ * MPOL_BIND zonelists (which will just set the zlcache_ptr to NULL),
+ * to know that the zonelist cache is not there.
+ *
+ * The end result is that struct zonelists come in two flavors:
+ *  1) The full, fixed length version, shown below, and
+ *  2) The custom zonelists for MPOL_BIND.
+ * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
+ *
+ * Even though there may be multiple CPU cores on a node modifying
+ * fullzones or last_full_zap in the same zonelist_cache at the same
+ * time, we don't lock it. This is just hint data - if it is wrong now
+ * and then, the allocator will still function, perhaps a bit slower.
+ */
+
+
+struct zonelist_cache {
+        DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);     /* zone full? */
+        unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];         /* zone->nid */
+        unsigned long last_full_zap;            /* when last zap'd (jiffies) */
+};
+#else
+struct zonelist_cache;
+#endif
+
 /*
  * One allocation request operates on a zonelist. A zonelist
  * is a list of zones, the first one is the 'goal' of the
  * allocation, the other zones are fallback zones, in decreasing
  * priority.
  *
- * Right now a zonelist takes up less than a cacheline. We never
- * modify it apart from boot-up, and only a few indices are used,
- * so despite the zonelist table being relatively big, the cache
- * footprint of this construct is very small.
+ * If zlcache_ptr is not NULL, then it is just the address of zlcache,
+ * as explained above. If zlcache_ptr is NULL, there is no zlcache.
  */
+
 struct zonelist {
-        struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
+        struct zonelist_cache *zlcache_ptr;                  // NULL or &zlcache
+        struct zone *zones[MAX_ZONES_PER_ZONELIST + 1];      // NULL delimited
+#ifdef CONFIG_NUMA
+        struct zonelist_cache zlcache;                       // optional ...
+#endif
 };
 
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
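
To make the caching scheme in the comment above concrete - a "zone is full" bitmap plus a zone-to-node map, zapped roughly once per second so recently-full zones get reconsidered - here is a minimal user-space sketch of the first-scan filter. It is not the kernel implementation: the names (struct zlc_sim, zone_worth_trying(), zone_mark_full(), NZONES, nodes_allowed) are invented for this example, and a plain unsigned long stands in for the DECLARE_BITMAP/nodemask machinery.

/*
 * User-space sketch of the zonelist_cache idea -- NOT kernel code.
 * All names here are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define NZONES 8                        /* stand-in for MAX_ZONES_PER_ZONELIST */

struct zlc_sim {
        unsigned long fullzones;        /* bit i set: zone i recently came up short */
        unsigned short z_to_n[NZONES];  /* zone index -> node id */
        time_t last_full_zap;           /* when fullzones was last cleared */
};

/* First-scan filter: skip zones marked full or on nodes we may not use. */
static bool zone_worth_trying(struct zlc_sim *zlc, int zone,
                              unsigned long nodes_allowed)
{
        if (time(NULL) - zlc->last_full_zap >= 1) {
                /* "Once per second, we zero out (zap) fullzones ..." */
                zlc->fullzones = 0;
                zlc->last_full_zap = time(NULL);
        }
        if (zlc->fullzones & (1UL << zone))
                return false;           /* came up short recently: skip */
        if (!(nodes_allowed & (1UL << zlc->z_to_n[zone])))
                return false;           /* node not in the allowed set: skip */
        return true;
}

/* Record that an allocation attempt found this zone short of free pages. */
static void zone_mark_full(struct zlc_sim *zlc, int zone)
{
        zlc->fullzones |= 1UL << zone;
}

int main(void)
{
        struct zlc_sim zlc = { .z_to_n = { 0, 0, 1, 1 },
                               .last_full_zap = time(NULL) };
        unsigned long allowed = 1UL << 0;       /* only node 0 allowed */

        zone_mark_full(&zlc, 0);                /* zone 0 just failed */
        printf("zone 0: %d\n", zone_worth_trying(&zlc, 0, allowed));    /* 0: full */
        printf("zone 1: %d\n", zone_worth_trying(&zlc, 1, allowed));    /* 1: ok */
        printf("zone 2: %d\n", zone_worth_trying(&zlc, 2, allowed));    /* 0: wrong node */
        return 0;
}

In the real allocator the corresponding checks run per zonelist during the first pass of get_page_from_freelist(), while the second pass bypasses the cache entirely, as the comment in the patch describes; being unlocked hint data, a stale bit only costs a little extra scanning.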