Diffstat (limited to 'include/linux/mmzone.h')
 include/linux/mmzone.h | 95 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 87 insertions(+), 8 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e06683e2bea3..b262f47961fb 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -278,7 +278,7 @@ struct zone {
 	/*
 	 * rarely used fields:
 	 */
-	char *name;
+	const char *name;
 } ____cacheline_internodealigned_in_smp;
 
 /*
@@ -288,19 +288,94 @@ struct zone {
  */
 #define DEF_PRIORITY 12
 
+/* Maximum number of zones on a zonelist */
+#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
+
+#ifdef CONFIG_NUMA
+/*
+ * We cache key information from each zonelist for smaller cache
+ * footprint when scanning for free pages in get_page_from_freelist().
+ *
+ * 1) The BITMAP fullzones tracks which zones in a zonelist have come
+ *    up short of free memory since the last time (last_full_zap)
+ *    we zero'd fullzones.
+ * 2) The array z_to_n[] maps each zone in the zonelist to its node
+ *    id, so that we can efficiently evaluate whether that node is
+ *    set in the current task's mems_allowed.
+ *
+ * Both fullzones and z_to_n[] are one-to-one with the zonelist,
+ * indexed by a zone's offset in the zonelist zones[] array.
+ *
+ * The get_page_from_freelist() routine does two scans.  During the
+ * first scan, we skip zones whose corresponding bit in 'fullzones'
+ * is set or whose corresponding node in current->mems_allowed (which
+ * comes from cpusets) is not set.  During the second scan, we bypass
+ * this zonelist_cache, to ensure we look methodically at each zone.
+ *
+ * Once per second, we zero out (zap) fullzones, forcing us to
+ * reconsider nodes that might have regained more free memory.
+ * The field last_full_zap is the time we last zapped fullzones.
+ *
+ * This mechanism reduces the amount of time we waste repeatedly
+ * re-examining zones for free memory when they came up low on
+ * memory only moments earlier.
+ *
+ * The zonelist_cache struct members logically belong in struct
+ * zonelist.  However, the mempolicy zonelists constructed for
+ * MPOL_BIND are intentionally variable length (and usually much
+ * shorter).  A general purpose mechanism for handling structs with
+ * multiple variable length members is more mechanism than we want
+ * here.  We resort to some special case hackery instead.
+ *
+ * The MPOL_BIND zonelists don't need this zonelist_cache (in good
+ * part because they are shorter), so we put the fixed length stuff
+ * at the front of the zonelist struct, ending in a variable length
+ * zones[], as is needed by MPOL_BIND.
+ *
+ * Then we put the optional zonelist cache on the end of the zonelist
+ * struct.  This optional stuff is found by a 'zlcache_ptr' pointer in
+ * the fixed length portion at the front of the struct.  This pointer
+ * both enables us to find the zonelist cache and, in the case of
+ * MPOL_BIND zonelists (which just set the zlcache_ptr to NULL),
+ * tells us that the zonelist cache is not there.
+ *
+ * The end result is that struct zonelists come in two flavors:
+ * 1) The full, fixed length version, shown below, and
+ * 2) The custom zonelists for MPOL_BIND.
+ * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
+ *
+ * Even though there may be multiple CPU cores on a node modifying
+ * fullzones or last_full_zap in the same zonelist_cache at the same
+ * time, we don't lock it.  This is just hint data - if it is wrong now
+ * and then, the allocator will still function, perhaps a bit slower.
+ */
+
+
+struct zonelist_cache {
+	unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];		/* zone->nid */
+	DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);	/* zone full? */
+	unsigned long last_full_zap;		/* when last zap'd (jiffies) */
+};
+#else
+struct zonelist_cache;
+#endif
+
 /*
  * One allocation request operates on a zonelist. A zonelist
  * is a list of zones, the first one is the 'goal' of the
  * allocation, the other zones are fallback zones, in decreasing
  * priority.
  *
- * Right now a zonelist takes up less than a cacheline. We never
- * modify it apart from boot-up, and only a few indices are used,
- * so despite the zonelist table being relatively big, the cache
- * footprint of this construct is very small.
+ * If zlcache_ptr is not NULL, then it is just the address of zlcache,
+ * as explained above.  If zlcache_ptr is NULL, there is no zlcache.
  */
+
 struct zonelist {
-	struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
+	struct zonelist_cache *zlcache_ptr;		     // NULL or &zlcache
+	struct zone *zones[MAX_ZONES_PER_ZONELIST + 1];      // NULL delimited
+#ifdef CONFIG_NUMA
+	struct zonelist_cache zlcache;			     // optional ...
+#endif
 };
 
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
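
The two-scan idea in the comment above is easiest to see in motion outside the kernel. What follows is a minimal, self-contained C sketch of the first scan: it skips zones whose fullzones bit is set or whose node is missing from the task's allowed set, and records misses back into the cache. The simplified types, the plain bitmask standing in for mems_allowed, and the helper first_scan_pick() are illustrative assumptions, not the kernel's get_page_from_freelist().

/* Hedged sketch: a userspace-compilable model of the first scan.
 * ZONES_MAX, struct zone, and first_scan_pick() are stand-ins. */
#include <stdio.h>

#define ZONES_MAX 8

struct zone { int nid; long free_pages; };

struct zonelist_cache {
	unsigned short z_to_n[ZONES_MAX];    /* zone index -> node id */
	unsigned char fullzones[ZONES_MAX];  /* 1 = came up short recently */
};

struct zonelist {
	struct zonelist_cache *zlcache_ptr;  /* NULL or &zlcache */
	struct zone *zones[ZONES_MAX + 1];   /* NULL delimited */
	struct zonelist_cache zlcache;       /* found only via zlcache_ptr */
};

/* First scan: trust the cache hints to skip zones cheaply; a second
 * scan, per the comment above, would bypass the cache entirely. */
static struct zone *first_scan_pick(struct zonelist *zl,
				    unsigned long mems_allowed, long want)
{
	struct zonelist_cache *zlc = zl->zlcache_ptr;
	int i;

	for (i = 0; zl->zones[i] != NULL; i++) {
		if (zlc && zlc->fullzones[i])
			continue;          /* recently full: skip */
		if (zlc && !(mems_allowed & (1UL << zlc->z_to_n[i])))
			continue;          /* node not allowed by cpuset */
		if (zl->zones[i]->free_pages >= want)
			return zl->zones[i];
		if (zlc)
			zlc->fullzones[i] = 1;  /* remember the miss */
	}
	return NULL;
}

int main(void)
{
	struct zone z0 = { 0, 10 };   /* node 0, nearly full */
	struct zone z1 = { 1, 500 };  /* node 1, plenty free */
	struct zonelist zl;

	zl.zlcache_ptr = &zl.zlcache;
	zl.zones[0] = &z0;
	zl.zones[1] = &z1;
	zl.zones[2] = NULL;
	zl.zlcache.z_to_n[0] = 0;
	zl.zlcache.z_to_n[1] = 1;
	zl.zlcache.fullzones[0] = 0;
	zl.zlcache.fullzones[1] = 0;

	/* cpuset allows node 1 only; zone 0 is also too small anyway */
	struct zone *got = first_scan_pick(&zl, 1UL << 1, 100);
	printf("picked zone on node %d\n", got ? got->nid : -1);
	return 0;
}

Because the cache is reached only through zlcache_ptr, an MPOL_BIND-style zonelist can pass NULL and the same scan degrades to inspecting every zone, mirroring the two-flavor layout the comment describes.
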
@@ -375,9 +450,13 @@ void build_all_zonelists(void);
 void wakeup_kswapd(struct zone *zone, int order);
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		int classzone_idx, int alloc_flags);
-
+enum memmap_context {
+	MEMMAP_EARLY,
+	MEMMAP_HOTPLUG,
+};
 extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
-				     unsigned long size);
+				     unsigned long size,
+				     enum memmap_context context);
 
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
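
The memmap_context change in the last hunk threads a flag through zone initialization so callers can say whether the zone's memmap was set up at boot or by memory hotplug. Below is a hedged, self-contained C sketch of the resulting call pattern; the stub body and printed messages are invented for illustration, and only the enum and the four-argument signature come from the patch.

/* Hedged sketch: the patched signature with a stub body; the real
 * function also sets up wait tables and free lists, which a hotplug
 * caller cannot assume were prepared early. */
#include <stdio.h>

enum memmap_context {
	MEMMAP_EARLY,    /* boot-time initialization path */
	MEMMAP_HOTPLUG,  /* memory-hotplug path */
};

struct zone { unsigned long zone_start_pfn; unsigned long spanned_pages; };

static int init_currently_empty_zone(struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long size,
				     enum memmap_context context)
{
	zone->zone_start_pfn = start_pfn;
	zone->spanned_pages = size;
	printf("zone @ pfn %lu (+%lu pages), %s context\n", start_pfn, size,
	       context == MEMMAP_EARLY ? "early" : "hotplug");
	return 0;
}

int main(void)
{
	struct zone z;

	init_currently_empty_zone(&z, 0, 4096, MEMMAP_EARLY);           /* boot */
	init_currently_empty_zone(&z, 1UL << 20, 4096, MEMMAP_HOTPLUG); /* hot-add */
	return 0;
}
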