Diffstat (limited to 'include/linux/mm.h')
 -rw-r--r--  include/linux/mm.h | 249
 1 file changed, 154 insertions(+), 95 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 224178a000d2..7b703b6d4358 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -15,6 +15,8 @@
 #include <linux/fs.h>
 #include <linux/mutex.h>
 #include <linux/debug_locks.h>
+#include <linux/backing-dev.h>
+#include <linux/mm_types.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -197,6 +199,7 @@ struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
+	unsigned long (*nopfn)(struct vm_area_struct * area, unsigned long address);
 	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
 
 	/* notification that a previously read-only page is about to become
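The new ->nopfn handler resolves a fault to a raw page frame number rather than a struct page, which suits VM_PFNMAP mappings of memory that has no struct page behind it (device apertures, for instance). A minimal sketch of how a driver might wire it up; the mydev_* names are hypothetical, and the error values a handler may return are defined later in this patch (see the NOPFN_* hunk below):

	static struct vm_operations_struct mydev_vm_ops = {
		.nopfn	= mydev_nopfn,	/* hypothetical handler, sketched below */
	};

	static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
	{
		vma->vm_ops = &mydev_vm_ops;
		vma->vm_flags |= VM_PFNMAP;	/* no struct page behind this range */
		return 0;
	}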
@@ -214,61 +217,6 @@ struct vm_operations_struct {
 struct mmu_gather;
 struct inode;
 
-/*
- * Each physical page in the system has a struct page associated with
- * it to keep track of whatever it is we are using the page for at the
- * moment. Note that we have no way to track which tasks are using
- * a page.
- */
-struct page {
-	unsigned long flags;		/* Atomic flags, some possibly
-					 * updated asynchronously */
-	atomic_t _count;		/* Usage count, see below. */
-	atomic_t _mapcount;		/* Count of ptes mapped in mms,
-					 * to show when page is mapped
-					 * & limit reverse map searches.
-					 */
-	union {
-	    struct {
-		unsigned long private;		/* Mapping-private opaque data:
-						 * usually used for buffer_heads
-						 * if PagePrivate set; used for
-						 * swp_entry_t if PageSwapCache;
-						 * indicates order in the buddy
-						 * system if PG_buddy is set.
-						 */
-		struct address_space *mapping;	/* If low bit clear, points to
-						 * inode address_space, or NULL.
-						 * If page mapped as anonymous
-						 * memory, low bit is set, and
-						 * it points to anon_vma object:
-						 * see PAGE_MAPPING_ANON below.
-						 */
-	    };
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-	    spinlock_t ptl;
-#endif
-	};
-	pgoff_t index;			/* Our offset within mapping. */
-	struct list_head lru;		/* Pageout list, eg. active_list
-					 * protected by zone->lru_lock !
-					 */
-	/*
-	 * On machines where all RAM is mapped into kernel address space,
-	 * we can simply calculate the virtual address. On machines with
-	 * highmem some memory is mapped into kernel virtual memory
-	 * dynamically, so we need a place to store that address.
-	 * Note that this field could be 16 bits on x86 ... ;)
-	 *
-	 * Architectures with slow multiplication can define
-	 * WANT_PAGE_VIRTUAL in asm/page.h
-	 */
-#if defined(WANT_PAGE_VIRTUAL)
-	void *virtual;			/* Kernel virtual address (NULL if
-					   not kmapped, ie. highmem) */
-#endif /* WANT_PAGE_VIRTUAL */
-};
-
 #define page_private(page)		((page)->private)
 #define set_page_private(page, v)	((page)->private = (v))
 
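The struct page definition itself is unchanged; it moves verbatim into the new <linux/mm_types.h> header included above. The low-bit encoding of page->mapping that its comment describes is still what the real PageAnon()/page_mapping() helpers test; a simplified sketch of the idea, ignoring the swapcache case the real page_mapping() also handles:

	/* Anonymous pages store (anon_vma pointer | PAGE_MAPPING_ANON). */
	static inline int my_page_is_anon(struct page *page)
	{
		return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
	}

	static inline struct address_space *my_page_mapping(struct page *page)
	{
		if (my_page_is_anon(page))
			return NULL;	/* low bit set: not an inode mapping */
		return page->mapping;
	}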
@@ -278,6 +226,12 @@ struct page {
  */
 #include <linux/page-flags.h>
 
+#ifdef CONFIG_DEBUG_VM
+#define VM_BUG_ON(cond) BUG_ON(cond)
+#else
+#define VM_BUG_ON(condition) do { } while(0)
+#endif
+
 /*
  * Methods to modify the page usage count.
  *
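VM_BUG_ON() lets hot paths carry sanity checks that cost nothing in production builds. Because it expands to an empty statement unless CONFIG_DEBUG_VM is set, its argument must not have side effects; a contrived illustration:

	/* Fine: a pure read, identical behaviour in both configurations. */
	VM_BUG_ON(atomic_read(&page->_count) == 0);

	/* Broken: the decrement silently vanishes without CONFIG_DEBUG_VM. */
	VM_BUG_ON(atomic_dec_and_test(&page->_count));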
@@ -292,12 +246,11 @@ struct page {
  */
 
 /*
- * Drop a ref, return true if the logical refcount fell to zero (the page has
- * no users)
+ * Drop a ref, return true if the refcount fell to zero (the page has no users)
  */
 static inline int put_page_testzero(struct page *page)
 {
-	BUG_ON(atomic_read(&page->_count) == 0);
+	VM_BUG_ON(atomic_read(&page->_count) == 0);
 	return atomic_dec_and_test(&page->_count);
 }
 
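put_page_testzero() only drops the reference and reports whether it was the last one; actually freeing the page is the caller's job. A minimal sketch of the usual pattern, assuming a plain buddy-allocated page (the real release path for pagecache and compound pages is put_page() in mm/swap.c):

	static void my_release_page(struct page *page)
	{
		if (put_page_testzero(page))
			__free_page(page);	/* we dropped the final reference */
	}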
@@ -307,11 +260,10 @@ static inline int put_page_testzero(struct page *page)
  */
 static inline int get_page_unless_zero(struct page *page)
 {
+	VM_BUG_ON(PageCompound(page));
 	return atomic_inc_not_zero(&page->_count);
 }
 
-extern void FASTCALL(__page_cache_release(struct page *));
-
 static inline int page_count(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
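get_page_unless_zero() is the building block for speculative, lock-free lookups: a reader that finds a page through a structure it does not lock may only pin the page if its refcount has not already hit zero. A hedged sketch of the pattern, where my_lockless_lookup() stands in for something like a radix-tree walk done under rcu_read_lock():

	static struct page *my_find_get_page(unsigned long index)
	{
		struct page *page = my_lockless_lookup(index);	/* hypothetical */

		if (page && !get_page_unless_zero(page))
			page = NULL;	/* lost the race with the final put */
		return page;
	}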
@@ -323,6 +275,7 @@ static inline void get_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
 		page = (struct page *)page_private(page);
+	VM_BUG_ON(atomic_read(&page->_count) == 0);
 	atomic_inc(&page->_count);
 }
 
@@ -349,43 +302,55 @@ void split_page(struct page *page, unsigned int order);
  * For the non-reserved pages, page_count(page) denotes a reference count.
  * page_count() == 0 means the page is free. page->lru is then used for
  * freelist management in the buddy allocator.
- * page_count() == 1 means the page is used for exactly one purpose
- * (e.g. a private data page of one process).
+ * page_count() > 0 means the page has been allocated.
+ *
+ * Pages are allocated by the slab allocator in order to provide memory
+ * to kmalloc and kmem_cache_alloc. In this case, the management of the
+ * page, and the fields in 'struct page' are the responsibility of mm/slab.c
+ * unless a particular usage is carefully commented. (the responsibility of
+ * freeing the kmalloc memory is the caller's, of course).
  *
- * A page may be used for kmalloc() or anyone else who does a
- * __get_free_page(). In this case the page_count() is at least 1, and
- * all other fields are unused but should be 0 or NULL. The
- * management of this page is the responsibility of the one who uses
- * it.
+ * A page may be used by anyone else who does a __get_free_page().
+ * In this case, page_count still tracks the references, and should only
+ * be used through the normal accessor functions. The top bits of page->flags
+ * and page->virtual store page management information, but all other fields
+ * are unused and could be used privately, carefully. The management of this
+ * page is the responsibility of the one who allocated it, and those who have
+ * subsequently been given references to it.
  *
- * The other pages (we may call them "process pages") are completely
+ * The other pages (we may call them "pagecache pages") are completely
  * managed by the Linux memory manager: I/O, buffers, swapping etc.
  * The following discussion applies only to them.
  *
- * A page may belong to an inode's memory mapping. In this case,
- * page->mapping is the pointer to the inode, and page->index is the
- * file offset of the page, in units of PAGE_CACHE_SIZE.
+ * A pagecache page contains an opaque `private' member, which belongs to the
+ * page's address_space. Usually, this is the address of a circular list of
+ * the page's disk buffers. PG_private must be set to tell the VM to call
+ * into the filesystem to release these pages.
  *
- * A page contains an opaque `private' member, which belongs to the
- * page's address_space. Usually, this is the address of a circular
- * list of the page's disk buffers.
+ * A page may belong to an inode's memory mapping. In this case, page->mapping
+ * is the pointer to the inode, and page->index is the file offset of the page,
+ * in units of PAGE_CACHE_SIZE.
  *
- * For pages belonging to inodes, the page_count() is the number of
- * attaches, plus 1 if `private' contains something, plus one for
- * the page cache itself.
+ * If pagecache pages are not associated with an inode, they are said to be
+ * anonymous pages. These may become associated with the swapcache, and in that
+ * case PG_swapcache is set, and page->private is an offset into the swapcache.
  *
- * Instead of keeping dirty/clean pages in per address-space lists, we instead
- * now tag pages as dirty/under writeback in the radix tree.
+ * In either case (swapcache or inode backed), the pagecache itself holds one
+ * reference to the page. Setting PG_private should also increment the
+ * refcount. Each user mapping also has a reference to the page.
  *
- * There is also a per-mapping radix tree mapping index to the page
- * in memory if present. The tree is rooted at mapping->root.
+ * The pagecache pages are stored in a per-mapping radix tree, which is
+ * rooted at mapping->page_tree, and indexed by offset.
+ * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
+ * lists, we instead now tag pages as dirty/writeback in the radix tree.
  *
- * All process pages can do I/O:
+ * All pagecache pages may be subject to I/O:
  *   - inode pages may need to be read from disk,
  *   - inode pages which have been modified and are MAP_SHARED may need
- *     to be written to disk,
- *   - private pages which have been modified may need to be swapped out
- *     to swap space and (later) to be read back into memory.
+ *     to be written back to the inode on disk,
+ *   - anonymous pages (including MAP_PRIVATE file mappings) which have been
+ *     modified may need to be swapped out to swap space and (later) to be read
+ *     back into memory.
  */
 
 /*
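The dirty/writeback tags mentioned in the rewritten comment live on mapping->page_tree, so writeback can find pages of interest without walking per-mapping lists. A sketch of querying a tag; radix_tree_tagged() and PAGECACHE_TAG_DIRTY are the existing interfaces, but note the real mapping_tagged() helper also takes mapping->tree_lock, which is elided here:

	#include <linux/fs.h>
	#include <linux/radix-tree.h>

	/* Does this mapping have any pages tagged dirty? (locking elided) */
	static int my_mapping_has_dirty(struct address_space *mapping)
	{
		return radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY);
	}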
@@ -463,7 +428,7 @@ void split_page(struct page *page, unsigned int order);
 #define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
 #define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
 
-static inline unsigned long page_zonenum(struct page *page)
+static inline enum zone_type page_zonenum(struct page *page)
 {
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
@@ -480,23 +445,33 @@ static inline struct zone *page_zone(struct page *page)
 	return zone_table[page_zone_id(page)];
 }
 
+static inline unsigned long zone_to_nid(struct zone *zone)
+{
+#ifdef CONFIG_NUMA
+	return zone->node;
+#else
+	return 0;
+#endif
+}
+
 static inline unsigned long page_to_nid(struct page *page)
 {
 	if (FLAGS_HAS_NODE)
 		return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
 	else
-		return page_zone(page)->zone_pgdat->node_id;
+		return zone_to_nid(page_zone(page));
 }
 static inline unsigned long page_to_section(struct page *page)
 {
 	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
 }
 
-static inline void set_page_zone(struct page *page, unsigned long zone)
+static inline void set_page_zone(struct page *page, enum zone_type zone)
 {
 	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
 	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
 }
+
 static inline void set_page_node(struct page *page, unsigned long node)
 {
 	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
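zone_to_nid() gives page_to_nid() a single path whether or not the node id is encoded in page->flags. Those flag bits are stamped in by set_page_links() when the memmap is initialised; a simplified sketch of that use, modelled loosely on memmap_init_zone() in mm/page_alloc.c:

	static void my_init_memmap_range(enum zone_type zone, int nid,
					 unsigned long start_pfn,
					 unsigned long nr_pages)
	{
		unsigned long pfn;

		for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
			struct page *page = pfn_to_page(pfn);

			/* pack zone, node and section into page->flags */
			set_page_links(page, zone, nid, pfn);
			init_page_count(page);
		}
	}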
@@ -508,7 +483,7 @@ static inline void set_page_section(struct page *page, unsigned long section)
 	page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
 }
 
-static inline void set_page_links(struct page *page, unsigned long zone,
+static inline void set_page_links(struct page *page, enum zone_type zone,
 	unsigned long node, unsigned long pfn)
 {
 	set_page_zone(page, zone);
@@ -521,11 +496,6 @@ static inline void set_page_links(struct page *page, unsigned long zone,
  */
 #include <linux/vmstat.h>
 
-#ifndef CONFIG_DISCONTIGMEM
-/* The array of struct pages - for discontigmem use pgdat->lmem_map */
-extern struct page *mem_map;
-#endif
-
 static __always_inline void *lowmem_page_address(struct page *page)
 {
 	return __va(page_to_pfn(page) << PAGE_SHIFT);
@@ -625,6 +595,12 @@ static inline int page_mapped(struct page *page)
 #define NOPAGE_OOM	((struct page *) (-1))
 
 /*
+ * Error return values for the *_nopfn functions
+ */
+#define NOPFN_SIGBUS	((unsigned long) -1)
+#define NOPFN_OOM	((unsigned long) -2)
+
+/*
  * Different kinds of faults, as returned by handle_mm_fault().
  * Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
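These mirror NOPAGE_SIGBUS and NOPAGE_OOM but are plain unsigned longs, since ->nopfn returns a PFN rather than a struct page pointer; the top two values of the PFN space are sacrificed as error codes. Continuing the hypothetical driver sketch from the ->nopfn hunk above:

	static unsigned long mydev_nopfn(struct vm_area_struct *vma,
					 unsigned long address)
	{
		unsigned long offset = address - vma->vm_start;

		if (offset >= mydev_region_size)	/* hypothetical bounds */
			return NOPFN_SIGBUS;		/* fault gets SIGBUS */

		/* translate to a raw page frame number */
		return (mydev_region_phys + offset) >> PAGE_SHIFT;
	}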
@@ -802,6 +778,39 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
+/*
+ * Some shared mappings will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ */
+static inline int vma_wants_writenotify(struct vm_area_struct *vma)
+{
+	unsigned int vm_flags = vma->vm_flags;
+
+	/* If it was private or non-writable, the write bit is already clear */
+	if ((vm_flags & (VM_WRITE|VM_SHARED)) != (VM_WRITE|VM_SHARED))
+		return 0;
+
+	/* The backer wishes to know when pages are first written to? */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		return 1;
+
+	/* The open routine did something to the protections already? */
+	if (pgprot_val(vma->vm_page_prot) !=
+	    pgprot_val(protection_map[vm_flags &
+		    (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]))
+		return 0;
+
+	/* Specialty mapping? */
+	if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
+		return 0;
+
+	/* Can the mapping track the dirty pages? */
+	return vma->vm_file && vma->vm_file->f_mapping &&
+		mapping_cap_account_dirty(vma->vm_file->f_mapping);
+}
+
 extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl));
 
 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
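The intended caller is the mmap path: when a shared writable mapping wants write notification, vm_page_prot is downgraded to the non-shared protections so that the first store faults and ->page_mkwrite() can run. A sketch of that call site, simplified from mm/mmap.c:

	if (vma_wants_writenotify(vma))
		vma->vm_page_prot =
			protection_map[vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];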
@@ -879,6 +888,56 @@ extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, pg_data_t *pgdat,
 	unsigned long * zones_size, unsigned long zone_start_pfn,
 	unsigned long *zholes_size);
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/*
+ * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
+ * zones, allocate the backing mem_map and account for memory holes in a more
+ * architecture independent manner. This is a substitute for creating the
+ * zone_sizes[] and zholes_size[] arrays and passing them to
+ * free_area_init_node().
+ *
+ * An architecture is expected to register ranges of page frames backed by
+ * physical memory with add_active_range() before calling
+ * free_area_init_nodes(), passing in the PFN each zone ends at. In basic
+ * usage, an architecture is expected to do something like
+ *
+ *	unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
+ *						     max_highmem_pfn};
+ *	for_each_valid_physical_page_range()
+ *		add_active_range(node_id, start_pfn, end_pfn)
+ *	free_area_init_nodes(max_zone_pfns);
+ *
+ * If the architecture guarantees that there are no holes in the ranges
+ * registered with add_active_range(), free_bootmem_with_active_regions()
+ * will call free_bootmem_node() for each registered physical page range.
+ * Similarly, sparse_memory_present_with_active_regions() calls
+ * memory_present() for each range when SPARSEMEM is enabled.
+ *
+ * See mm/page_alloc.c for more information on each function exposed by
+ * CONFIG_ARCH_POPULATES_NODE_MAP.
+ */
+extern void free_area_init_nodes(unsigned long *max_zone_pfn);
+extern void add_active_range(unsigned int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
+extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+					unsigned long new_end_pfn);
+extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
+extern void remove_all_active_ranges(void);
+extern unsigned long absent_pages_in_range(unsigned long start_pfn,
+						unsigned long end_pfn);
+extern void get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn);
+extern unsigned long find_min_pfn_with_active_regions(void);
+extern unsigned long find_max_pfn_with_active_regions(void);
+extern void free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn);
+extern void sparse_memory_present_with_active_regions(int nid);
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+extern int early_pfn_to_nid(unsigned long pfn);
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+extern void set_dma_reserve(unsigned long new_dma_reserve);
 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
 extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
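Concretely, the registration sequence described in the comment might look like the following in an architecture's setup code; the probed_ranges table and the max_*_pfn values are hypothetical stand-ins for whatever the platform actually discovers:

	void __init my_arch_init_zones(void)
	{
		unsigned long max_zone_pfns[MAX_NR_ZONES] = {
			max_dma_pfn,		/* end of ZONE_DMA */
			max_normal_pfn,		/* end of ZONE_NORMAL */
			max_highmem_pfn,	/* end of ZONE_HIGHMEM */
		};
		int i;

		for (i = 0; i < nr_probed_ranges; i++)	/* hypothetical probe data */
			add_active_range(probed_ranges[i].nid,
					 probed_ranges[i].start_pfn,
					 probed_ranges[i].end_pfn);

		free_area_init_nodes(max_zone_pfns);
	}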
@@ -1072,7 +1131,7 @@ void drop_slab(void);
 extern int randomize_va_space;
 #endif
 
-const char *arch_vma_name(struct vm_area_struct *vma);
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
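The weak attribute on the prototype means the symbol may legitimately be left undefined: on architectures that supply no arch_vma_name() the weak reference resolves to NULL, while an architecture that wants one (to name its vDSO mapping, say) just defines the function normally. A hedged sketch of a guarded generic caller:

	/*
	 * Sketch: with a weak prototype and no definition linked in, the
	 * function's address is NULL, so generic code can test before calling.
	 */
	const char *name = NULL;

	if (arch_vma_name)
		name = arch_vma_name(vma);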