Diffstat (limited to 'include/linux/mm.h')
 -rw-r--r--  include/linux/mm.h | 249
 1 file changed, 154 insertions(+), 95 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 224178a000d2..7b703b6d4358 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -15,6 +15,8 @@
 #include <linux/fs.h>
 #include <linux/mutex.h>
 #include <linux/debug_locks.h>
+#include <linux/backing-dev.h>
+#include <linux/mm_types.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -197,6 +199,7 @@ struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
+	unsigned long (*nopfn)(struct vm_area_struct * area, unsigned long address);
 	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
 
 	/* notification that a previously read-only page is about to become
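The new ->nopfn handler resolves a fault to a raw page frame number rather than a struct page, which suits VM_PFNMAP mappings of memory that has no struct page behind it (device apertures, for instance). A minimal sketch of how a driver might wire it up; the mydev_* names are hypothetical, and the error values a handler may return are defined later in this patch (see the NOPFN_* hunk below):

	static struct vm_operations_struct mydev_vm_ops = {
		.nopfn	= mydev_nopfn,	/* hypothetical handler, sketched below */
	};

	static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
	{
		vma->vm_ops = &mydev_vm_ops;
		vma->vm_flags |= VM_PFNMAP;	/* no struct page behind this range */
		return 0;
	}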
@@ -214,61 +217,6 @@ struct vm_operations_struct {
 struct mmu_gather;
 struct inode;
 
-/*
- * Each physical page in the system has a struct page associated with
- * it to keep track of whatever it is we are using the page for at the
- * moment. Note that we have no way to track which tasks are using
- * a page.
- */
-struct page {
-	unsigned long flags;		/* Atomic flags, some possibly
-					 * updated asynchronously */
-	atomic_t _count;		/* Usage count, see below. */
-	atomic_t _mapcount;		/* Count of ptes mapped in mms,
-					 * to show when page is mapped
-					 * & limit reverse map searches.
-					 */
-	union {
-	    struct {
-		unsigned long private;		/* Mapping-private opaque data:
-						 * usually used for buffer_heads
-						 * if PagePrivate set; used for
-						 * swp_entry_t if PageSwapCache;
-						 * indicates order in the buddy
-						 * system if PG_buddy is set.
-						 */
-		struct address_space *mapping;	/* If low bit clear, points to
-						 * inode address_space, or NULL.
-						 * If page mapped as anonymous
-						 * memory, low bit is set, and
-						 * it points to anon_vma object:
-						 * see PAGE_MAPPING_ANON below.
-						 */
-	    };
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-	    spinlock_t ptl;
-#endif
-	};
-	pgoff_t index;			/* Our offset within mapping. */
-	struct list_head lru;		/* Pageout list, eg. active_list
-					 * protected by zone->lru_lock !
-					 */
-	/*
-	 * On machines where all RAM is mapped into kernel address space,
-	 * we can simply calculate the virtual address. On machines with
-	 * highmem some memory is mapped into kernel virtual memory
-	 * dynamically, so we need a place to store that address.
-	 * Note that this field could be 16 bits on x86 ... ;)
-	 *
-	 * Architectures with slow multiplication can define
-	 * WANT_PAGE_VIRTUAL in asm/page.h
-	 */
-#if defined(WANT_PAGE_VIRTUAL)
-	void *virtual;			/* Kernel virtual address (NULL if
-					   not kmapped, ie. highmem) */
-#endif /* WANT_PAGE_VIRTUAL */
-};
-
 #define page_private(page)		((page)->private)
 #define set_page_private(page, v)	((page)->private = (v))
 
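The struct page definition itself is unchanged; it moves verbatim into the new <linux/mm_types.h> header included above. The low-bit encoding of page->mapping that its comment describes is still what the real PageAnon()/page_mapping() helpers test; a simplified sketch of the idea, ignoring the swapcache case the real page_mapping() also handles:

	/* Anonymous pages store (anon_vma pointer | PAGE_MAPPING_ANON). */
	static inline int my_page_is_anon(struct page *page)
	{
		return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
	}

	static inline struct address_space *my_page_mapping(struct page *page)
	{
		if (my_page_is_anon(page))
			return NULL;	/* low bit set: not an inode mapping */
		return page->mapping;
	}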
@@ -278,6 +226,12 @@ struct page {
  */
 #include <linux/page-flags.h>
 
+#ifdef CONFIG_DEBUG_VM
+#define VM_BUG_ON(cond) BUG_ON(cond)
+#else
+#define VM_BUG_ON(condition) do { } while(0)
+#endif
+
 /*
  * Methods to modify the page usage count.
  *
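VM_BUG_ON() lets hot paths carry sanity checks that cost nothing in production builds. Because it expands to an empty statement unless CONFIG_DEBUG_VM is set, its argument must not have side effects; a contrived illustration:

	/* Fine: a pure read, identical behaviour in both configurations. */
	VM_BUG_ON(atomic_read(&page->_count) == 0);

	/* Broken: the decrement silently vanishes without CONFIG_DEBUG_VM. */
	VM_BUG_ON(atomic_dec_and_test(&page->_count));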
@@ -292,12 +246,11 @@ struct page {
  */
 
 /*
- * Drop a ref, return true if the logical refcount fell to zero (the page has
- * no users)
+ * Drop a ref, return true if the refcount fell to zero (the page has no users)
  */
 static inline int put_page_testzero(struct page *page)
 {
-	BUG_ON(atomic_read(&page->_count) == 0);
+	VM_BUG_ON(atomic_read(&page->_count) == 0);
 	return atomic_dec_and_test(&page->_count);
 }
 
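put_page_testzero() only drops the reference and reports whether it was the last one; actually freeing the page is the caller's job. A minimal sketch of the usual pattern, assuming a plain buddy-allocated page (the real release path for pagecache and compound pages is put_page() in mm/swap.c):

	static void my_release_page(struct page *page)
	{
		if (put_page_testzero(page))
			__free_page(page);	/* we dropped the final reference */
	}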
@@ -307,11 +260,10 @@ static inline int put_page_testzero(struct page *page)
  */
 static inline int get_page_unless_zero(struct page *page)
 {
+	VM_BUG_ON(PageCompound(page));
 	return atomic_inc_not_zero(&page->_count);
 }
 
-extern void FASTCALL(__page_cache_release(struct page *));
-
 static inline int page_count(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
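get_page_unless_zero() is the building block for speculative, lock-free lookups: a reader that finds a page through a structure it does not lock may only pin the page if its refcount has not already hit zero. A hedged sketch of the pattern, where my_lockless_lookup() stands in for something like a radix-tree walk done under rcu_read_lock():

	static struct page *my_find_get_page(unsigned long index)
	{
		struct page *page = my_lockless_lookup(index);	/* hypothetical */

		if (page && !get_page_unless_zero(page))
			page = NULL;	/* lost the race with the final put */
		return page;
	}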
@@ -323,6 +275,7 @@ static inline void get_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
 		page = (struct page *)page_private(page);
+	VM_BUG_ON(atomic_read(&page->_count) == 0);
 	atomic_inc(&page->_count);
 }
 
@@ -349,43 +302,55 @@ void split_page(struct page *page, unsigned int order);
  * For the non-reserved pages, page_count(page) denotes a reference count.
  * page_count() == 0 means the page is free. page->lru is then used for
  * freelist management in the buddy allocator.
- * page_count() == 1 means the page is used for exactly one purpose
- * (e.g. a private data page of one process).
+ * page_count() > 0 means the page has been allocated.
+ *
+ * Pages are allocated by the slab allocator in order to provide memory
+ * to kmalloc and kmem_cache_alloc. In this case, the management of the
+ * page, and the fields in 'struct page' are the responsibility of mm/slab.c
+ * unless a particular usage is carefully commented. (the responsibility of
+ * freeing the kmalloc memory is the caller's, of course).
  *
- * A page may be used for kmalloc() or anyone else who does a
- * __get_free_page(). In this case the page_count() is at least 1, and
- * all other fields are unused but should be 0 or NULL. The
- * management of this page is the responsibility of the one who uses
- * it.
+ * A page may be used by anyone else who does a __get_free_page().
+ * In this case, page_count still tracks the references, and should only
+ * be used through the normal accessor functions. The top bits of page->flags
+ * and page->virtual store page management information, but all other fields
+ * are unused and could be used privately, carefully. The management of this
+ * page is the responsibility of the one who allocated it, and those who have
+ * subsequently been given references to it.
  *
- * The other pages (we may call them "process pages") are completely
+ * The other pages (we may call them "pagecache pages") are completely
  * managed by the Linux memory manager: I/O, buffers, swapping etc.
  * The following discussion applies only to them.
  *
- * A page may belong to an inode's memory mapping. In this case,
- * page->mapping is the pointer to the inode, and page->index is the
- * file offset of the page, in units of PAGE_CACHE_SIZE.
+ * A pagecache page contains an opaque `private' member, which belongs to the
+ * page's address_space. Usually, this is the address of a circular list of
+ * the page's disk buffers. PG_private must be set to tell the VM to call
+ * into the filesystem to release these pages.
  *
- * A page contains an opaque `private' member, which belongs to the
- * page's address_space. Usually, this is the address of a circular
- * list of the page's disk buffers.
+ * A page may belong to an inode's memory mapping. In this case, page->mapping
+ * is the pointer to the inode, and page->index is the file offset of the page,
+ * in units of PAGE_CACHE_SIZE.
  *
- * For pages belonging to inodes, the page_count() is the number of
- * attaches, plus 1 if `private' contains something, plus one for
- * the page cache itself.
+ * If pagecache pages are not associated with an inode, they are said to be
+ * anonymous pages. These may become associated with the swapcache, and in that
+ * case PG_swapcache is set, and page->private is an offset into the swapcache.
  *
- * Instead of keeping dirty/clean pages in per address-space lists, we instead
- * now tag pages as dirty/under writeback in the radix tree.
+ * In either case (swapcache or inode backed), the pagecache itself holds one
+ * reference to the page. Setting PG_private should also increment the
+ * refcount. Each user mapping also has a reference to the page.
  *
- * There is also a per-mapping radix tree mapping index to the page
- * in memory if present. The tree is rooted at mapping->root.
+ * The pagecache pages are stored in a per-mapping radix tree, which is
+ * rooted at mapping->page_tree, and indexed by offset.
+ * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
+ * lists, we instead now tag pages as dirty/writeback in the radix tree.
  *
- * All process pages can do I/O:
+ * All pagecache pages may be subject to I/O:
  *   - inode pages may need to be read from disk,
  *   - inode pages which have been modified and are MAP_SHARED may need
- *     to be written to disk,
- *   - private pages which have been modified may need to be swapped out
- *     to swap space and (later) to be read back into memory.
+ *     to be written back to the inode on disk,
+ *   - anonymous pages (including MAP_PRIVATE file mappings) which have been
+ *     modified may need to be swapped out to swap space and (later) to be read
+ *     back into memory.
  */
 
 /*
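The dirty/writeback tags mentioned in the rewritten comment live on mapping->page_tree, so writeback can find pages of interest without walking per-mapping lists. A sketch of querying a tag; radix_tree_tagged() and PAGECACHE_TAG_DIRTY are the existing interfaces, but note the real mapping_tagged() helper also takes mapping->tree_lock, which is elided here:

	#include <linux/fs.h>
	#include <linux/radix-tree.h>

	/* Does this mapping have any pages tagged dirty? (locking elided) */
	static int my_mapping_has_dirty(struct address_space *mapping)
	{
		return radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY);
	}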
@@ -463,7 +428,7 @@ void split_page(struct page *page, unsigned int order);
 #define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
 #define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
 
-static inline unsigned long page_zonenum(struct page *page)
+static inline enum zone_type page_zonenum(struct page *page)
 {
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
@@ -480,23 +445,33 @@ static inline struct zone *page_zone(struct page *page)
 	return zone_table[page_zone_id(page)];
 }
 
+static inline unsigned long zone_to_nid(struct zone *zone)
+{
+#ifdef CONFIG_NUMA
+	return zone->node;
+#else
+	return 0;
+#endif
+}
+
 static inline unsigned long page_to_nid(struct page *page)
 {
 	if (FLAGS_HAS_NODE)
 		return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
 	else
-		return page_zone(page)->zone_pgdat->node_id;
+		return zone_to_nid(page_zone(page));
 }
 static inline unsigned long page_to_section(struct page *page)
 {
 	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
 }
 
-static inline void set_page_zone(struct page *page, unsigned long zone)
+static inline void set_page_zone(struct page *page, enum zone_type zone)
 {
 	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
 	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
 }
+
 static inline void set_page_node(struct page *page, unsigned long node)
 {
 	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
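zone_to_nid() gives page_to_nid() a single path whether or not the node id is encoded in page->flags. Those flag bits are stamped in by set_page_links() when the memmap is initialised; a simplified sketch of that use, modelled loosely on memmap_init_zone() in mm/page_alloc.c:

	static void my_init_memmap_range(enum zone_type zone, int nid,
					 unsigned long start_pfn,
					 unsigned long nr_pages)
	{
		unsigned long pfn;

		for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
			struct page *page = pfn_to_page(pfn);

			/* pack zone, node and section into page->flags */
			set_page_links(page, zone, nid, pfn);
			init_page_count(page);
		}
	}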
@@ -508,7 +483,7 @@ static inline void set_page_section(struct page *page, unsigned long section)
 	page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
 }
 
-static inline void set_page_links(struct page *page, unsigned long zone,
+static inline void set_page_links(struct page *page, enum zone_type zone,
 	unsigned long node, unsigned long pfn)
 {
 	set_page_zone(page, zone);
@@ -521,11 +496,6 @@ static inline void set_page_links(struct page *page, unsigned long zone,
  */
 #include <linux/vmstat.h>
 
-#ifndef CONFIG_DISCONTIGMEM
-/* The array of struct pages - for discontigmem use pgdat->lmem_map */
-extern struct page *mem_map;
-#endif
-
 static __always_inline void *lowmem_page_address(struct page *page)
 {
 	return __va(page_to_pfn(page) << PAGE_SHIFT);
@@ -625,6 +595,12 @@ static inline int page_mapped(struct page *page)
 #define NOPAGE_OOM	((struct page *) (-1))
 
 /*
+ * Error return values for the *_nopfn functions
+ */
+#define NOPFN_SIGBUS	((unsigned long) -1)
+#define NOPFN_OOM	((unsigned long) -2)
+
+/*
  * Different kinds of faults, as returned by handle_mm_fault().
  * Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
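These mirror NOPAGE_SIGBUS and NOPAGE_OOM but are plain unsigned longs, since ->nopfn returns a PFN rather than a struct page pointer; the top two values of the PFN space are sacrificed as error codes. Continuing the hypothetical driver sketch from the ->nopfn hunk above:

	static unsigned long mydev_nopfn(struct vm_area_struct *vma,
					 unsigned long address)
	{
		unsigned long offset = address - vma->vm_start;

		if (offset >= mydev_region_size)	/* hypothetical bounds */
			return NOPFN_SIGBUS;		/* fault gets SIGBUS */

		/* translate to a raw page frame number */
		return (mydev_region_phys + offset) >> PAGE_SHIFT;
	}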
@@ -802,6 +778,39 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
+/*
+ * Some shared mappings will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ */
+static inline int vma_wants_writenotify(struct vm_area_struct *vma)
+{
+	unsigned int vm_flags = vma->vm_flags;
+
+	/* If it was private or non-writable, the write bit is already clear */
+	if ((vm_flags & (VM_WRITE|VM_SHARED)) != (VM_WRITE|VM_SHARED))
+		return 0;
+
+	/* The backer wishes to know when pages are first written to? */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		return 1;
+
+	/* The open routine did something to the protections already? */
+	if (pgprot_val(vma->vm_page_prot) !=
+	    pgprot_val(protection_map[vm_flags &
+		    (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]))
+		return 0;
+
+	/* Specialty mapping? */
+	if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
+		return 0;
+
+	/* Can the mapping track the dirty pages? */
+	return vma->vm_file && vma->vm_file->f_mapping &&
+		mapping_cap_account_dirty(vma->vm_file->f_mapping);
+}
+
 extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl));
 
 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
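The intended caller is the mmap path: when a shared writable mapping wants write notification, vm_page_prot is downgraded to the non-shared protections so that the first store faults and ->page_mkwrite() can run. A sketch of that call site, simplified from mm/mmap.c:

	if (vma_wants_writenotify(vma))
		vma->vm_page_prot =
			protection_map[vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];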
@@ -879,6 +888,56 @@ extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, pg_data_t *pgdat,
 	unsigned long * zones_size, unsigned long zone_start_pfn,
 	unsigned long *zholes_size);
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/*
+ * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
+ * zones, allocate the backing mem_map and account for memory holes in a more
+ * architecture independent manner. This is a substitute for creating the
+ * zone_sizes[] and zholes_size[] arrays and passing them to
+ * free_area_init_node().
+ *
+ * An architecture is expected to register ranges of page frames backed by
+ * physical memory with add_active_range() before calling
+ * free_area_init_nodes(), passing in the PFN each zone ends at. In basic
+ * usage, an architecture is expected to do something like
+ *
+ *	unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
+ *						     max_highmem_pfn};
+ *	for_each_valid_physical_page_range()
+ *		add_active_range(node_id, start_pfn, end_pfn)
+ *	free_area_init_nodes(max_zone_pfns);
+ *
+ * If the architecture guarantees that there are no holes in the ranges
+ * registered with add_active_range(), free_bootmem_with_active_regions()
+ * will call free_bootmem_node() for each registered physical page range.
+ * Similarly, sparse_memory_present_with_active_regions() calls
+ * memory_present() for each range when SPARSEMEM is enabled.
+ *
+ * See mm/page_alloc.c for more information on each function exposed by
+ * CONFIG_ARCH_POPULATES_NODE_MAP.
+ */
+extern void free_area_init_nodes(unsigned long *max_zone_pfn);
+extern void add_active_range(unsigned int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
+extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+					unsigned long new_end_pfn);
+extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
+extern void remove_all_active_ranges(void);
+extern unsigned long absent_pages_in_range(unsigned long start_pfn,
+						unsigned long end_pfn);
+extern void get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn);
+extern unsigned long find_min_pfn_with_active_regions(void);
+extern unsigned long find_max_pfn_with_active_regions(void);
+extern void free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn);
+extern void sparse_memory_present_with_active_regions(int nid);
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+extern int early_pfn_to_nid(unsigned long pfn);
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+extern void set_dma_reserve(unsigned long new_dma_reserve);
 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
 extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
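Concretely, the registration sequence described in the comment might look like the following in an architecture's setup code; the probed_ranges table and the max_*_pfn values are hypothetical stand-ins for whatever the platform actually discovers:

	void __init my_arch_init_zones(void)
	{
		unsigned long max_zone_pfns[MAX_NR_ZONES] = {
			max_dma_pfn,		/* end of ZONE_DMA */
			max_normal_pfn,		/* end of ZONE_NORMAL */
			max_highmem_pfn,	/* end of ZONE_HIGHMEM */
		};
		int i;

		for (i = 0; i < nr_probed_ranges; i++)	/* hypothetical probe data */
			add_active_range(probed_ranges[i].nid,
					 probed_ranges[i].start_pfn,
					 probed_ranges[i].end_pfn);

		free_area_init_nodes(max_zone_pfns);
	}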
@@ -1072,7 +1131,7 @@ void drop_slab(void);
 extern int randomize_va_space;
 #endif
 
-const char *arch_vma_name(struct vm_area_struct *vma);
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
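The weak attribute on the prototype means the symbol may legitimately be left undefined: on architectures that supply no arch_vma_name() the weak reference resolves to NULL, while an architecture that wants one (to name its vDSO mapping, say) just defines the function normally. A hedged sketch of a guarded generic caller:

	/*
	 * Sketch: with a weak prototype and no definition linked in, the
	 * function's address is NULL, so generic code can test before calling.
	 */
	const char *name = NULL;

	if (arch_vma_name)
		name = arch_vma_name(vma);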